diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index dd521257ccb2..5e8f282e96c3 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -1,9905 +1,9906 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2016 Nexenta Systems, Inc.
  * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC.
  * Copyright (c) 2015, 2017, Intel Corporation.
  * Copyright (c) 2020 Datto Inc.
  * Copyright (c) 2020, The FreeBSD Foundation [1]
  *
  * [1] Portions of this software were developed by Allan Jude
  *     under sponsorship from the FreeBSD Foundation.
  * Copyright (c) 2021 Allan Jude
  * Copyright (c) 2021 Toomas Soome <tsoome@me.com>
  * Copyright (c) 2023, 2024, Klara Inc.
  * Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
  */
 
 #include <stdio.h>
 #include <unistd.h>
 #include <stdlib.h>
 #include <ctype.h>
 #include <getopt.h>
 #include <openssl/evp.h>
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/zap_impl.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_sa.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab_impl.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_bookmark.h>
 #include <sys/dbuf.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
 #include <sys/stat.h>
 #include <sys/resource.h>
 #include <sys/dmu_send.h>
 #include <sys/dmu_traverse.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/zfs_fuid.h>
 #include <sys/arc.h>
 #include <sys/arc_impl.h>
 #include <sys/ddt.h>
 #include <sys/ddt_impl.h>
 #include <sys/zfeature.h>
 #include <sys/abd.h>
 #include <sys/blkptr.h>
 #include <sys/dsl_crypt.h>
 #include <sys/dsl_scan.h>
 #include <sys/btree.h>
 #include <sys/brt.h>
 #include <sys/brt_impl.h>
 #include <zfs_comutil.h>
 #include <sys/zstd/zstd.h>
 #include <sys/backtrace.h>
 
 #include <libnvpair.h>
 #include <libzutil.h>
 #include <libzfs_core.h>
 
 #include <libzdb.h>
 
 #include "zdb.h"
 
 
 /* Globals that are defined outside this file. */
 extern int reference_tracking_enable;
 extern int zfs_recover;
 extern uint_t zfs_vdev_async_read_max_active;
 extern boolean_t spa_load_verify_dryrun;
 extern boolean_t spa_mode_readable_spacemaps;
 extern uint_t zfs_reconstruct_indirect_combinations_max;
 extern uint_t zfs_btree_verify_intensity;
 
 /* Program name used as a prefix by usage() and fatal(). */
 static const char cmdname[] = "zdb";
 /*
  * Option state indexed by option character (e.g. dump_opt['G'],
  * dump_opt['P']).  Not static, so it is visible to other translation units.
  */
 uint8_t dump_opt[256];
 
 /*
  * Signature shared by the per-object-type dump routines in this file
  * (dump_none(), dump_unknown(), dump_packed_nvlist(), ...).
  */
 typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
 
 /* NOTE(review): presumably the metaslab ids given on the command line. */
 static uint64_t *zopt_metaslab = NULL;
 static unsigned zopt_metaslab_args = 0;
 
 
 /* NOTE(review): presumably the object ranges given on the command line. */
 static zopt_object_range_t *zopt_object_ranges = NULL;
 static unsigned zopt_object_args = 0;
 
 static int flagbits[256];
 
 
 static uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */
 static int leaked_objects = 0;
 static zfs_range_tree_t *mos_refd_objs;
 static spa_t *spa;
 static objset_t *os;
 static boolean_t kernel_init_done;
 
 /* Forward declarations for static helpers defined later in this file. */
 static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,
     boolean_t);
 static void mos_obj_refd(uint64_t);
 static void mos_obj_refd_multiple(uint64_t);
 static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free,
     dmu_tx_t *tx);
 
 
 
 static void zdb_print_blkptr(const blkptr_t *bp, int flags);
 static void zdb_exit(int reason);
 
 /*
  * Node of the sv_pair B-tree used while verifying a sublivelist: one block
  * pointer plus the number of FREE entries for it that have not yet been
  * matched by an ALLOC entry.
  */
 typedef struct sublivelist_verify_block_refcnt {
 	/* block pointer entry in livelist being verified */
 	blkptr_t svbr_blk;
 
 	/*
 	 * Refcount gets incremented to 1 when we encounter the first
 	 * FREE entry for the svbr_blk block pointer and a node for it
 	 * is created in our ZDB verification/tracking metadata.
 	 *
 	 * As we encounter more FREE entries we increment this counter
 	 * and similarly decrement it whenever we find the respective
 	 * ALLOC entries for this block.
 	 *
 	 * When the refcount gets to 0 it means that all the FREE and
 	 * ALLOC entries of this block have paired up and we no longer
 	 * need to track it in our verification logic (e.g. the node
 	 * containing this struct in our verification data structure
 	 * should be freed).
 	 *
 	 * [refer to sublivelist_verify_blkptr() for the actual code]
 	 */
 	uint32_t svbr_refcnt;
 } sublivelist_verify_block_refcnt_t;
 
 static int
 sublivelist_block_refcnt_compare(const void *larg, const void *rarg)
 {
 	const sublivelist_verify_block_refcnt_t *l = larg;
 	const sublivelist_verify_block_refcnt_t *r = rarg;
 	return (livelist_compare(&l->svbr_blk, &r->svbr_blk));
 }
 
 /*
  * Per-entry callback used while walking a sublivelist's bpobj.
  *
  * FREE entries are counted in sv->sv_pair (one node per block pointer).
  * An ALLOC entry either cancels one previously seen FREE of the same block
  * pointer, or, when no FREE is pending, has each of its non-empty DVAs
  * recorded in sv->sv_leftover together with the block's logical birth txg.
  *
  * tx must be NULL.  Always returns 0.
  */
 static int
 sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
     dmu_tx_t *tx)
 {
 	ASSERT3P(tx, ==, NULL);
 	struct sublivelist_verify *sv = arg;
 	zfs_btree_index_t pair_idx;
 
 	/*
 	 * The refcount starts at 1 in case this turns out to be the first
 	 * FREE entry; it is ignored by the B-tree comparison.
 	 */
 	sublivelist_verify_block_refcnt_t key = {
 		.svbr_blk = *bp,
 		.svbr_refcnt = 1,
 	};
 	sublivelist_verify_block_refcnt_t *match =
 	    zfs_btree_find(&sv->sv_pair, &key, &pair_idx);
 
 	if (free) {
 		if (match != NULL) {
 			match->svbr_refcnt++;
 		} else {
 			/* first free entry for this block pointer */
 			zfs_btree_add(&sv->sv_pair, &key);
 		}
 		return (0);
 	}
 
 	if (match != NULL) {
 		/* alloc matches a free entry */
 		match->svbr_refcnt--;
 		if (match->svbr_refcnt == 0) {
 			/* all allocs and frees have been matched */
 			zfs_btree_remove_idx(&sv->sv_pair, &pair_idx);
 		}
 		return (0);
 	}
 
 	/* block that is currently marked as allocated */
 	for (int d = 0; d < SPA_DVAS_PER_BP; d++) {
 		if (DVA_IS_EMPTY(&bp->blk_dva[d]))
 			break;
 
 		sublivelist_verify_block_t svb = {
 		    .svb_dva = bp->blk_dva[d],
 		    .svb_allocated_txg = BP_GET_LOGICAL_BIRTH(bp)
 		};
 		zfs_btree_index_t leftover_idx;
 
 		if (zfs_btree_find(&sv->sv_leftover, &svb,
 		    &leftover_idx) != NULL)
 			continue;
 		zfs_btree_add_idx(&sv->sv_leftover, &svb, &leftover_idx);
 	}
 
 	return (0);
 }
 
 /*
  * Verify a single sublivelist (used as a dsl_deadlist_iterate() callback).
  *
  * Every entry of the sublivelist's bpobj is passed through
  * sublivelist_verify_blkptr(), which pairs FREEs with ALLOCs in the
  * temporary sv_pair tree created here.  Any node still left in sv_pair
  * afterwards is a FREE that never found its ALLOC and is reported on
  * stdout.  The sv_leftover tree is owned by the caller and is not created
  * or destroyed here.
  *
  * Returns the value returned by bpobj_iterate_nofree().
  */
 static int
 sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
 {
 	int err;
 	struct sublivelist_verify *sv = args;
 
 	zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, NULL,
 	    sizeof (sublivelist_verify_block_refcnt_t));
 
 	err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
 	    sv, NULL);
 
 	sublivelist_verify_block_refcnt_t *e;
 	zfs_btree_index_t *cookie = NULL;
 	while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) {
 		char blkbuf[BP_SPRINTF_LEN];
 		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),
 		    &e->svbr_blk, B_TRUE);
 		/* svbr_refcnt is a uint32_t, so it is printed as unsigned. */
 		(void) printf("\tERROR: %u unmatched FREE(s): %s\n",
 		    e->svbr_refcnt, blkbuf);
 	}
 	zfs_btree_destroy(&sv->sv_pair);
 
 	return (err);
 }
 
 /*
  * B-tree comparator for sublivelist_verify_block_t nodes.  Blocks are
  * ordered by the vdev, then the offset, then the allocated size of their
  * DVA.  Returns -1, 0 or +1.
  */
 static int
 livelist_block_compare(const void *larg, const void *rarg)
 {
 	const sublivelist_verify_block_t *lhs = larg;
 	const sublivelist_verify_block_t *rhs = rarg;
 	uint64_t lval, rval;
 
 	lval = DVA_GET_VDEV(&lhs->svb_dva);
 	rval = DVA_GET_VDEV(&rhs->svb_dva);
 	if (lval != rval)
 		return (lval < rval ? -1 : +1);
 
 	lval = DVA_GET_OFFSET(&lhs->svb_dva);
 	rval = DVA_GET_OFFSET(&rhs->svb_dva);
 	if (lval != rval)
 		return (lval < rval ? -1 : +1);
 
 	lval = DVA_GET_ASIZE(&lhs->svb_dva);
 	rval = DVA_GET_ASIZE(&rhs->svb_dva);
 	if (lval != rval)
 		return (lval < rval ? -1 : +1);
 
 	return (0);
 }
 
 /*
  * Run sublivelist_verify_func() over every entry of the livelist dl.
  * arg is the sublivelist_verify_t that accumulates the unfreed ALLOCs in
  * its sv_leftover tree.
  */
 static void
 livelist_verify(dsl_deadlist_t *dl, void *arg)
 {
 	dsl_deadlist_iterate(dl, sublivelist_verify_func, arg);
 }
 
 /*
  * Verify one livelist entry using a throw-away sublivelist_verify_t: the
  * leftover-ALLOC tree is created, filled by sublivelist_verify_func() and
  * then discarded.  Returns whatever sublivelist_verify_func() returned.
  */
 static int
 sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle)
 {
 	sublivelist_verify_t scratch;
 	int err;
 
 	(void) args;
 
 	zfs_btree_create(&scratch.sv_leftover, livelist_block_compare, NULL,
 	    sizeof (sublivelist_verify_block_t));
 
 	err = sublivelist_verify_func(&scratch, dle);
 
 	zfs_btree_clear(&scratch.sv_leftover);
 	zfs_btree_destroy(&scratch.sv_leftover);
 
 	return (err);
 }
 
 /*
  * Per-metaslab state used by livelist_metaslab_validate() while replaying
  * the metaslab's space map and its entries in the log space maps.
  */
 typedef struct metaslab_verify {
 	/*
 	 * Tree containing all the leftover ALLOCs from the livelists
 	 * that are part of this metaslab.
 	 */
 	zfs_btree_t mv_livelist_allocs;
 
 	/*
 	 * Metaslab information.
 	 */
 	uint64_t mv_vdid;	/* id of the vdev (vdev_id) */
 	uint64_t mv_msid;	/* id of the metaslab (ms_id) */
 	uint64_t mv_start;	/* ms_start */
 	uint64_t mv_end;	/* ms_start + ms_size */
 
 	/*
 	 * What's currently allocated for this metaslab.
 	 */
 	zfs_range_tree_t *mv_allocated;
 } metaslab_verify_t;
 
 /* Callback invoked on a livelist (see iterate_deleted_livelists()). */
 typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg);
 
 /*
  * Callback invoked by iterate_through_spacemap_logs() for every entry of
  * every log space map; txg is the txg of the log the entry was read from.
  */
 typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg,
     void *arg);
 
 /*
  * Context handed to space_map_iterate() so that
  * iterate_through_spacemap_logs_cb() can forward each entry to the
  * caller-supplied zdb_log_sm_cb_t.
  */
 typedef struct unflushed_iter_cb_arg {
 	spa_t *uic_spa;		/* pool being examined */
 	uint64_t uic_txg;	/* txg of the log space map (sls_txg) */
 	void *uic_arg;		/* opaque argument for uic_cb */
 	zdb_log_sm_cb_t uic_cb;	/* callback to forward entries to */
 } unflushed_iter_cb_arg_t;
 
 /*
  * space_map_iterate() adapter: unpacks the unflushed_iter_cb_arg_t in arg
  * and forwards the entry to the zdb_log_sm_cb_t stored in it, returning
  * that callback's result.
  */
 static int
 iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg)
 {
 	const unflushed_iter_cb_arg_t *ctx = arg;
 	int rc;
 
 	rc = ctx->uic_cb(ctx->uic_spa, sme, ctx->uic_txg, ctx->uic_arg);
 	return (rc);
 }
 
 /*
  * Invoke cb(spa, entry, log txg, arg) for every entry of every log space
  * map of the pool, walking spa_sm_logs_by_txg from its first element.
  * Does nothing when the log space map feature is not active.  The walk is
  * done while holding SCL_CONFIG as reader; failures to open or iterate a
  * space map are fatal (VERIFY0).
  */
 static void
 iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg)
 {
 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
 		return;
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
 	while (sls) {
 		space_map_t *sm = NULL;
 		unflushed_iter_cb_arg_t uic;
 
 		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
 		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
 
 		uic.uic_spa = spa;
 		uic.uic_txg = sls->sls_txg;
 		uic.uic_arg = arg;
 		uic.uic_cb = cb;
 
 		VERIFY0(space_map_iterate(sm, space_map_length(sm),
 		    iterate_through_spacemap_logs_cb, &uic));
 		space_map_close(sm);
 
 		sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls);
 	}
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
 /*
  * Cross-check a space map FREE of [offset, offset + size) at the given txg
  * against the livelist ALLOCs recorded for this metaslab, printing an
  * error for every recorded ALLOC in that range whose allocation txg is not
  * newer than the FREE.
  */
 static void
 verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg,
     uint64_t offset, uint64_t size)
 {
 	sublivelist_verify_block_t key = {{{0}}};
 	zfs_btree_index_t where;
 	const uint64_t range_end = offset + size;
 
 	DVA_SET_VDEV(&key.svb_dva, mv->mv_vdid);
 	DVA_SET_OFFSET(&key.svb_dva, offset);
 	DVA_SET_ASIZE(&key.svb_dva, size);
 
 	/*
 	 * Start from an exact match of the spacemap entry if there is one,
 	 * otherwise from the entry following the position where it would
 	 * have been.  The spacemap entry may have been condensed, so keep
 	 * walking over the livelist entries that fall within its range.
 	 */
 	sublivelist_verify_block_t *cur =
 	    zfs_btree_find(&mv->mv_livelist_allocs, &key, &where);
 	if (cur == NULL)
 		cur = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where);
 
 	while (cur != NULL) {
 		if (DVA_GET_VDEV(&cur->svb_dva) != mv->mv_vdid)
 			break;
 		if (DVA_GET_OFFSET(&cur->svb_dva) >= range_end)
 			break;
 
 		if (cur->svb_allocated_txg <= txg) {
 			(void) printf("ERROR: Livelist ALLOC [%llx:%llx] "
 			    "from TXG %llx FREED at TXG %llx\n",
 			    (u_longlong_t)DVA_GET_OFFSET(&cur->svb_dva),
 			    (u_longlong_t)DVA_GET_ASIZE(&cur->svb_dva),
 			    (u_longlong_t)cur->svb_allocated_txg,
 			    (u_longlong_t)txg);
 		}
 
 		cur = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where);
 	}
 }
 
 /*
  * space_map_iterate() callback that replays one space map entry against
  * mv->mv_allocated.
  *
  * An ALLOC of a range that is already tracked is reported as a double
  * ALLOC, and a FREE of a range that is not tracked as a double FREE;
  * otherwise the range is added to or removed from the tree.  Every FREE is
  * additionally cross-checked against the livelist ALLOCs of the metaslab.
  *
  * Always returns 0, so the iteration never stops early.
  */
 static int
 metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg)
 {
 	metaslab_verify_t *mv = arg;
 	const uint64_t offset = sme->sme_offset;
 	const uint64_t size = sme->sme_run;
 	const uint64_t txg = sme->sme_txg;
 	const int tracked =
 	    zfs_range_tree_contains(mv->mv_allocated, offset, size);
 
 	if (sme->sme_type == SM_ALLOC) {
 		if (!tracked) {
 			zfs_range_tree_add(mv->mv_allocated,
 			    offset, size);
 			return (0);
 		}
 		(void) printf("ERROR: DOUBLE ALLOC: "
 		    "%llu [%llx:%llx] "
 		    "%llu:%llu LOG_SM\n",
 		    (u_longlong_t)txg, (u_longlong_t)offset,
 		    (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
 		    (u_longlong_t)mv->mv_msid);
 		return (0);
 	}
 
 	if (tracked) {
 		zfs_range_tree_remove(mv->mv_allocated,
 		    offset, size);
 	} else {
 		(void) printf("ERROR: DOUBLE FREE: "
 		    "%llu [%llx:%llx] "
 		    "%llu:%llu LOG_SM\n",
 		    (u_longlong_t)txg, (u_longlong_t)offset,
 		    (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
 		    (u_longlong_t)mv->mv_msid);
 	}
 
 	/*
 	 * Something was freed in the spacemap: verify that it is not
 	 * listed as allocated in the livelist.
 	 */
 	verify_livelist_allocs(mv, txg, offset, size);
 
 	return (0);
 }
 
 /*
  * Log space map callback (zdb_log_sm_cb_t) that filters entries down to
  * those belonging to the metaslab described by arg (a metaslab_verify_t)
  * and validates them with metaslab_spacemap_validation_cb().
  *
  * Entries are skipped (return 0) when they are on a non-concrete vdev, on
  * a different vdev or metaslab, or older than the metaslab's unflushed
  * txg.
  */
 static int
 spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme,
     uint64_t txg, void *arg)
 {
 	metaslab_verify_t *mv = arg;
 	const uint64_t vdev_id = sme->sme_vdev;
 	const uint64_t offset = sme->sme_offset;
 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
 
 	/* skip indirect vdevs */
 	if (!vdev_is_concrete(vd))
 		return (0);
 
 	if (vdev_id != mv->mv_vdid)
 		return (0);
 
 	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 	if (ms->ms_id != mv->mv_msid || txg < metaslab_unflushed_txg(ms))
 		return (0);
 
 	ASSERT3U(txg, ==, sme->sme_txg);
 	return (metaslab_spacemap_validation_cb(sme, mv));
 }
 
 /*
  * Validate the log space map entries that belong to the metaslab
  * described by mv, by running spacemap_check_sm_log_cb() over every log
  * space map entry of the pool.
  */
 static void
 spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv)
 {
 	iterate_through_spacemap_logs(spa, &spacemap_check_sm_log_cb, mv);
 }
 
 /*
  * Replay every entry of the metaslab's own space map through
  * metaslab_spacemap_validation_cb().  A NULL space map is silently
  * accepted; an iteration failure is fatal (VERIFY0).
  */
 static void
 spacemap_check_ms_sm(space_map_t *sm, metaslab_verify_t *mv)
 {
 	if (sm != NULL) {
 		VERIFY0(space_map_iterate(sm, space_map_length(sm),
 		    metaslab_spacemap_validation_cb, mv));
 	}
 }
 
 static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg);
 
 /*
  * Move the blocks of sv->sv_leftover that lie inside the metaslab
  * described by mv (same vdev, within [mv_start, mv_end)) into
  * mv->mv_livelist_allocs.  Blocks that straddle either metaslab boundary
  * are reported and left in sv_leftover.  mv_livelist_allocs must be empty
  * on entry.
  */
 static void
 mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)
 {
 	zfs_btree_index_t where;
 	sublivelist_verify_block_t *svb;
 
 	ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0);
 
 	/* Pass 1: copy the blocks that belong to this metaslab. */
 	svb = zfs_btree_first(&sv->sv_leftover, &where);
 	while (svb != NULL) {
 		const uint64_t vdev = DVA_GET_VDEV(&svb->svb_dva);
 		const uint64_t off = DVA_GET_OFFSET(&svb->svb_dva);
 		const uint64_t asize = DVA_GET_ASIZE(&svb->svb_dva);
 
 		if (vdev != mv->mv_vdid) {
 			/* different vdev */
 		} else if (off < mv->mv_start) {
 			/* starts before the metaslab; may still reach in */
 			if (off + asize > mv->mv_start) {
 				(void) printf("ERROR: Found block that crosses "
 				    "metaslab boundary: <%llu:%llx:%llx>\n",
 				    (u_longlong_t)vdev,
 				    (u_longlong_t)off,
 				    (u_longlong_t)asize);
 			}
 		} else if (off >= mv->mv_end) {
 			/* starts after the metaslab */
 		} else if (off + asize > mv->mv_end) {
 			(void) printf("ERROR: Found block that crosses "
 			    "metaslab boundary: <%llu:%llx:%llx>\n",
 			    (u_longlong_t)vdev,
 			    (u_longlong_t)off,
 			    (u_longlong_t)asize);
 		} else {
 			zfs_btree_add(&mv->mv_livelist_allocs, svb);
 		}
 
 		svb = zfs_btree_next(&sv->sv_leftover, &where, &where);
 	}
 
 	/* Pass 2: drop everything that was copied from the leftover tree. */
 	svb = zfs_btree_first(&mv->mv_livelist_allocs, &where);
 	while (svb != NULL) {
 		zfs_btree_remove(&sv->sv_leftover, svb);
 		svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where);
 	}
 }
 
 /*
  * [Livelist Check]
  * Iterate through all the sublivelists and:
  * - report leftover frees (**)
  * - record leftover ALLOCs together with their TXG [see Cross Check]
  *
  * (**) Note: Double ALLOCs are valid in datasets that have dedup
  *      enabled. Similarly double FREEs are allowed as well but
  *      only if they pair up with a corresponding ALLOC entry once
  *      we are done with our sublivelist iteration.
  *
  * [Spacemap Check]
  * for each metaslab:
  * - iterate over spacemap and then the metaslab's entries in the
  *   spacemap log, then report any double FREEs and ALLOCs (do not
  *   blow up).
  *
  * [Cross Check]
  * After finishing the Livelist Check phase and while being in the
  * Spacemap Check phase, we find all the recorded leftover ALLOCs
  * of the livelist check that are part of the metaslab that we are
  * currently looking at in the Spacemap Check. We report any entries
  * that are marked as ALLOCs in the livelists but have been actually
  * freed (and potentially allocated again) after their TXG stamp in
  * the spacemaps. Also report any ALLOCs from the livelists that
  * belong to indirect vdevs (e.g. their vdev completed removal).
  *
  * Note that this will miss Log Spacemap entries that cancelled each other
  * out before being flushed to the metaslab, so we are not guaranteed
  * to match all erroneous ALLOCs.
  *
  * Findings are printed to stdout; the per-metaslab progress line is
  * written to stderr.
  */
 static void
 livelist_metaslab_validate(spa_t *spa)
 {
 	(void) printf("Verifying deleted livelist entries\n");
 
 	sublivelist_verify_t sv;
 	zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
 	    sizeof (sublivelist_verify_block_t));
 	iterate_deleted_livelists(spa, livelist_verify, &sv);
 
 	(void) printf("Verifying metaslab entries\n");
 	vdev_t *rvd = spa->spa_root_vdev;
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
 
 		if (!vdev_is_concrete(vd))
 			continue;
 
 		for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) {
 			metaslab_t *m = vd->vdev_ms[mid];
 
 			/*
 			 * All three values are unsigned 64-bit quantities
 			 * printed with %llu, so cast to the unsigned type.
 			 */
 			(void) fprintf(stderr,
 			    "\rverifying concrete vdev %llu, "
 			    "metaslab %llu of %llu ...",
 			    (u_longlong_t)vd->vdev_id,
 			    (u_longlong_t)mid,
 			    (u_longlong_t)vd->vdev_ms_count);
 
 			uint64_t shift, start;
 			zfs_range_seg_type_t type =
 			    metaslab_calculate_range_tree_type(vd, m,
 			    &start, &shift);
 			metaslab_verify_t mv;
 			mv.mv_allocated = zfs_range_tree_create(NULL,
 			    type, NULL, start, shift);
 			mv.mv_vdid = vd->vdev_id;
 			mv.mv_msid = m->ms_id;
 			mv.mv_start = m->ms_start;
 			mv.mv_end = m->ms_start + m->ms_size;
 			zfs_btree_create(&mv.mv_livelist_allocs,
 			    livelist_block_compare, NULL,
 			    sizeof (sublivelist_verify_block_t));
 
 			mv_populate_livelist_allocs(&mv, &sv);
 
 			spacemap_check_ms_sm(m->ms_sm, &mv);
 			spacemap_check_sm_log(spa, &mv);
 
 			zfs_range_tree_vacate(mv.mv_allocated, NULL, NULL);
 			zfs_range_tree_destroy(mv.mv_allocated);
 			zfs_btree_clear(&mv.mv_livelist_allocs);
 			zfs_btree_destroy(&mv.mv_livelist_allocs);
 		}
 	}
 	(void) fprintf(stderr, "\n");
 
 	/*
 	 * If there are any segments in the leftover tree after we walked
 	 * through all the metaslabs in the concrete vdevs then this means
 	 * that we have segments in the livelists that belong to indirect
 	 * vdevs and are marked as allocated.
 	 */
 	if (zfs_btree_numnodes(&sv.sv_leftover) == 0) {
 		zfs_btree_destroy(&sv.sv_leftover);
 		return;
 	}
 	(void) printf("ERROR: Found livelist blocks marked as allocated "
 	    "for indirect vdevs:\n");
 
 	zfs_btree_index_t *where = NULL;
 	sublivelist_verify_block_t *svb;
 	while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) !=
 	    NULL) {
 		int vdev_id = DVA_GET_VDEV(&svb->svb_dva);
 		ASSERT3U(vdev_id, <, rvd->vdev_children);
 		vdev_t *vd = rvd->vdev_child[vdev_id];
 		ASSERT(!vdev_is_concrete(vd));
 		(void) printf("<%d:%llx:%llx> TXG %llx\n",
 		    vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
 		    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva),
 		    (u_longlong_t)svb->svb_allocated_txg);
 	}
 	(void) printf("\n");
 	zfs_btree_destroy(&sv.sv_leftover);
 }
 
 /*
  * These libumem hooks provide a reasonable set of defaults for the allocator's
  * debugging facilities.
  *
  * _umem_debug_init() supplies the $UMEM_DEBUG setting.
  */
 const char *
 _umem_debug_init(void)
 {
 	static const char umem_debug[] = "default,verbose";
 
 	return (umem_debug);
 }
 
 /* Supplies the $UMEM_LOGGING setting. */
 const char *
 _umem_logging_init(void)
 {
 	static const char umem_logging[] = "fail,contents";
 
 	return (umem_logging);
 }
 
 /*
  * Print the command synopsis and the description of every option to
  * stderr, then terminate through zdb_exit(1).
  */
 static void
 usage(void)
 {
 	/*
 	 * Synopsis.  One "%s" per invocation form; each is filled with
 	 * cmdname, so the number of cmdname arguments below must match the
 	 * number of forms.
 	 */
 	(void) fprintf(stderr,
 	    "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p <path> ...]] "
 	    "[-I <inflight I/Os>]\n"
 	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
 	    "\t\t[-K <key>]\n"
 	    "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
 	    "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>] [-K <key>]\n"
 	    "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
 	    "\t%s -B [-e [-V] [-p <path> ...]] [-I <inflight I/Os>]\n"
 	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
 	    "\t\t[-K <key>] <poolname>/<objset id> [<backupflags>]\n"
 	    "\t%s [-v] <bookmark>\n"
 	    "\t%s -C [-A] [-U <cache>] [<poolname>]\n"
 	    "\t%s -l [-Aqu] <device>\n"
 	    "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
 	    "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
 	    "\t%s -O [-K <key>] <dataset> <path>\n"
 	    "\t%s -r [-K <key>] <dataset> <path> <destination>\n"
 	    "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
 	    "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
 	    "\t%s -E [-A] word0:word1:...:word15\n"
 	    "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
 	    "<poolname>\n\n",
 	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
 	    cmdname, cmdname, cmdname, cmdname, cmdname);
 
 	(void) fprintf(stderr, "    Dataset name must include at least one "
 	    "separator character '/' or '@'\n");
 	(void) fprintf(stderr, "    If dataset name is specified, only that "
 	    "dataset is dumped\n");
 	(void) fprintf(stderr,  "    If object numbers or object number "
 	    "ranges are specified, only those\n"
 	    "    objects or ranges are dumped.\n\n");
 	(void) fprintf(stderr,
 	    "    Object ranges take the form <start>:<end>[:<flags>]\n"
 	    "        start    Starting object number\n"
 	    "        end      Ending object number, or -1 for no upper bound\n"
 	    "        flags    Optional flags to select object types:\n"
 	    "            A     All objects (this is the default)\n"
 	    "            d     ZFS directories\n"
 	    "            f     ZFS files \n"
 	    "            m     SPA space maps\n"
 	    "            z     ZAPs\n"
 	    "            -     Negate effect of next flag\n\n");
 	(void) fprintf(stderr, "    Options to control amount of output:\n");
 	(void) fprintf(stderr, "        -b --block-stats             "
 	    "block statistics\n");
 	(void) fprintf(stderr, "        -B --backup                  "
 	    "backup stream\n");
 	(void) fprintf(stderr, "        -c --checksum                "
 	    "checksum all metadata (twice for all data) blocks\n");
 	(void) fprintf(stderr, "        -C --config                  "
 	    "config (or cachefile if alone)\n");
 	(void) fprintf(stderr, "        -d --datasets                "
 	    "dataset(s)\n");
 	(void) fprintf(stderr, "        -D --dedup-stats             "
 	    "dedup statistics\n");
 	(void) fprintf(stderr, "        -E --embedded-block-pointer=INTEGER\n"
 	    "                                     decode and display block "
 	    "from an embedded block pointer\n");
 	(void) fprintf(stderr, "        -h --history                 "
 	    "pool history\n");
 	(void) fprintf(stderr, "        -i --intent-logs             "
 	    "intent logs\n");
 	(void) fprintf(stderr, "        -l --label                   "
 	    "read label contents\n");
 	(void) fprintf(stderr, "        -k --checkpointed-state      "
 	    "examine the checkpointed state of the pool\n");
 	(void) fprintf(stderr, "        -L --disable-leak-tracking   "
 	    "disable leak tracking (do not load spacemaps)\n");
 	(void) fprintf(stderr, "        -m --metaslabs               "
 	    "metaslabs\n");
 	(void) fprintf(stderr, "        -M --metaslab-groups         "
 	    "metaslab groups\n");
 	(void) fprintf(stderr, "        -O --object-lookups          "
 	    "perform object lookups by path\n");
 	(void) fprintf(stderr, "        -r --copy-object             "
 	    "copy an object by path to file\n");
 	(void) fprintf(stderr, "        -R --read-block              "
 	    "read and display block from a device\n");
 	(void) fprintf(stderr, "        -s --io-stats                "
 	    "report stats on zdb's I/O\n");
 	(void) fprintf(stderr, "        -S --simulate-dedup          "
 	    "simulate dedup to measure effect\n");
 	(void) fprintf(stderr, "        -v --verbose                 "
 	    "verbose (applies to all others)\n");
 	(void) fprintf(stderr, "        -y --livelist                "
 	    "perform livelist and metaslab validation on any livelists being "
 	    "deleted\n\n");
 	(void) fprintf(stderr, "    Below options are intended for use "
 	    "with other options:\n");
 	(void) fprintf(stderr, "        -A --ignore-assertions       "
 	    "ignore assertions (-A), enable panic recovery (-AA) or both "
 	    "(-AAA)\n");
 	(void) fprintf(stderr, "        -e --exported                "
 	    "pool is exported/destroyed/has altroot/not in a cachefile\n");
 	(void) fprintf(stderr, "        -F --automatic-rewind        "
 	    "attempt automatic rewind within safe range of transaction "
 	    "groups\n");
 	(void) fprintf(stderr, "        -G --dump-debug-msg          "
 	    "dump zfs_dbgmsg buffer before exiting\n");
 	(void) fprintf(stderr, "        -I --inflight=INTEGER        "
 	    "specify the maximum number of checksumming I/Os "
 	    "[default is 200]\n");
 	(void) fprintf(stderr, "        -K --key=KEY                 "
 	    "decryption key for encrypted dataset\n");
 	(void) fprintf(stderr, "        -o --option=\"OPTION=INTEGER\" "
 	    "set global variable to an unsigned 32-bit integer\n");
 	(void) fprintf(stderr, "        -p --path=PATH               "
 	    "use one or more with -e to specify path to vdev dir\n");
 	(void) fprintf(stderr, "        -P --parseable               "
 	    "print numbers in parseable form\n");
 	(void) fprintf(stderr, "        -q --skip-label              "
 	    "don't print label contents\n");
 	(void) fprintf(stderr, "        -t --txg=INTEGER             "
 	    "highest txg to use when searching for uberblocks\n");
 	(void) fprintf(stderr, "        -T --brt-stats               "
 	    "BRT statistics\n");
 	(void) fprintf(stderr, "        -u --uberblock               "
 	    "uberblock\n");
 	(void) fprintf(stderr, "        -U --cachefile=PATH          "
 	    "use alternate cachefile\n");
 	(void) fprintf(stderr, "        -V --verbatim                "
 	    "do verbatim import\n");
 	(void) fprintf(stderr, "        -x --dump-blocks=PATH        "
 	    "dump all read blocks into specified directory\n");
 	(void) fprintf(stderr, "        -X --extreme-rewind          "
 	    "attempt extreme rewind (does not work with dataset)\n");
 	(void) fprintf(stderr, "        -Y --all-reconstruction      "
 	    "attempt all reconstruction combinations for split blocks\n");
 	(void) fprintf(stderr, "        -Z --zstd-headers            "
 	    "show ZSTD headers \n");
 	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
 	    "to make only that option verbose\n");
 	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
 	zdb_exit(1);
 }
 
 /*
  * When -G (dump_opt['G']) was given, write a newline followed by the
  * zfs_dbgmsg buffer to stderr; otherwise do nothing.
  */
 static void
 dump_debug_buffer(void)
 {
 	if (dump_opt['G']) {
 		/*
 		 * write() rather than printf() keeps this function safe
 		 * to call from a signal handler.
 		 */
 		ssize_t ret __attribute__((unused)) =
 		    write(STDERR_FILENO, "\n", 1);
 		zfs_dbgmsg_print(STDERR_FILENO, "zdb");
 	}
 }
 
 static void sig_handler(int signo)
 {
 	struct sigaction action;
 
 	libspl_backtrace(STDERR_FILENO);
 	dump_debug_buffer();
 
 	/*
 	 * Restore default action and re-raise signal so SIGSEGV and
 	 * SIGABRT can trigger a core dump.
 	 */
 	action.sa_handler = SIG_DFL;
 	sigemptyset(&action.sa_mask);
 	action.sa_flags = 0;
 	(void) sigaction(signo, &action, NULL);
 	raise(signo);
 }
 
 /*
  * Print "zdb: <formatted message>\n" to stderr, dump the debug buffer if
  * it was requested, and exit with status 1 through zdb_exit().
  *
  * Meant for usage errors that are discovered after a call to spa_open(),
  * dmu_bonus_hold(), or pool_match().  abort() is called for other errors.
  */
 static void
 fatal(const char *fmt, ...)
 {
 	va_list args;
 
 	(void) fprintf(stderr, "%s: ", cmdname);
 
 	va_start(args, fmt);
 	(void) vfprintf(stderr, fmt, args);
 	va_end(args);
 
 	(void) fprintf(stderr, "\n");
 
 	dump_debug_buffer();
 	zdb_exit(1);
 }
 
 /*
  * Object viewer for packed nvlists.  data points at a 64-bit value giving
  * the packed size; that many bytes are read from the start of the object,
  * unpacked and printed with dump_nvlist() at indent 8.  A read or unpack
  * failure is fatal (VERIFY).
  */
 static void
 dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	nvlist_t *nv;
 	size_t nvsize;
 	char *packed;
 
 	(void) size;
 
 	nvsize = *(uint64_t *)data;
 	packed = umem_alloc(nvsize, UMEM_NOFAIL);
 
 	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));
 	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
 
 	/* The packed copy is no longer needed once it has been unpacked. */
 	umem_free(packed, nvsize);
 
 	dump_nvlist(nv, 8);
 	nvlist_free(nv);
 }
 
 /*
  * Object viewer that prints the fields of the spa_history_phys_t that
  * data points to.  Prints nothing when data is NULL.
  */
 static void
 dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	const spa_history_phys_t *hist = data;
 
 	(void) os;
 	(void) object;
 	(void) size;
 
 	if (hist != NULL) {
 		(void) printf("\t\tpool_create_len = %llu\n",
 		    (u_longlong_t)hist->sh_pool_create_len);
 		(void) printf("\t\tphys_max_off = %llu\n",
 		    (u_longlong_t)hist->sh_phys_max_off);
 		(void) printf("\t\tbof = %llu\n",
 		    (u_longlong_t)hist->sh_bof);
 		(void) printf("\t\teof = %llu\n",
 		    (u_longlong_t)hist->sh_eof);
 		(void) printf("\t\trecords_lost = %llu\n",
 		    (u_longlong_t)hist->sh_records_lost);
 	}
 }
 
 static void
 zdb_nicenum(uint64_t num, char *buf, size_t buflen)
 {
 	if (dump_opt['P'])
 		(void) snprintf(buf, buflen, "%llu", (longlong_t)num);
 	else
 		nicenum(num, buf, buflen);
 }
 
 static void
 zdb_nicebytes(uint64_t bytes, char *buf, size_t buflen)
 {
 	if (dump_opt['P'])
 		(void) snprintf(buf, buflen, "%llu", (longlong_t)bytes);
 	else
 		zfs_nicebytes(bytes, buf, buflen);
 }
 
 /* Bar used by dump_histogram(); histo_width is its length in stars. */
 static const char histo_stars[] = "****************************************";
 static const uint64_t histo_width = sizeof (histo_stars) - 1;
 
 /*
  * Print histo[0 .. size-1] as a star histogram, one line per bucket, from
  * the first to the last non-empty bucket.  Each line shows the bucket
  * index plus offset, the count, and a bar scaled against the largest
  * count (or against histo_width when every count is smaller than that).
  */
 static void
 dump_histogram(const uint64_t *histo, int size, int offset)
 {
 	int first = size - 1;
 	int last = 0;
 	uint64_t peak = 0;
 	int i;
 
 	/* Find the largest count and the span of non-empty buckets. */
 	for (i = 0; i < size; i++) {
 		const uint64_t count = histo[i];
 
 		if (count != 0) {
 			if (count > peak)
 				peak = count;
 			if (i > last)
 				last = i;
 			if (i < first)
 				first = i;
 		}
 	}
 
 	if (peak < histo_width)
 		peak = histo_width;
 
 	i = first;
 	while (i <= last) {
 		/* Skip more leading stars the smaller the count is. */
 		const uint64_t skip = (peak - histo[i]) * histo_width / peak;
 
 		(void) printf("\t\t\t%3u: %6llu %s\n",
 		    i + offset, (u_longlong_t)histo[i],
 		    &histo_stars[skip]);
 		i++;
 	}
 }
 
 /*
  * Print the statistics zap_get_stats() reports for a ZAP object.  Prints
  * nothing if the statistics cannot be obtained.  An object with an empty
  * pointer table is reported as a microzap in a single line; anything else
  * gets the full fat-ZAP report including its histograms.
  */
 static void
 dump_zap_stats(objset_t *os, uint64_t object)
 {
 	zap_stats_t zs;
 
 	if (zap_get_stats(os, object, &zs) != 0)
 		return;
 
 	if (zs.zs_ptrtbl_len == 0) {
 		ASSERT(zs.zs_num_blocks == 1);
 		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
 		    (u_longlong_t)zs.zs_blocksize,
 		    (u_longlong_t)zs.zs_num_entries);
 		return;
 	}
 
 	(void) printf("\tFat ZAP stats:\n");
 
 	/* Pointer table. */
 	(void) printf("\t\tPointer table:\n");
 	(void) printf("\t\t\t%llu elements\n",
 	    (u_longlong_t)zs.zs_ptrtbl_len);
 	(void) printf("\t\t\tzt_blk: %llu\n",
 	    (u_longlong_t)zs.zs_ptrtbl_zt_blk);
 	(void) printf("\t\t\tzt_numblks: %llu\n",
 	    (u_longlong_t)zs.zs_ptrtbl_zt_numblks);
 	(void) printf("\t\t\tzt_shift: %llu\n",
 	    (u_longlong_t)zs.zs_ptrtbl_zt_shift);
 	(void) printf("\t\t\tzt_blks_copied: %llu\n",
 	    (u_longlong_t)zs.zs_ptrtbl_blks_copied);
 	(void) printf("\t\t\tzt_nextblk: %llu\n",
 	    (u_longlong_t)zs.zs_ptrtbl_nextblk);
 
 	/* Scalar totals and header fields. */
 	(void) printf("\t\tZAP entries: %llu\n",
 	    (u_longlong_t)zs.zs_num_entries);
 	(void) printf("\t\tLeaf blocks: %llu\n",
 	    (u_longlong_t)zs.zs_num_leafs);
 	(void) printf("\t\tTotal blocks: %llu\n",
 	    (u_longlong_t)zs.zs_num_blocks);
 	(void) printf("\t\tzap_block_type: 0x%llx\n",
 	    (u_longlong_t)zs.zs_block_type);
 	(void) printf("\t\tzap_magic: 0x%llx\n",
 	    (u_longlong_t)zs.zs_magic);
 	(void) printf("\t\tzap_salt: 0x%llx\n",
 	    (u_longlong_t)zs.zs_salt);
 
 	/* Histograms, each printed as a heading followed by its bars. */
 	const struct {
 		const char *zh_heading;
 		const uint64_t *zh_buckets;
 	} histograms[] = {
 		{ "\t\tLeafs with 2^n pointers:\n",
 		    zs.zs_leafs_with_2n_pointers },
 		{ "\t\tBlocks with n*5 entries:\n",
 		    zs.zs_blocks_with_n5_entries },
 		{ "\t\tBlocks n/10 full:\n",
 		    zs.zs_blocks_n_tenths_full },
 		{ "\t\tEntries with n chunks:\n",
 		    zs.zs_entries_using_n_chunks },
 		{ "\t\tBuckets with n entries:\n",
 		    zs.zs_buckets_with_n_entries },
 	};
 
 	for (size_t t = 0;
 	    t < sizeof (histograms) / sizeof (histograms[0]); t++) {
 		(void) printf("%s", histograms[t].zh_heading);
 		dump_histogram(histograms[t].zh_buckets,
 		    ZAP_HISTOGRAM_SIZE, 0);
 	}
 }
 
 /*
  * No-op dumper: every argument is deliberately ignored and nothing is
  * printed.
  */
 static void
 dump_none(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os;
 	(void) object;
 	(void) data;
 	(void) size;
 }
 
 /*
  * Dumper that only reports that the object's type is not recognized.
  * All arguments are ignored.
  */
 static void
 dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os;
 	(void) object;
 	(void) data;
 	(void) size;
 
 	(void) printf("\tUNKNOWN OBJECT TYPE\n");
 }
 
 /*
  * Dumper with the common (os, object, data, size) signature that prints
  * nothing; every argument is ignored.
  */
 static void
 dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os;
 	(void) object;
 	(void) data;
 	(void) size;
 }
 
 /*
  * Print an object's contents as a bracketed list of 64-bit words in hex,
  * four words per line.  Does nothing unless dump_opt['d'] is at least 6.
  *
  *   os, object - object to read when data is NULL
  *   data       - caller-supplied buffer to print, or NULL to read the
  *                object itself (its size is then taken from
  *                doi_max_offset)
  *   size       - number of bytes in data; ignored when data is NULL
  *
  * At most 1 MiB is read and printed; a longer object is marked with a
  * trailing "...".  An empty object prints "[]".
  */
 static void
 dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	uint64_t *arr;
 	uint64_t oursize;
 
 	if (dump_opt['d'] < 6)
 		return;
 
 	if (data == NULL) {
 		dmu_object_info_t doi;
 
 		VERIFY0(dmu_object_info(os, object, &doi));
 		size = doi.doi_max_offset;
 	}
 
 	/*
 	 * Handle the empty case before touching any buffer, so that we
 	 * never request a zero-byte allocation or a zero-byte read.
 	 */
 	if (size == 0) {
 		(void) printf("\t\t[]\n");
 		return;
 	}
 
 	/*
 	 * We cap the size at 1 mebibyte here to prevent allocation
 	 * failures and nigh-infinite printing if the object is extremely
 	 * large.  The cap also applies to a caller-supplied buffer, where
 	 * it only limits the amount of output.
 	 */
 	oursize = MIN(size, 1 << 20);
 
 	if (data == NULL) {
 		arr = kmem_alloc(oursize, KM_SLEEP);
 
 		int err = dmu_read(os, object, 0, oursize, arr, 0);
 		if (err != 0) {
 			(void) printf("got error %d from dmu_read\n", err);
 			kmem_free(arr, oursize);
 			return;
 		}
 	} else {
 		arr = data;
 	}
 
 	(void) printf("\t\t[%0llx", (u_longlong_t)arr[0]);
 	for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) {
 		if (i % 4 != 0)
 			(void) printf(", %0llx", (u_longlong_t)arr[i]);
 		else
 			(void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]);
 	}
 	if (oursize != size)
 		(void) printf(", ... ");
 	(void) printf("]\n");
 
 	/* Only the buffer we allocated ourselves is ours to free. */
 	if (data == NULL)
 		kmem_free(arr, oursize);
 }
 
 /*
  * Print a ZAP object: its statistics (via dump_zap_stats()) followed by
  * one "name = value" line per entry.
  *
  * Keys of a ZAP with ZAP_FLAG_UINT64_KEY set are printed as zero-padded
  * hex; all other keys are printed as strings.  Values made of 1-byte
  * integers under a string key are printed as text, except for a fixed
  * set of key names whose values are printed as a hex byte string.  All
  * other values are printed as a space-separated list of integers.
  *
  * The data and size arguments are unused.
  */
 static void
 dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) data, (void) size;
 	zap_cursor_t zc;
 	zap_attribute_t *attrp = zap_attribute_long_alloc();
 	void *prop;
 	unsigned i;
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	for (zap_cursor_init(&zc, os, object);
 	    zap_cursor_retrieve(&zc, attrp) == 0;
 	    zap_cursor_advance(&zc)) {
 		boolean_t key64 =
 		    !!(zap_getflags(zc.zc_zap) & ZAP_FLAG_UINT64_KEY);
 
 		/* The "0x" prefix must be followed by a hex conversion. */
 		if (key64)
 			(void) printf("\t\t0x%010" PRIx64 " = ",
 			    *(uint64_t *)attrp->za_name);
 		else
 			(void) printf("\t\t%s = ", attrp->za_name);
 
 		if (attrp->za_num_integers == 0) {
 			(void) printf("\n");
 			continue;
 		}
 		prop = umem_zalloc(attrp->za_num_integers *
 		    attrp->za_integer_length, UMEM_NOFAIL);
 
 		/*
 		 * A failed lookup is deliberately ignored: prop was
 		 * zero-filled above, so zeroes are printed in that case.
 		 */
 		if (key64)
 			(void) zap_lookup_uint64(os, object,
 			    (const uint64_t *)attrp->za_name, 1,
 			    attrp->za_integer_length, attrp->za_num_integers,
 			    prop);
 		else
 			(void) zap_lookup(os, object, attrp->za_name,
 			    attrp->za_integer_length, attrp->za_num_integers,
 			    prop);
 
 		if (attrp->za_integer_length == 1 && !key64) {
 			if (strcmp(attrp->za_name,
 			    DSL_CRYPTO_KEY_MASTER_KEY) == 0 ||
 			    strcmp(attrp->za_name,
 			    DSL_CRYPTO_KEY_HMAC_KEY) == 0 ||
 			    strcmp(attrp->za_name, DSL_CRYPTO_KEY_IV) == 0 ||
 			    strcmp(attrp->za_name, DSL_CRYPTO_KEY_MAC) == 0 ||
 			    strcmp(attrp->za_name,
 			    DMU_POOL_CHECKSUM_SALT) == 0) {
 				uint8_t *u8 = prop;
 
 				for (i = 0; i < attrp->za_num_integers; i++) {
 					(void) printf("%02x", u8[i]);
 				}
 			} else {
 				/*
 				 * prop holds exactly za_num_integers bytes,
 				 * so bound the print by that length in case
 				 * the stored value is not NUL-terminated.
 				 */
 				(void) printf("%.*s",
 				    (int)attrp->za_num_integers,
 				    (char *)prop);
 			}
 		} else {
 			for (i = 0; i < attrp->za_num_integers; i++) {
 				switch (attrp->za_integer_length) {
 				case 1:
 					(void) printf("%u ",
 					    ((uint8_t *)prop)[i]);
 					break;
 				case 2:
 					(void) printf("%u ",
 					    ((uint16_t *)prop)[i]);
 					break;
 				case 4:
 					(void) printf("%u ",
 					    ((uint32_t *)prop)[i]);
 					break;
 				case 8:
 					(void) printf("%lld ",
 					    (u_longlong_t)((int64_t *)prop)[i]);
 					break;
 				}
 			}
 		}
 		(void) printf("\n");
 		umem_free(prop,
 		    attrp->za_num_integers * attrp->za_integer_length);
 	}
 	zap_cursor_fini(&zc);
 	zap_attribute_free(attrp);
 }
 
 /*
  * Print the header of a bpobj and, when dump_opt['d'] is at least 5,
  * every block pointer stored in the object.
  *
  *   os, object - object the block pointers are read from
  *   data       - the bpobj_phys_t to print; nothing is printed if NULL
  *   size       - number of valid bytes in data; fields that belong to
  *                later layouts are only read and printed when size is
  *                large enough to contain them
  */
 static void
 dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	bpobj_phys_t *bpop = data;
 	uint64_t i;
 	char bytes[32], comp[32], uncomp[32];
 
 	/* make sure the output won't get truncated */
 	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
 	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
 	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");
 
 	if (bpop == NULL)
 		return;
 
 	zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes));
 
 	(void) printf("\t\tnum_blkptrs = %llu\n",
 	    (u_longlong_t)bpop->bpo_num_blkptrs);
 	(void) printf("\t\tbytes = %s\n", bytes);
 	if (size >= BPOBJ_SIZE_V1) {
 		/* Only touch these fields once we know they are present. */
 		zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp));
 		zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp));
 
 		(void) printf("\t\tcomp = %s\n", comp);
 		(void) printf("\t\tuncomp = %s\n", uncomp);
 	}
 	if (size >= BPOBJ_SIZE_V2) {
 		(void) printf("\t\tsubobjs = %llu\n",
 		    (u_longlong_t)bpop->bpo_subobjs);
 		(void) printf("\t\tnum_subobjs = %llu\n",
 		    (u_longlong_t)bpop->bpo_num_subobjs);
 	}
 	if (size >= sizeof (*bpop)) {
 		(void) printf("\t\tnum_freed = %llu\n",
 		    (u_longlong_t)bpop->bpo_num_freed);
 	}
 
 	if (dump_opt['d'] < 5)
 		return;
 
 	/* The object's data is an array of bpo_num_blkptrs blkptr_t's. */
 	for (i = 0; i < bpop->bpo_num_blkptrs; i++) {
 		char blkbuf[BP_SPRINTF_LEN];
 		blkptr_t bp;
 
 		int err = dmu_read(os, object,
 		    i * sizeof (bp), sizeof (bp), &bp, 0);
 		if (err != 0) {
 			(void) printf("got error %d from dmu_read\n", err);
 			break;
 		}
 		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp,
 		    BP_GET_FREE(&bp));
 		(void) printf("\t%s\n", blkbuf);
 	}
 }
 
 /*
  * Print the contents of a bpobj sub-object array, one object number per
  * line.  The whole object (doi_max_offset bytes) is read as an array of
  * 64-bit words and printed up to and including the last nonzero word,
  * so trailing zero entries are omitted.  The data and size arguments are
  * unused.
  */
 static void
 dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) data;
 	(void) size;
 	dmu_object_info_t doi;
 
 	VERIFY0(dmu_object_info(os, object, &doi));
 
 	uint64_t nbytes = doi.doi_max_offset;
 	uint64_t *subobjs = kmem_alloc(nbytes, KM_SLEEP);
 
 	int err = dmu_read(os, object, 0, nbytes, subobjs, 0);
 	if (err != 0) {
 		(void) printf("got error %u from dmu_read\n", err);
 		kmem_free(subobjs, nbytes);
 		return;
 	}
 
 	/* Step back from the end until a nonzero entry is found. */
 	int64_t last_nonzero = (int64_t)(nbytes / 8) - 1;
 	while (last_nonzero >= 0 && subobjs[last_nonzero] == 0)
 		last_nonzero--;
 
 	for (int64_t n = 0; n <= last_nonzero; n++)
 		(void) printf("\t%llu\n", (u_longlong_t)subobjs[n]);
 
 	kmem_free(subobjs, nbytes);
 }
 
 /*
  * Print only the ZAP statistics of a DDT ZAP object.  The data and size
  * arguments are unused.
  */
 static void
 dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) data;
 	(void) size;
 
 	/* contents are printed elsewhere, properly decoded */
 	dump_zap_stats(os, object);
 }
 
 /*
  * Print a ZAP object of SA attribute registrations: the ZAP statistics,
  * then one line per entry showing the name, the raw first integer in
  * hex and the three fields decoded from it with ATTR_LENGTH(),
  * ATTR_BSWAP() and ATTR_NUM().  Entries without a value print only the
  * name.  The data and size arguments are unused.
  */
 static void
 dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) data;
 	(void) size;
 	zap_cursor_t zc;
 	zap_attribute_t *attrp = zap_attribute_alloc();
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	zap_cursor_init(&zc, os, object);
 	while (zap_cursor_retrieve(&zc, attrp) == 0) {
 		(void) printf("\t\t%s = ", attrp->za_name);
 
 		if (attrp->za_num_integers != 0) {
 			(void) printf(" %llx : [%d:%d:%d]\n",
 			    (u_longlong_t)attrp->za_first_integer,
 			    (int)ATTR_LENGTH(attrp->za_first_integer),
 			    (int)ATTR_BSWAP(attrp->za_first_integer),
 			    (int)ATTR_NUM(attrp->za_first_integer));
 		} else {
 			(void) printf("\n");
 		}
 
 		zap_cursor_advance(&zc);
 	}
 
 	zap_cursor_fini(&zc);
 	zap_attribute_free(attrp);
 }
 
 /*
  * Print a ZAP object of SA layouts: the ZAP statistics, then one line
  * per entry of the form "name = [ a  b  ... ]", where the values are the
  * entry's 2-byte integers.  An entry without a value prints only
  * "name = [" followed by a newline.  The data and size arguments are
  * unused.
  */
 static void
 dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) data;
 	(void) size;
 	zap_cursor_t zc;
 	zap_attribute_t *attrp = zap_attribute_alloc();
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	zap_cursor_init(&zc, os, object);
 	while (zap_cursor_retrieve(&zc, attrp) == 0) {
 		(void) printf("\t\t%s = [", attrp->za_name);
 
 		if (attrp->za_num_integers != 0) {
 			uint16_t *layout_attrs;
 			uint64_t nbytes;
 
 			VERIFY(attrp->za_integer_length == 2);
 			nbytes = attrp->za_num_integers *
 			    attrp->za_integer_length;
 			layout_attrs = umem_zalloc(nbytes, UMEM_NOFAIL);
 
 			VERIFY(zap_lookup(os, object, attrp->za_name,
 			    attrp->za_integer_length,
 			    attrp->za_num_integers, layout_attrs) == 0);
 
 			for (unsigned i = 0; i < attrp->za_num_integers; i++)
 				(void) printf(" %d ", (int)layout_attrs[i]);
 			(void) printf("]\n");
 
 			umem_free(layout_attrs, nbytes);
 		} else {
 			(void) printf("\n");
 		}
 
 		zap_cursor_advance(&zc);
 	}
 
 	zap_cursor_fini(&zc);
 	zap_attribute_free(attrp);
 }
 
 /*
  * Print a directory ZAP: the ZAP statistics, then one line per entry
  * giving the name, the object number extracted with ZFS_DIRENT_OBJ()
  * and a description of the type extracted with ZFS_DIRENT_TYPE().  The
  * data and size arguments are unused.
  */
 static void
 dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) data;
 	(void) size;
 	/* Indexed by the value of ZFS_DIRENT_TYPE(). */
 	static const char *const typenames[] = {
 		/* 0 */ "not specified",
 		/* 1 */ "FIFO",
 		/* 2 */ "Character Device",
 		/* 3 */ "3 (invalid)",
 		/* 4 */ "Directory",
 		/* 5 */ "5 (invalid)",
 		/* 6 */ "Block Device",
 		/* 7 */ "7 (invalid)",
 		/* 8 */ "Regular File",
 		/* 9 */ "9 (invalid)",
 		/* 10 */ "Symbolic Link",
 		/* 11 */ "11 (invalid)",
 		/* 12 */ "Socket",
 		/* 13 */ "Door",
 		/* 14 */ "Event Port",
 		/* 15 */ "15 (invalid)",
 	};
 	zap_cursor_t zc;
 	zap_attribute_t *attrp = zap_attribute_long_alloc();
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	zap_cursor_init(&zc, os, object);
 	while (zap_cursor_retrieve(&zc, attrp) == 0) {
 		(void) printf("\t\t%s = %lld (type: %s)\n",
 		    attrp->za_name, ZFS_DIRENT_OBJ(attrp->za_first_integer),
 		    typenames[ZFS_DIRENT_TYPE(attrp->za_first_integer)]);
 		zap_cursor_advance(&zc);
 	}
 
 	zap_cursor_fini(&zc);
 	zap_attribute_free(attrp);
 }
 
 /*
  * Count the leaf vdevs at or below vd that have a DTL space map whose
  * sm_dbuf size equals sizeof (space_map_phys_t).  Interior vdevs
  * contribute the sum over their children.
  */
 static int
 get_dtl_refcount(vdev_t *vd)
 {
 	if (!vd->vdev_ops->vdev_op_leaf) {
 		int total = 0;
 
 		for (unsigned c = 0; c < vd->vdev_children; c++)
 			total += get_dtl_refcount(vd->vdev_child[c]);
 		return (total);
 	}
 
 	space_map_t *sm = vd->vdev_dtl_sm;
 
 	if (sm == NULL)
 		return (0);
 	return (sm->sm_dbuf->db_size == sizeof (space_map_phys_t) ? 1 : 0);
 }
 
 /*
  * Count the metaslabs at or below vd that have a space map whose
  * sm_dbuf size equals sizeof (space_map_phys_t).  Metaslabs are only
  * examined on top-level vdevs; children are visited recursively.
  */
 static int
 get_metaslab_refcount(vdev_t *vd)
 {
 	int total = 0;
 
 	if (vd->vdev_top == vd) {
 		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 			space_map_t *sm = vd->vdev_ms[m]->ms_sm;
 
 			if (sm == NULL)
 				continue;
 			if (sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
 				total++;
 		}
 	}
 
 	for (unsigned c = 0; c < vd->vdev_children; c++)
 		total += get_metaslab_refcount(vd->vdev_child[c]);
 
 	return (total);
 }
 
 /*
  * Count the top-level vdevs at or below vd that have an obsolete space
  * map object whose bonus size equals sizeof (space_map_phys_t).  A vdev
  * that is not top-level, or has no such object, is asserted to have no
  * obsolete space map at all.
  */
 static int
 get_obsolete_refcount(vdev_t *vd)
 {
 	uint64_t obsolete_sm_object;
 	int total = 0;
 
 	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
 
 	if (vd->vdev_top != vd || obsolete_sm_object == 0) {
 		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
 		ASSERT3U(obsolete_sm_object, ==, 0);
 	} else {
 		dmu_object_info_t doi;
 
 		VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset,
 		    obsolete_sm_object, &doi));
 		if (doi.doi_bonus_size == sizeof (space_map_phys_t))
 			total = 1;
 	}
 
 	for (unsigned c = 0; c < vd->vdev_children; c++)
 		total += get_obsolete_refcount(vd->vdev_child[c]);
 
 	return (total);
 }
 
 /*
  * Return 1 if the pool records a previous obsolete space map object
  * (scip_prev_obsolete_sm_object) whose bonus size equals
  * sizeof (space_map_phys_t), and 0 otherwise.
  */
 static int
 get_prev_obsolete_spacemap_refcount(spa_t *spa)
 {
 	dmu_object_info_t doi;
 	uint64_t prev_obj =
 	    spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object;
 
 	if (prev_obj == 0)
 		return (0);
 
 	VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi));
 
 	return (doi.doi_bonus_size == sizeof (space_map_phys_t) ? 1 : 0);
 }
 
 /*
  * Count the top-level vdevs at or below vd whose top-level ZAP contains
  * the VDEV_TOP_ZAP_POOL_CHECKPOINT_SM entry.
  */
 static int
 get_checkpoint_refcount(vdev_t *vd)
 {
 	int total = 0;
 
 	/* The ZAP is only consulted for top-level vdevs that have one. */
 	if (vd->vdev_top == vd && vd->vdev_top_zap != 0) {
 		if (zap_contains(spa_meta_objset(vd->vdev_spa),
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0)
 			total = 1;
 	}
 
 	for (uint64_t child = 0; child < vd->vdev_children; child++)
 		total += get_checkpoint_refcount(vd->vdev_child[child]);
 
 	return (total);
 }
 
 /*
  * Return the number of nodes in the pool's spa_sm_logs_by_txg tree.
  */
 static int
 get_log_spacemap_refcount(spa_t *spa)
 {
 	int nlogs = avl_numnodes(&spa->spa_sm_logs_by_txg);
 
 	return (nlogs);
 }
 
 /*
  * Compare the refcount recorded for SPA_FEATURE_SPACEMAP_HISTOGRAM with
  * the number of space maps found by walking the pool (DTL, metaslab,
  * obsolete, previous obsolete, checkpoint and log space maps).
  *
  * Returns 0 when the two agree; otherwise prints both values and
  * returns 2.
  */
 static int
 verify_spacemap_refcounts(spa_t *spa)
 {
 	vdev_t *rvd;
 	uint64_t expected_refcount = 0;
 	uint64_t actual_refcount = 0;
 
 	/* On failure expected_refcount keeps its initial value of 0. */
 	(void) feature_get_refcount(spa,
 	    &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
 	    &expected_refcount);
 
 	rvd = spa->spa_root_vdev;
 	actual_refcount += get_dtl_refcount(rvd);
 	actual_refcount += get_metaslab_refcount(rvd);
 	actual_refcount += get_obsolete_refcount(rvd);
 	actual_refcount += get_prev_obsolete_spacemap_refcount(spa);
 	actual_refcount += get_checkpoint_refcount(rvd);
 	actual_refcount += get_log_spacemap_refcount(spa);
 
 	if (expected_refcount == actual_refcount)
 		return (0);
 
 	(void) printf("space map refcount mismatch: expected %lld != "
 	    "actual %lld\n",
 	    (longlong_t)expected_refcount,
 	    (longlong_t)actual_refcount);
 	return (2);
 }
 
 /*
  * Print a space map.  Does nothing if sm is NULL.
  *
  * The object number, smp_length and smp_alloc are always printed.  The
  * individual entries are printed only when dump_opt['d'] is at least 6
  * or dump_opt['m'] is at least 4.  In that case the object is read from
  * os one 64-bit word at a time and every entry is decoded; a running
  * total of allocated minus freed space is kept and, if it differs from
  * space_map_allocated(sm), a message reporting the inconsistency is
  * printed.
  */
 static void
 dump_spacemap(objset_t *os, space_map_t *sm)
 {
 	/* Names for the value returned by SM_DEBUG_ACTION_DECODE(). */
 	const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
 	    "INVALID", "INVALID", "INVALID", "INVALID" };
 
 	if (sm == NULL)
 		return;
 
 	(void) printf("space map object %llu:\n",
 	    (longlong_t)sm->sm_object);
 	(void) printf("  smp_length = 0x%llx\n",
 	    (longlong_t)sm->sm_phys->smp_length);
 	(void) printf("  smp_alloc = 0x%llx\n",
 	    (longlong_t)sm->sm_phys->smp_alloc);
 
 	if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
 		return;
 
 	/*
 	 * Print out the freelist entries in both encoded and decoded form.
 	 */
 	uint8_t mapshift = sm->sm_shift;
 	int64_t alloc = 0;
 	uint64_t word, entry_id = 0;
 	for (uint64_t offset = 0; offset < space_map_length(sm);
 	    offset += sizeof (word)) {
 
 		VERIFY0(dmu_read(os, space_map_object(sm), offset,
 		    sizeof (word), &word, DMU_READ_PREFETCH));
 
 		/*
 		 * Debug entries carry a txg and sync pass rather than a
 		 * range; one whose txg decodes to 0 is shown as padding.
 		 */
 		if (sm_entry_is_debug(word)) {
 			uint64_t de_txg = SM_DEBUG_TXG_DECODE(word);
 			uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word);
 			if (de_txg == 0) {
 				(void) printf(
 				    "\t    [%6llu] PADDING\n",
 				    (u_longlong_t)entry_id);
 			} else {
 				(void) printf(
 				    "\t    [%6llu] %s: txg %llu pass %llu\n",
 				    (u_longlong_t)entry_id,
 				    ddata[SM_DEBUG_ACTION_DECODE(word)],
 				    (u_longlong_t)de_txg,
 				    (u_longlong_t)de_sync_pass);
 			}
 			entry_id++;
 			continue;
 		}
 
 		uint8_t words;
 		char entry_type;
 		uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;
 
 		if (sm_entry_is_single_word(word)) {
 			entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
 			    'A' : 'F';
 			entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
 			    sm->sm_start;
 			entry_run = SM_RUN_DECODE(word) << mapshift;
 			words = 1;
 		} else {
 			/* it is a two-word entry so we read another word */
 			ASSERT(sm_entry_is_double_word(word));
 
 			/*
 			 * Advancing offset here makes the loop increment
 			 * step past both words of this entry.
 			 */
 			uint64_t extra_word;
 			offset += sizeof (extra_word);
 			VERIFY0(dmu_read(os, space_map_object(sm), offset,
 			    sizeof (extra_word), &extra_word,
 			    DMU_READ_PREFETCH));
 
 			ASSERT3U(offset, <=, space_map_length(sm));
 
 			/*
 			 * The run and vdev come from the first word, the
 			 * type and offset from the second.
 			 */
 			entry_run = SM2_RUN_DECODE(word) << mapshift;
 			entry_vdev = SM2_VDEV_DECODE(word);
 			entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
 			    'A' : 'F';
 			entry_off = (SM2_OFFSET_DECODE(extra_word) <<
 			    mapshift) + sm->sm_start;
 			words = 2;
 		}
 
 		(void) printf("\t    [%6llu]    %c  range:"
 		    " %010llx-%010llx  size: %06llx vdev: %06llu words: %u\n",
 		    (u_longlong_t)entry_id,
 		    entry_type, (u_longlong_t)entry_off,
 		    (u_longlong_t)(entry_off + entry_run),
 		    (u_longlong_t)entry_run,
 		    (u_longlong_t)entry_vdev, words);
 
 		/* Allocations add to the running total, frees subtract. */
 		if (entry_type == 'A')
 			alloc += entry_run;
 		else
 			alloc -= entry_run;
 		entry_id++;
 	}
 	/*
 	 * The first value printed is space_map_allocated(sm); the second
 	 * is the total computed from the entries above.
 	 */
 	if (alloc != space_map_allocated(sm)) {
 		(void) printf("space_map_object alloc (%lld) INCONSISTENT "
 		    "with space map summary (%lld)\n",
 		    (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
 	}
 }
 
 /*
  * Print a summary of a metaslab's allocatable range tree: the number of
  * nodes in its by-size btree, the largest allocatable size, the
  * percentage of the metaslab that is free, and the tree's histogram.
  * It reads msp->ms_allocatable directly; the caller in this file loads
  * the metaslab and holds ms_lock around the call.
  */
 static void
 dump_metaslab_stats(metaslab_t *msp)
 {
 	char maxbuf[32];
 	zfs_range_tree_t *rt = msp->ms_allocatable;
 	zfs_btree_t *t = &msp->ms_allocatable_by_size;
 	/* Free space in the range tree as a percentage of ms_size. */
 	int free_pct = zfs_range_tree_space(rt) * 100 / msp->ms_size;
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (maxbuf) >= NN_NUMBUF_SZ, "maxbuf truncated");
 
 	zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf));
 
 	(void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
 	    "segments", zfs_btree_numnodes(t), "maxsize", maxbuf,
 	    "freepct", free_pct);
 	(void) printf("\tIn-memory histogram:\n");
-	dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
+	dump_histogram(rt->rt_histogram, ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0);
 }
 
 /*
  * Print one metaslab: a summary line (id, offset, space map object and
  * free space), then, depending on the options:
  *
  *   - with dump_opt['m'] > 2 and without dump_opt['L'], the metaslab is
  *     loaded under ms_lock, its range tree is verified and its
  *     in-memory statistics are printed, after which it is unloaded;
  *   - with dump_opt['m'] > 1, when the metaslab has a space map and the
  *     space map histogram feature is active, the on-disk histogram;
  *   - the space map itself, via dump_spacemap();
  *   - when the log space map feature is active, the unflushed txg.
  */
 static void
 dump_metaslab(metaslab_t *msp)
 {
 	vdev_t *vd = msp->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	space_map_t *sm = msp->ms_sm;
 	uint64_t free_bytes = msp->ms_size - space_map_allocated(sm);
 	char freebuf[32];
 
 	zdb_nicenum(free_bytes, freebuf, sizeof (freebuf));
 
 	(void) printf(
 	    "\tmetaslab %6llu   offset %12llx   spacemap %6llu   free    %5s\n",
 	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
 	    (u_longlong_t)space_map_object(sm), freebuf);
 
 	int want_stats = (dump_opt['m'] > 2 && !dump_opt['L']);
 	if (want_stats) {
 		mutex_enter(&msp->ms_lock);
 		VERIFY0(metaslab_load(msp));
 		zfs_range_tree_stat_verify(msp->ms_allocatable);
 		dump_metaslab_stats(msp);
 		metaslab_unload(msp);
 		mutex_exit(&msp->ms_lock);
 	}
 
 	if (dump_opt['m'] > 1 && sm != NULL &&
 	    spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
 		/*
 		 * The space map histogram represents free space in chunks
 		 * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
 		 */
 		(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
 		    (u_longlong_t)msp->ms_fragmentation);
 		dump_histogram(sm->sm_phys->smp_histogram,
 		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
 	}
 
 	/* dRAID metaslabs may be smaller than 2^vdev_ms_shift. */
 	if (vd->vdev_ops == &vdev_draid_ops) {
 		ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift);
 	} else {
 		ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift);
 	}
 
 	dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
 
 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
 		return;
 
 	(void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n",
 	    (u_longlong_t)metaslab_unflushed_txg(msp));
 }
 
 /*
  * Print the heading for a vdev's metaslab listing: the vdev id, its
  * allocation bias (if any), the ms_unflushed_phys object number when
  * the vdev's top-level ZAP records one, and the column titles.
  */
 static void
 print_vdev_metaslab_header(vdev_t *vd)
 {
 	const char *bias_str = "";
 	uint64_t ms_flush_data_obj = 0;
 
 	/* A log vdev is labelled as such whatever its recorded bias is. */
 	if (vd->vdev_islog) {
 		bias_str = VDEV_ALLOC_BIAS_LOG;
 	} else {
 		switch (vd->vdev_alloc_bias) {
 		case VDEV_BIAS_LOG:
 			bias_str = VDEV_ALLOC_BIAS_LOG;
 			break;
 		case VDEV_BIAS_SPECIAL:
 			bias_str = VDEV_ALLOC_BIAS_SPECIAL;
 			break;
 		case VDEV_BIAS_DEDUP:
 			bias_str = VDEV_ALLOC_BIAS_DEDUP;
 			break;
 		default:
 			break;
 		}
 	}
 
 	if (vd->vdev_top_zap != 0) {
 		int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
 		    sizeof (uint64_t), 1, &ms_flush_data_obj);
 
 		/* A missing entry is fine; any other failure is not. */
 		if (error != ENOENT) {
 			ASSERT0(error);
 		}
 	}
 
 	(void) printf("\tvdev %10llu   %s",
 	    (u_longlong_t)vd->vdev_id, bias_str);
 
 	if (ms_flush_data_obj != 0) {
 		(void) printf("   ms_unflushed_phys object %llu",
 		    (u_longlong_t)ms_flush_data_obj);
 	}
 
 	(void) printf("\n\t%-10s%5llu   %-19s   %-15s   %-12s\n",
 	    "metaslabs", (u_longlong_t)vd->vdev_ms_count,
 	    "offset", "spacemap", "free");
 	(void) printf("\t%15s   %19s   %15s   %12s\n",
 	    "---------------", "-------------------",
 	    "---------------", "------------");
 }
 
 /*
  * Print the fragmentation and histogram of each top-level vdev's
  * metaslab group, followed by the fragmentation and histogram of the
  * pool's normal class.
  *
  * Only groups that belong to the normal class are shown, plus those of
  * the special class when show_special is set.  As a side effect each
  * displayed group's mg_fragmentation is recomputed and stored.
  */
 static void
 dump_metaslab_groups(spa_t *spa, boolean_t show_special)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	metaslab_class_t *mc = spa_normal_class(spa);
 	metaslab_class_t *smc = spa_special_class(spa);
 	uint64_t fragmentation;
 
 	metaslab_class_histogram_verify(mc);
 
 	for (unsigned c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		metaslab_group_t *mg = tvd->vdev_mg;
 
 		/* Skip vdevs without a group and groups of other classes. */
 		if (mg == NULL || (mg->mg_class != mc &&
 		    (!show_special || mg->mg_class != smc)))
 			continue;
 
 		metaslab_group_histogram_verify(mg);
 		mg->mg_fragmentation = metaslab_group_fragmentation(mg);
 
 		(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
 		    "fragmentation",
 		    (u_longlong_t)tvd->vdev_id,
 		    (u_longlong_t)tvd->vdev_ms_count);
 		/* ZFS_FRAG_INVALID is shown as "-" instead of a percentage. */
 		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
 			(void) printf("%3s\n", "-");
 		} else {
 			(void) printf("%3llu%%\n",
 			    (u_longlong_t)mg->mg_fragmentation);
 		}
-		dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
+		dump_histogram(mg->mg_histogram,
+		    ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0);
 	}
 
 	/* Pool-wide figures are taken from the normal class only. */
 	(void) printf("\tpool %s\tfragmentation", spa_name(spa));
 	fragmentation = metaslab_class_fragmentation(mc);
 	if (fragmentation == ZFS_FRAG_INVALID)
 		(void) printf("\t%3s\n", "-");
 	else
 		(void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation);
-	dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
+	dump_histogram(mc->mc_histogram, ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0);
 }
 
 /*
  * Print the indirect state of a vdev.  Nothing is printed for a vdev
  * without an indirect mapping.
  *
  * The births object (entry count and each offset -> txg pair) and a
  * summary of the mapping object are always printed.  The individual
  * mapping entries, with their obsolete counts, and the obsolete space
  * map are printed only when dump_opt['d'] exceeds 5 or dump_opt['m']
  * exceeds 3.
  */
 static void
 print_vdev_indirect(vdev_t *vd)
 {
 	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 	vdev_indirect_births_t *vib = vd->vdev_indirect_births;
 	uint64_t obsolete_sm_object;
 	uint64_t n;
 
 	if (vim == NULL) {
 		ASSERT3P(vib, ==, NULL);
 		return;
 	}
 
 	ASSERT3U(vdev_indirect_mapping_object(vim), ==,
 	    vic->vic_mapping_object);
 	ASSERT3U(vdev_indirect_births_object(vib), ==,
 	    vic->vic_births_object);
 
 	(void) printf("indirect births obj %llu:\n",
 	    (longlong_t)vic->vic_births_object);
 	(void) printf("    vib_count = %llu\n",
 	    (longlong_t)vdev_indirect_births_count(vib));
 	for (n = 0; n < vdev_indirect_births_count(vib); n++) {
 		(void) printf("\toffset %llx -> txg %llu\n",
 		    (longlong_t)vib->vib_entries[n].vibe_offset,
 		    (longlong_t)vib->vib_entries[n].vibe_phys_birth_txg);
 	}
 	(void) printf("\n");
 
 	(void) printf("indirect mapping obj %llu:\n",
 	    (longlong_t)vic->vic_mapping_object);
 	(void) printf("    vim_max_offset = 0x%llx\n",
 	    (longlong_t)vdev_indirect_mapping_max_offset(vim));
 	(void) printf("    vim_bytes_mapped = 0x%llx\n",
 	    (longlong_t)vdev_indirect_mapping_bytes_mapped(vim));
 	(void) printf("    vim_count = %llu\n",
 	    (longlong_t)vdev_indirect_mapping_num_entries(vim));
 
 	if (!(dump_opt['d'] > 5 || dump_opt['m'] > 3))
 		return;
 
 	/*
 	 * NOTE(review): counts is not released anywhere in this function;
 	 * confirm whether it is meant to be freed here.
 	 */
 	uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim);
 
 	/* Each line is <vdev:src offset:asize> -> <vdev:offset:asize>. */
 	for (n = 0; n < vdev_indirect_mapping_num_entries(vim); n++) {
 		vdev_indirect_mapping_entry_phys_t *vimep =
 		    &vim->vim_entries[n];
 
 		(void) printf("\t<%llx:%llx:%llx> -> "
 		    "<%llx:%llx:%llx> (%x obsolete)\n",
 		    (longlong_t)vd->vdev_id,
 		    (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
 		    (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
 		    (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst),
 		    (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst),
 		    (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
 		    counts[n]);
 	}
 	(void) printf("\n");
 
 	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
 	if (obsolete_sm_object == 0)
 		return;
 
 	(void) printf("obsolete space map object %llu:\n",
 	    (u_longlong_t)obsolete_sm_object);
 	ASSERT(vd->vdev_obsolete_sm != NULL);
 	ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==,
 	    obsolete_sm_object);
 	dump_spacemap(vd->vdev_spa->spa_meta_objset, vd->vdev_obsolete_sm);
 	(void) printf("\n");
 }
 
 /*
  * Print the metaslabs of the pool.
  *
  * By default every top-level vdev is listed with all of its metaslabs.
  * When dump_opt['d'] is not set and zopt_metaslab[] holds arguments,
  * the first one selects a single vdev (an out-of-range id is passed to
  * fatal()), and any further ones select individual metaslabs of that
  * vdev; an out-of-range metaslab number is reported on stderr and
  * skipped.
  */
 static void
 dump_metaslabs(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t first = 0;
 	uint64_t end = rvd->vdev_children;
 
 	(void) printf("\nMetaslabs:\n");
 
 	if (!dump_opt['d'] && zopt_metaslab_args > 0) {
 		first = zopt_metaslab[0];
 
 		if (first >= end) {
 			(void) fatal("bad vdev id: %llu",
 			    (u_longlong_t)first);
 		}
 
 		if (zopt_metaslab_args > 1) {
 			/* Explicit metaslab list: dump just those. */
 			vdev_t *vd = rvd->vdev_child[first];
 
 			print_vdev_metaslab_header(vd);
 
 			for (uint64_t a = 1; a < zopt_metaslab_args; a++) {
 				if (zopt_metaslab[a] >= vd->vdev_ms_count) {
 					(void) fprintf(stderr, "bad metaslab "
 					    "number %llu\n",
 					    (u_longlong_t)zopt_metaslab[a]);
 					continue;
 				}
 				dump_metaslab(vd->vdev_ms[zopt_metaslab[a]]);
 			}
 			(void) printf("\n");
 			return;
 		}
 
 		/* Only a vdev was named: restrict the walk below to it. */
 		end = first + 1;
 	}
 
 	for (uint64_t c = first; c < end; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
 
 		print_vdev_metaslab_header(vd);
 		print_vdev_indirect(vd);
 
 		for (uint64_t m = 0; m < vd->vdev_ms_count; m++)
 			dump_metaslab(vd->vdev_ms[m]);
 		(void) printf("\n");
 	}
 }
 
 /*
  * Print every log space map in the pool, in the order of the
  * spa_sm_logs_by_txg tree.  Each one is opened, handed to
  * dump_spacemap() and closed again.  Nothing is printed unless the log
  * space map feature is active.
  */
 static void
 dump_log_spacemaps(spa_t *spa)
 {
 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
 		return;
 
 	(void) printf("\nLog Space Maps in Pool:\n");
 
 	spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
 	while (sls != NULL) {
 		space_map_t *sm = NULL;
 
 		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
 		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
 
 		(void) printf("Log Spacemap object %llu txg %llu\n",
 		    (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg);
 		dump_spacemap(spa->spa_meta_objset, sm);
 		space_map_close(sm);
 
 		sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls);
 	}
 
 	(void) printf("\n");
 }
 
 /*
  * Print one dedup table entry.  For each of the table's DDT_NPHYS()
  * physical variants whose birth is nonzero, a block pointer is built
  * from the entry and printed together with the given index, the
  * variant's refcount and the variant number.
  */
 static void
 dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
     uint64_t index)
 {
 	const ddt_key_t *ddk = &ddlwe->ddlwe_key;
 	const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
 	char blkbuf[BP_SPRINTF_LEN];
 	blkptr_t blk;
 
 	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
 		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
 
 		/* Variants with a zero birth are skipped. */
 		if (ddt_phys_birth(ddp, v) != 0) {
 			ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
 			snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
 			(void) printf("index %llx refcnt %llu phys %d %s\n",
 			    (u_longlong_t)index,
 			    (u_longlong_t)ddt_phys_refcnt(ddp, v),
 			    p, blkbuf);
 		}
 	}
 }
 
 /*
  * Print the ratios derived from a set of dedup statistics:
  *
  *   dedup    = referenced dsize / dsize
  *   compress = referenced lsize / referenced psize
  *   copies   = referenced dsize / referenced psize
  *
  * together with dedup * compress / copies.  Nothing is printed when the
  * statistics cover no blocks.
  */
 static void
 dump_dedup_ratio(const ddt_stat_t *dds)
 {
 	if (dds->dds_blocks == 0)
 		return;
 
 	double ref_lsize = (double)dds->dds_ref_lsize;
 	double ref_psize = (double)dds->dds_ref_psize;
 	double ref_dsize = (double)dds->dds_ref_dsize;
 	double dsize = (double)dds->dds_dsize;
 
 	double dedup = ref_dsize / dsize;
 	double compress = ref_lsize / ref_psize;
 	double copies = ref_dsize / ref_psize;
 
 	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
 	    "dedup * compress / copies = %.2f\n\n",
 	    dedup, compress, copies, dedup * compress / copies);
 }
 
 /*
  * Print both logs (ddt_log[0] and ddt_log[1]) of a dedup table.  Only
  * tables of version DDT_VERSION_FDT with DDT_FLAG_LOG set are handled.
  *
  * For each log a summary line is printed (flags, object, length, first
  * txg and entry count), followed by the checkpoint key when
  * DDL_FLAG_CHECKPOINT is set.  The log's entries are printed as well
  * when it is not empty and dump_opt['D'] is at least 4.
  */
 static void
 dump_ddt_log(ddt_t *ddt)
 {
 	if (ddt->ddt_version != DDT_VERSION_FDT)
 		return;
 	if (!(ddt->ddt_flags & DDT_FLAG_LOG))
 		return;
 
 	for (int n = 0; n < 2; n++) {
 		ddt_log_t *ddl = &ddt->ddt_log[n];
 		char flagstr[64] = {0};
 
 		/*
 		 * Build " [FLAG FLAG ...]".  Every name is appended with a
 		 * leading space; the space in front of the first name is
 		 * then overwritten with the opening bracket.
 		 */
 		if (ddl->ddl_flags > 0) {
 			int pos = 1;
 
 			flagstr[0] = ' ';
 			if (ddl->ddl_flags & DDL_FLAG_FLUSHING)
 				pos += strlcpy(&flagstr[pos], " FLUSHING",
 				    sizeof (flagstr) - pos);
 			if (ddl->ddl_flags & DDL_FLAG_CHECKPOINT)
 				pos += strlcpy(&flagstr[pos], " CHECKPOINT",
 				    sizeof (flagstr) - pos);
 			if (ddl->ddl_flags &
 			    ~(DDL_FLAG_FLUSHING|DDL_FLAG_CHECKPOINT))
 				pos += strlcpy(&flagstr[pos], " UNKNOWN",
 				    sizeof (flagstr) - pos);
 			flagstr[1] = '[';
 			flagstr[pos] = ']';
 		}
 
 		uint64_t count = avl_numnodes(&ddl->ddl_tree);
 
 		(void) printf(DMU_POOL_DDT_LOG ": flags=0x%02x%s; obj=%llu; "
 		    "len=%llu; txg=%llu; entries=%llu\n",
 		    zio_checksum_table[ddt->ddt_checksum].ci_name, n,
 		    ddl->ddl_flags, flagstr,
 		    (u_longlong_t)ddl->ddl_object,
 		    (u_longlong_t)ddl->ddl_length,
 		    (u_longlong_t)ddl->ddl_first_txg, (u_longlong_t)count);
 
 		if (ddl->ddl_flags & DDL_FLAG_CHECKPOINT) {
 			const ddt_key_t *ddk = &ddl->ddl_checkpoint;
 
 			(void) printf("    checkpoint: "
 			    "%016llx:%016llx:%016llx:%016llx:%016llx\n",
 			    (u_longlong_t)ddk->ddk_cksum.zc_word[0],
 			    (u_longlong_t)ddk->ddk_cksum.zc_word[1],
 			    (u_longlong_t)ddk->ddk_cksum.zc_word[2],
 			    (u_longlong_t)ddk->ddk_cksum.zc_word[3],
 			    (u_longlong_t)ddk->ddk_prop);
 		}
 
 		if (count == 0 || dump_opt['D'] < 4)
 			continue;
 
 		ddt_lightweight_entry_t ddlwe;
 		uint64_t index = 0;
 		ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
 
 		while (ddle != NULL) {
 			DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
 			dump_ddt_entry(ddt, &ddlwe, index);
 			index++;
 			ddle = AVL_NEXT(&ddl->ddl_tree, ddle);
 		}
 	}
 }
 
 /*
  * Print one dedup table object, identified by its type and class.
  * Objects that do not exist (ENOENT) or hold no entries are skipped
  * silently.
  *
  * The amount printed grows with dump_opt['D']: a one-line summary is
  * always printed, the object's histogram from level 3, and the entries
  * themselves from level 4 (level 5 for the DDT_CLASS_UNIQUE class).
  */
 static void
 dump_ddt_object(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
 {
 	char name[DDT_NAMELEN];
 	ddt_lightweight_entry_t ddlwe;
 	dmu_object_info_t doi;
 	uint64_t count;
 	uint64_t walk = 0;
 	int error;
 
 	error = ddt_object_info(ddt, type, class, &doi);
 	if (error == ENOENT)
 		return;
 	ASSERT(error == 0);
 
 	error = ddt_object_count(ddt, type, class, &count);
 	ASSERT(error == 0);
 	if (count == 0)
 		return;
 
 	/* Space used on disk and in memory, respectively. */
 	uint64_t dspace = doi.doi_physical_blocks_512 << 9;
 	uint64_t mspace = doi.doi_fill_count * doi.doi_data_block_size;
 
 	ddt_object_name(ddt, type, class, name);
 
 	(void) printf("%s: dspace=%llu; mspace=%llu; entries=%llu\n", name,
 	    (u_longlong_t)dspace, (u_longlong_t)mspace, (u_longlong_t)count);
 
 	if (dump_opt['D'] < 3)
 		return;
 
 	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
 
 	if (dump_opt['D'] < 4 ||
 	    (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE))
 		return;
 
 	(void) printf("%s contents:\n\n", name);
 
 	for (;;) {
 		error = ddt_object_walk(ddt, type, class, &walk, &ddlwe);
 		if (error != 0)
 			break;
 		dump_ddt_entry(ddt, &ddlwe, walk);
 	}
 
 	/* The walk is expected to end by running out of entries. */
 	ASSERT3U(error, ==, ENOENT);
 
 	(void) printf("\n");
 }
 
 /*
  * Print one dedup table: a header line (checksum name, version, flags
  * and root object), then every type/class object via dump_ddt_object()
  * and finally the table's logs via dump_ddt_log().  A NULL or
  * unconfigured table is skipped.
  */
 static void
 dump_ddt(ddt_t *ddt)
 {
 	if (ddt == NULL || ddt->ddt_version == DDT_VERSION_UNCONFIGURED)
 		return;
 
 	/*
 	 * Build " [FLAG FLAG ...]".  Every name is appended with a leading
 	 * space; the space in front of the first name is then overwritten
 	 * with the opening bracket.
 	 */
 	char flagstr[64] = {0};
 	if (ddt->ddt_flags > 0) {
 		int pos = 1;
 
 		flagstr[0] = ' ';
 		if (ddt->ddt_flags & DDT_FLAG_FLAT)
 			pos += strlcpy(&flagstr[pos], " FLAT",
 			    sizeof (flagstr) - pos);
 		if (ddt->ddt_flags & DDT_FLAG_LOG)
 			pos += strlcpy(&flagstr[pos], " LOG",
 			    sizeof (flagstr) - pos);
 		if (ddt->ddt_flags & ~DDT_FLAG_MASK)
 			pos += strlcpy(&flagstr[pos], " UNKNOWN",
 			    sizeof (flagstr) - pos);
 		flagstr[1] = '[';
 		flagstr[pos] = ']';
 	}
 
 	const char *verstr = "UNKNOWN";
 	if (ddt->ddt_version == 0)
 		verstr = "LEGACY";
 	else if (ddt->ddt_version == 1)
 		verstr = "FDT";
 
 	(void) printf(
 	    "DDT-%s: version=%llu [%s]; flags=0x%02llx%s; rootobj=%llu\n",
 	    zio_checksum_table[ddt->ddt_checksum].ci_name,
 	    (u_longlong_t)ddt->ddt_version, verstr,
 	    (u_longlong_t)ddt->ddt_flags, flagstr,
 	    (u_longlong_t)ddt->ddt_dir_object);
 
 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++)
 			dump_ddt_object(ddt, type, class);
 	}
 
 	dump_ddt_log(ddt);
 }
 
 /*
  * Dump every per-checksum dedup table of the pool, followed by the
  * aggregated statistics.  With dump_opt['D'] > 1 the aggregated histogram is
  * printed too.  When dump_opt['D'] is exactly 3 and the environment variable
  * ZDB_DDT_UNIQUE_AGE_HIST is set, an age histogram of the unique-class
  * entries is built with ddt_prune_walk() and printed.
  */
 static void
 dump_all_ddts(spa_t *spa)
 {
 	ddt_histogram_t ddh_total = {{{0}}};
 	ddt_stat_t dds_total = {0};
 
 	for (enum zio_checksum cksum = 0; cksum < ZIO_CHECKSUM_FUNCTIONS;
 	    cksum++) {
 		dump_ddt(spa->spa_ddt[cksum]);
 	}
 
 	ddt_get_dedup_stats(spa, &dds_total);
 
 	if (dds_total.dds_blocks == 0) {
 		(void) printf("All DDTs are empty\n");
 		return;
 	}
 
 	(void) printf("\n");
 
 	if (dump_opt['D'] > 1) {
 		(void) printf("DDT histogram (aggregated over all DDTs):\n");
 		ddt_get_dedup_histogram(spa, &ddh_total);
 		zpool_dump_ddt(&dds_total, &ddh_total);
 	}
 
 	dump_dedup_ratio(&dds_total);
 
 	/*
 	 * The unique-class age histogram is opt-in: it requires exactly -DDD
 	 * and the environment variable.
 	 */
 	if (dump_opt['D'] != 3 || getenv("ZDB_DDT_UNIQUE_AGE_HIST") == NULL)
 		return;
 
 	ddt_age_histo_t histogram;
 
 	(void) printf("DDT walk unique, building age histogram...\n");
 	ddt_prune_walk(spa, 0, &histogram);
 
 	/* Nothing to tabulate when the walk found no entries. */
 	if (!(histogram.dah_entries > 0))
 		return;
 
 	(void) printf("%5s  %9s  %4s\n", "age", "blocks", "amnt");
 	(void) printf("%5s  %9s  %4s\n", "-----", "---------", "----");
 	for (int bin = 0; bin < HIST_BINS; bin++) {
 		int blocks = (int)histogram.dah_age_histo[bin];
 		int percent = (int)((histogram.dah_age_histo[bin] * 100) /
 		    histogram.dah_entries);
 
 		(void) printf("%5d  %9d %4d%%\n", 1 << bin, blocks, percent);
 	}
 }
 
 /*
  * Print block-cloning (BRT) statistics for the pool.  The amount of output
  * is driven by dump_opt['T']:
  *   always - pool-wide used/saved space and ratio
  *   >= 2   - per-vdev reference count, used and saved space
  *   == 3   - per-vdev histogram of reference counts
  *   > 3    - every entry, printed as "vdev:offset" and its reference count
  * Nothing beyond a one-line notice is printed when the block cloning
  * feature is not enabled or not active.
  */
 static void
 dump_brt(spa_t *spa)
 {
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING)) {
 		printf("BRT: unsupported on this pool\n");
 		return;
 	}
 
 	if (!spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
 		printf("BRT: empty\n");
 		return;
 	}
 
 	char count[32], used[32], saved[32];
 	zdb_nicebytes(brt_get_used(spa), used, sizeof (used));
 	zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved));
 	/* ratio is printed as a fixed-point value with two decimals */
 	uint64_t ratio = brt_get_ratio(spa);
 	printf("BRT: used %s; saved %s; ratio %llu.%02llux\n", used, saved,
 	    (u_longlong_t)(ratio / 100), (u_longlong_t)(ratio % 100));
 
 	if (dump_opt['T'] < 2)
 		return;
 
 	/* First pass: one summary line per vdev. */
 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
 		if (!brtvd->bv_initiated) {
 			printf("BRT: vdev %" PRIu64 ": empty\n", vdevid);
 			continue;
 		}
 
 		zdb_nicenum(brtvd->bv_totalcount, count, sizeof (count));
 		zdb_nicebytes(brtvd->bv_usedspace, used, sizeof (used));
 		zdb_nicebytes(brtvd->bv_savedspace, saved, sizeof (saved));
 		printf("BRT: vdev %" PRIu64 ": refcnt %s; used %s; saved %s\n",
 		    vdevid, count, used, saved);
 	}
 
 	if (dump_opt['T'] < 3)
 		return;
 
 	/* -TTT shows a per-vdev histograms; -TTTT shows all entries */
 	boolean_t do_histo = dump_opt['T'] == 3;
 
 	char dva[64];
 
 	if (!do_histo)
 		printf("\n%-16s %-10s\n", "DVA", "REFCNT");
 
 	/* Second pass: walk each initiated vdev's entries ZAP. */
 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
 		if (!brtvd->bv_initiated)
 			continue;
 
 		/*
 		 * NOTE(review): counts has 64 slots and is indexed by
 		 * highbit64(refcnt); if highbit64() can return 64 (bit 63
 		 * set) this would index one past the end - confirm.
 		 */
 		uint64_t counts[64] = {};
 
 		zap_cursor_t zc;
 		zap_attribute_t *za = zap_attribute_alloc();
 		for (zap_cursor_init(&zc, spa->spa_meta_objset,
 		    brtvd->bv_mos_entries);
 		    zap_cursor_retrieve(&zc, za) == 0;
 		    zap_cursor_advance(&zc)) {
 			uint64_t refcnt;
 			/*
 			 * The attribute name is reinterpreted as a single
 			 * uint64_t key for the lookup.
 			 */
 			VERIFY0(zap_lookup_uint64(spa->spa_meta_objset,
 			    brtvd->bv_mos_entries,
 			    (const uint64_t *)za->za_name, 1,
 			    za->za_integer_length, za->za_num_integers,
 			    &refcnt));
 
 			if (do_histo)
 				counts[highbit64(refcnt)]++;
 			else {
 				/* the same key doubles as the printed offset */
 				uint64_t offset =
 				    *(const uint64_t *)za->za_name;
 
 				snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx",
 				    vdevid, (u_longlong_t)offset);
 				printf("%-16s %-10llu\n", dva,
 				    (u_longlong_t)refcnt);
 			}
 		}
 		zap_cursor_fini(&zc);
 		zap_attribute_free(za);
 
 		if (do_histo) {
 			printf("\nBRT: vdev %" PRIu64
 			    ": DVAs with 2^n refcnts:\n", vdevid);
 			dump_histogram(counts, 64, 0);
 		}
 	}
 }
 
 /*
  * Range-tree walk callback used by dump_dtl(): print one segment as a
  * half-open interval plus its length.  arg is the prefix string to print
  * in front of the segment.
  */
 static void
 dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
 {
 	char *label = arg;
 	uint64_t end = start + size;
 
 	(void) printf("%s [%llu,%llu) length %llu\n", label,
 	    (u_longlong_t)start, (u_longlong_t)end, (u_longlong_t)(size));
 }
 
 /*
  * Recursively print the dirty time logs of vd and all of its children.
  * indent is the current indentation in columns; the top-level call
  * (indent == 0) also prints the section header.  Each vdev is labelled with
  * its path if it has one, else its vdev type if it has a parent, else the
  * pool name.
  */
 static void
 dump_dtl(vdev_t *vd, int indent)
 {
 	spa_t *spa = vd->vdev_spa;
 	const char *dtl_label[DTL_TYPES] = { "missing", "partial", "scrub",
 		"outage" };
 	char prefix[256];
 	boolean_t required;
 	const char *vdev_label;
 
 	/* vdev_dtl_required() is evaluated inside a vdev state section. */
 	spa_vdev_state_enter(spa, SCL_NONE);
 	required = vdev_dtl_required(vd);
 	(void) spa_vdev_state_exit(spa, NULL, 0);
 
 	if (indent == 0)
 		(void) printf("\nDirty time logs:\n\n");
 
 	if (vd->vdev_path)
 		vdev_label = vd->vdev_path;
 	else if (vd->vdev_parent)
 		vdev_label = vd->vdev_ops->vdev_op_type;
 	else
 		vdev_label = spa_name(spa);
 
 	(void) printf("\t%*s%s [%s]\n", indent, "", vdev_label,
 	    required ? "DTL-required" : "DTL-expendable");
 
 	for (int type = 0; type < DTL_TYPES; type++) {
 		zfs_range_tree_t *rt = vd->vdev_dtl[type];
 
 		/* Only DTLs that actually cover some range are shown. */
 		if (zfs_range_tree_space(rt) != 0) {
 			(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
 			    indent + 2, "", dtl_label[type]);
 			zfs_range_tree_walk(rt, dump_dtl_seg, prefix);
 			if (dump_opt['d'] > 5 && vd->vdev_children == 0) {
 				dump_spacemap(spa->spa_meta_objset,
 				    vd->vdev_dtl_sm);
 			}
 		}
 	}
 
 	for (unsigned child = 0; child < vd->vdev_children; child++)
 		dump_dtl(vd->vdev_child[child], indent + 4);
 }
 
 /*
  * Print the pool history.
  *
  * The on-disk history is read in SPA_OLD_MAXBLOCKSIZE chunks with
  * spa_history_get() and unpacked into an array of nvlists by
  * zpool_history_unpack().  Each record is then printed according to which
  * key it carries (command, legacy internal event, internal event name, or
  * ioctl).  With dump_opt['h'] > 1 the raw nvlist of every record is dumped
  * as well, and records of an unrecognized shape are flagged.
  *
  * The unpacked records and the array holding them are released before
  * returning, on both the normal and the read-error path.
  */
 static void
 dump_history(spa_t *spa)
 {
 	nvlist_t **events = NULL;
 	char *buf;
 	uint64_t resid, len, off = 0;
 	uint_t num = 0;
 	int error;
 	char tbuf[30];
 
 	if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) {
 		(void) fprintf(stderr, "%s: unable to allocate I/O buffer\n",
 		    __func__);
 		return;
 	}
 
 	do {
 		len = SPA_OLD_MAXBLOCKSIZE;
 
 		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
 			(void) fprintf(stderr, "Unable to read history: "
 			    "error %d\n", error);
 			/* Records unpacked so far must still be released. */
 			goto out;
 		}
 
 		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
 			break;
 
 		/* Re-read the partial trailing record on the next pass. */
 		off -= resid;
 	} while (len != 0);
 
 	(void) printf("\nHistory:\n");
 	for (unsigned i = 0; i < num; i++) {
 		boolean_t printed = B_FALSE;
 
 		if (nvlist_exists(events[i], ZPOOL_HIST_TIME)) {
 			time_t tsec;
 			struct tm t;
 
 			tsec = fnvlist_lookup_uint64(events[i],
 			    ZPOOL_HIST_TIME);
 			(void) localtime_r(&tsec, &t);
 			(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
 		} else {
 			tbuf[0] = '\0';
 		}
 
 		if (nvlist_exists(events[i], ZPOOL_HIST_CMD)) {
 			(void) printf("%s %s\n", tbuf,
 			    fnvlist_lookup_string(events[i], ZPOOL_HIST_CMD));
 		} else if (nvlist_exists(events[i], ZPOOL_HIST_INT_EVENT)) {
 			uint64_t ievent;
 
 			ievent = fnvlist_lookup_uint64(events[i],
 			    ZPOOL_HIST_INT_EVENT);
 			/* Event numbers beyond the name table are skipped. */
 			if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
 				goto next;
 
 			/* %ju takes a uintmax_t, so convert explicitly. */
 			(void) printf(" %s [internal %s txg:%ju] %s\n",
 			    tbuf,
 			    zfs_history_event_names[ievent],
 			    (uintmax_t)fnvlist_lookup_uint64(events[i],
 			    ZPOOL_HIST_TXG),
 			    fnvlist_lookup_string(events[i],
 			    ZPOOL_HIST_INT_STR));
 		} else if (nvlist_exists(events[i], ZPOOL_HIST_INT_NAME)) {
 			(void) printf("%s [txg:%ju] %s", tbuf,
 			    (uintmax_t)fnvlist_lookup_uint64(events[i],
 			    ZPOOL_HIST_TXG),
 			    fnvlist_lookup_string(events[i],
 			    ZPOOL_HIST_INT_NAME));
 
 			if (nvlist_exists(events[i], ZPOOL_HIST_DSNAME)) {
 				(void) printf(" %s (%llu)",
 				    fnvlist_lookup_string(events[i],
 				    ZPOOL_HIST_DSNAME),
 				    (u_longlong_t)fnvlist_lookup_uint64(
 				    events[i],
 				    ZPOOL_HIST_DSID));
 			}
 
 			(void) printf(" %s\n", fnvlist_lookup_string(events[i],
 			    ZPOOL_HIST_INT_STR));
 		} else if (nvlist_exists(events[i], ZPOOL_HIST_IOCTL)) {
 			(void) printf("%s ioctl %s\n", tbuf,
 			    fnvlist_lookup_string(events[i],
 			    ZPOOL_HIST_IOCTL));
 
 			if (nvlist_exists(events[i], ZPOOL_HIST_INPUT_NVL)) {
 				(void) printf("    input:\n");
 				dump_nvlist(fnvlist_lookup_nvlist(events[i],
 				    ZPOOL_HIST_INPUT_NVL), 8);
 			}
 			if (nvlist_exists(events[i], ZPOOL_HIST_OUTPUT_NVL)) {
 				(void) printf("    output:\n");
 				dump_nvlist(fnvlist_lookup_nvlist(events[i],
 				    ZPOOL_HIST_OUTPUT_NVL), 8);
 			}
 			if (nvlist_exists(events[i], ZPOOL_HIST_ERRNO)) {
 				(void) printf("    errno: %lld\n",
 				    (longlong_t)fnvlist_lookup_int64(events[i],
 				    ZPOOL_HIST_ERRNO));
 			}
 		} else {
 			goto next;
 		}
 
 		printed = B_TRUE;
 next:
 		if (dump_opt['h'] > 1) {
 			if (!printed)
 				(void) printf("unrecognized record:\n");
 			dump_nvlist(events[i], 2);
 		}
 	}
 
 out:
 	/* Release every unpacked record, then the array and the I/O buffer. */
 	for (unsigned i = 0; i < num; i++)
 		nvlist_free(events[i]);
 	free(events);
 	free(buf);
 }
 
 /*
  * Placeholder with the same signature as dump_dsl_dir() and
  * dump_dsl_dataset(): every argument is ignored and nothing is printed.
  */
 static void
 dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os;
 	(void) object;
 	(void) data;
 	(void) size;
 }
 
 /*
  * Convert the block id in bookmark zb into an offset.
  *
  * With a dnode, the block id is scaled by the span of a block at zb's level
  * (derived from dn_indblkshift) and by the dnode's data block size in bytes.
  * Without a dnode (zb_level must be negative), the block id is returned
  * as-is for object 0 and multiplied by the block pointer's logical size
  * otherwise.
  */
 static uint64_t
 blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
     const zbookmark_phys_t *zb)
 {
 	if (dnp != NULL) {
 		ASSERT(zb->zb_level >= 0);
 
 		return (((zb->zb_blkid <<
 		    (zb->zb_level *
 		    (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
 		    dnp->dn_datablkszsec) << SPA_MINBLOCKSHIFT);
 	}
 
 	ASSERT(zb->zb_level < 0);
 
 	if (zb->zb_object != 0)
 		return (zb->zb_blkid * BP_GET_LSIZE(bp));
 
 	return (zb->zb_blkid);
 }
 
 /*
  * Append a description of the ZSTD compression header of the block that bp
  * points to (compressed size, header version and level) to the string
  * already in blkbuf.  buflen is the total size of blkbuf.
  *
  * Nothing is appended when bp is not ZSTD-compressed, is a hole, or (for a
  * non-embedded block) when the read fails; a read failure is reported on
  * stderr.  Embedded block pointers are decoded in memory; all others are
  * read from the pool.
  */
 static void
 snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen,
     const blkptr_t *bp)
 {
 	/* Scratch ABD, allocated on first use and kept for later calls. */
 	static abd_t *pabd = NULL;
 	void *buf;
 	zio_t *zio;
 	zfs_zstdhdr_t zstd_hdr;
 	int error;
 
 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_ZSTD)
 		return;
 
 	if (BP_IS_HOLE(bp))
 		return;
 
 	if (BP_IS_EMBEDDED(bp)) {
 		/* The payload lives in the bp itself; decode it to a buffer. */
 		buf = malloc(SPA_MAXBLOCKSIZE);
 		if (buf == NULL) {
 			(void) fprintf(stderr, "out of memory\n");
 			zdb_exit(1);
 		}
 		decode_embedded_bp_compressed(bp, buf);
 		memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
 		free(buf);
 		/* Header fields are run through BE_32() before use. */
 		zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
 		zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf),
 		    " ZSTD:size=%u:version=%u:level=%u:EMBEDDED",
 		    zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr),
 		    zfs_get_hdrlevel(&zstd_hdr));
 		return;
 	}
 
 	if (!pabd)
 		pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
 	zio = zio_root(spa, NULL, NULL, 0);
 
 	/* Decrypt but don't decompress so we can read the compression header */
 	zio_nowait(zio_read(zio, spa, bp, pabd, BP_GET_PSIZE(bp), NULL, NULL,
 	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW_COMPRESS,
 	    NULL));
 	error = zio_wait(zio);
 	if (error) {
 		(void) fprintf(stderr, "read failed: %d\n", error);
 		return;
 	}
 	/* Only the leading sizeof (zstd_hdr) bytes of the copy are used. */
 	buf = abd_borrow_buf_copy(pabd, BP_GET_LSIZE(bp));
 	memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
 	zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
 	zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);
 
 	(void) snprintf(blkbuf + strlen(blkbuf),
 	    buflen - strlen(blkbuf),
 	    " ZSTD:size=%u:version=%u:level=%u:NORMAL",
 	    zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr),
 	    zfs_get_hdrlevel(&zstd_hdr));
 
 	abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp));
 }
 
 /*
  * Format a compact, single-line description of bp into blkbuf, never
  * writing more than buflen bytes.
  *
  *   - dump_opt['b'] >= 6: the full snprintf_blkptr() form, with " FREE"
  *     appended when bp_freed is set.
  *   - embedded bp: embedded type, logical/physical size and logical birth.
  *   - otherwise: the DVAs (all of them when dump_opt['d'] > 5, else only the
  *     first) followed by sizes and birth txgs; non-hole blocks additionally
  *     get the fill count, an optional " FREE" marker and the checksum.
  */
 static void
 snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
     boolean_t bp_freed)
 {
 	const dva_t *dva = bp->blk_dva;
 	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
 	int i;
 
 	if (dump_opt['b'] >= 6) {
 		snprintf_blkptr(blkbuf, buflen, bp);
 		if (bp_freed) {
 			(void) snprintf(blkbuf + strlen(blkbuf),
 			    buflen - strlen(blkbuf), " %s", "FREE");
 		}
 		return;
 	}
 
 	if (BP_IS_EMBEDDED(bp)) {
 		/* Bounded by buflen like every other branch of this function. */
 		(void) snprintf(blkbuf, buflen,
 		    "EMBEDDED et=%u %llxL/%llxP B=%llu",
 		    (uint_t)BPE_GET_ETYPE(bp),
 		    (u_longlong_t)BPE_GET_LSIZE(bp),
 		    (u_longlong_t)BPE_GET_PSIZE(bp),
 		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));
 		return;
 	}
 
 	/* Start from an empty string; the pieces below are appended to it. */
 	blkbuf[0] = '\0';
 
 	for (i = 0; i < ndvas; i++)
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf), "%llu:%llx:%llx ",
 		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
 		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
 		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
 
 	if (BP_IS_HOLE(bp)) {
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf),
 		    "%llxL B=%llu",
 		    (u_longlong_t)BP_GET_LSIZE(bp),
 		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));
 	} else {
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf),
 		    "%llxL/%llxP F=%llu B=%llu/%llu",
 		    (u_longlong_t)BP_GET_LSIZE(bp),
 		    (u_longlong_t)BP_GET_PSIZE(bp),
 		    (u_longlong_t)BP_GET_FILL(bp),
 		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp),
 		    (u_longlong_t)BP_GET_BIRTH(bp));
 		if (bp_freed)
 			(void) snprintf(blkbuf + strlen(blkbuf),
 			    buflen - strlen(blkbuf), " %s", "FREE");
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf),
 		    " cksum=%016llx:%016llx:%016llx:%016llx",
 		    (u_longlong_t)bp->blk_cksum.zc_word[0],
 		    (u_longlong_t)bp->blk_cksum.zc_word[1],
 		    (u_longlong_t)bp->blk_cksum.zc_word[2],
 		    (u_longlong_t)bp->blk_cksum.zc_word[3]);
 	}
 }
 
 /*
  * Print one line describing block pointer bp of the object described by
  * dnp: its offset, a level marker placed in a column that depends on the
  * level, and the compact block pointer text.  With dump_opt['Z'] set, the
  * ZSTD header summary is appended for ZSTD-compressed blocks.
  */
 static void
 print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb,
     const dnode_phys_t *dnp)
 {
 	char blkbuf[BP_SPRINTF_LEN];
 
 	if (!BP_IS_EMBEDDED(bp)) {
 		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
 		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
 	}
 
 	(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
 
 	ASSERT(zb->zb_level >= 0);
 
 	/*
 	 * One column per level from the top of the tree down to -1; only the
 	 * column matching this block's level carries the "L<n>" marker.
 	 */
 	for (int lvl = dnp->dn_nlevels - 1; lvl >= -1; lvl--) {
 		if (lvl != zb->zb_level) {
 			(void) printf(" ");
 			continue;
 		}
 		(void) printf("L%llx", (u_longlong_t)zb->zb_level);
 	}
 
 	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE);
 	if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD) {
 		snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp);
 	}
 	(void) printf("%s\n", blkbuf);
 }
 
 /*
  * Print bp and, if it is a non-hole indirect block, read it and recurse
  * into each block pointer it contains.  Block pointers with a logical birth
  * of zero are skipped silently.
  *
  * Returns 0 on success, or the first error from arc_read() or from a
  * recursive visit; the walk of the current block stops at that error.
  */
 static int
 visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
     blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	if (BP_GET_LOGICAL_BIRTH(bp) == 0)
 		return (0);
 
 	print_indirect(spa, bp, zb, dnp);
 
 	/* Level-0 blocks and holes have nothing below them. */
 	if (BP_GET_LEVEL(bp) == 0 || BP_IS_HOLE(bp))
 		return (0);
 
 	arc_flags_t aflags = ARC_FLAG_WAIT;
 	int nptrs = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 	uint64_t child_fill = 0;
 	arc_buf_t *abuf;
 	int err;
 
 	ASSERT(!BP_IS_REDACTED(bp));
 
 	err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 	if (err)
 		return (err);
 	ASSERT(abuf->b_data);
 
 	/* recursively visit blocks below this */
 	blkptr_t *children = abuf->b_data;
 	for (int idx = 0; idx < nptrs; idx++) {
 		blkptr_t *cbp = &children[idx];
 		zbookmark_phys_t czb;
 
 		SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 		    zb->zb_level - 1,
 		    zb->zb_blkid * nptrs + idx);
 		err = visit_indirect(spa, dnp, cbp, &czb);
 		if (err)
 			break;
 		child_fill += BP_GET_FILL(cbp);
 	}
 
 	/* The children's fill counts must add up to the parent's. */
 	if (!err)
 		ASSERT3U(child_fill, ==, BP_GET_FILL(bp));
 	arc_buf_destroy(abuf, &abuf);
 
 	return (err);
 }
 
 /*
  * Print the block tree of dnode dn by visiting each of the block pointers
  * stored in the dnode itself, starting at the dnode's top level.  Errors
  * from visit_indirect() are ignored.
  */
 static void
 dump_indirect(dnode_t *dn)
 {
 	dnode_phys_t *dnp = dn->dn_phys;
 
 	(void) printf("Indirect blocks:\n");
 
 	for (int slot = 0; slot < dnp->dn_nblkptr; slot++) {
 		zbookmark_phys_t top;
 
 		/* Top-level bookmark: highest level, block id == slot. */
 		SET_BOOKMARK(&top, dmu_objset_id(dn->dn_objset),
 		    dn->dn_object, dnp->dn_nlevels - 1, slot);
 		(void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
 		    &dnp->dn_blkptr[slot], &top);
 	}
 
 	(void) printf("\n");
 }
 
 /*
  * Print the fields of a dsl_dir_phys_t passed in data (size bytes long).
  * os and object are unused.  A NULL data pointer prints nothing.
  */
 static void
 dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os;
 	(void) object;
 	dsl_dir_phys_t *dd = data;
 	char numbuf[32];
 	time_t created;
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (numbuf) >= NN_NUMBUF_SZ, "numbuf truncated");
 
 	if (dd == NULL)
 		return;
 
 	ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));
 
 	created = dd->dd_creation_time;
 	(void) printf("\t\tcreation_time = %s", ctime(&created));
 	(void) printf("\t\thead_dataset_obj = %llu\n",
 	    (u_longlong_t)dd->dd_head_dataset_obj);
 	(void) printf("\t\tparent_dir_obj = %llu\n",
 	    (u_longlong_t)dd->dd_parent_obj);
 	(void) printf("\t\torigin_obj = %llu\n",
 	    (u_longlong_t)dd->dd_origin_obj);
 	(void) printf("\t\tchild_dir_zapobj = %llu\n",
 	    (u_longlong_t)dd->dd_child_dir_zapobj);
 
 	/* Space figures share one scratch buffer, formatted right before use. */
 	zdb_nicenum(dd->dd_used_bytes, numbuf, sizeof (numbuf));
 	(void) printf("\t\tused_bytes = %s\n", numbuf);
 	zdb_nicenum(dd->dd_compressed_bytes, numbuf, sizeof (numbuf));
 	(void) printf("\t\tcompressed_bytes = %s\n", numbuf);
 	zdb_nicenum(dd->dd_uncompressed_bytes, numbuf, sizeof (numbuf));
 	(void) printf("\t\tuncompressed_bytes = %s\n", numbuf);
 	zdb_nicenum(dd->dd_quota, numbuf, sizeof (numbuf));
 	(void) printf("\t\tquota = %s\n", numbuf);
 	zdb_nicenum(dd->dd_reserved, numbuf, sizeof (numbuf));
 	(void) printf("\t\treserved = %s\n", numbuf);
 
 	(void) printf("\t\tprops_zapobj = %llu\n",
 	    (u_longlong_t)dd->dd_props_zapobj);
 	(void) printf("\t\tdeleg_zapobj = %llu\n",
 	    (u_longlong_t)dd->dd_deleg_zapobj);
 	(void) printf("\t\tflags = %llx\n",
 	    (u_longlong_t)dd->dd_flags);
 
 	/* One line per dd_used_breakdown[] slot, labelled with the slot name. */
 #define	PRINT_BREAKDOWN(which) \
 	zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], numbuf, \
 	    sizeof (numbuf)); \
 	(void) printf("\t\tused_breakdown[" #which "] = %s\n", numbuf)
 	PRINT_BREAKDOWN(HEAD);
 	PRINT_BREAKDOWN(SNAP);
 	PRINT_BREAKDOWN(CHILD);
 	PRINT_BREAKDOWN(CHILD_RSRV);
 	PRINT_BREAKDOWN(REFRSRV);
 #undef PRINT_BREAKDOWN
 
 	(void) printf("\t\tclones = %llu\n",
 	    (u_longlong_t)dd->dd_clones);
 }
 
 /*
  * Print the fields of a dsl_dataset_phys_t passed in data, including its
  * block pointer.  os and object are unused.  A NULL data pointer prints
  * nothing.
  */
 static void
 dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os;
 	(void) object;
 	dsl_dataset_phys_t *ds = data;
 	char nice[32];
 	char blkbuf[BP_SPRINTF_LEN];
 	time_t created;
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (nice) >= NN_NUMBUF_SZ, "nice truncated");
 
 	if (ds == NULL)
 		return;
 
 	ASSERT(size == sizeof (*ds));
 	created = ds->ds_creation_time;
 
 	(void) printf("\t\tdir_obj = %llu\n",
 	    (u_longlong_t)ds->ds_dir_obj);
 	(void) printf("\t\tprev_snap_obj = %llu\n",
 	    (u_longlong_t)ds->ds_prev_snap_obj);
 	(void) printf("\t\tprev_snap_txg = %llu\n",
 	    (u_longlong_t)ds->ds_prev_snap_txg);
 	(void) printf("\t\tnext_snap_obj = %llu\n",
 	    (u_longlong_t)ds->ds_next_snap_obj);
 	(void) printf("\t\tsnapnames_zapobj = %llu\n",
 	    (u_longlong_t)ds->ds_snapnames_zapobj);
 	(void) printf("\t\tnum_children = %llu\n",
 	    (u_longlong_t)ds->ds_num_children);
 	(void) printf("\t\tuserrefs_obj = %llu\n",
 	    (u_longlong_t)ds->ds_userrefs_obj);
 	(void) printf("\t\tcreation_time = %s", ctime(&created));
 	(void) printf("\t\tcreation_txg = %llu\n",
 	    (u_longlong_t)ds->ds_creation_txg);
 	(void) printf("\t\tdeadlist_obj = %llu\n",
 	    (u_longlong_t)ds->ds_deadlist_obj);
 
 	/* Space figures share one scratch buffer, formatted right before use. */
 	zdb_nicenum(ds->ds_referenced_bytes, nice, sizeof (nice));
 	(void) printf("\t\tused_bytes = %s\n", nice);
 	zdb_nicenum(ds->ds_compressed_bytes, nice, sizeof (nice));
 	(void) printf("\t\tcompressed_bytes = %s\n", nice);
 	zdb_nicenum(ds->ds_uncompressed_bytes, nice, sizeof (nice));
 	(void) printf("\t\tuncompressed_bytes = %s\n", nice);
 	zdb_nicenum(ds->ds_unique_bytes, nice, sizeof (nice));
 	(void) printf("\t\tunique = %s\n", nice);
 
 	(void) printf("\t\tfsid_guid = %llu\n",
 	    (u_longlong_t)ds->ds_fsid_guid);
 	(void) printf("\t\tguid = %llu\n",
 	    (u_longlong_t)ds->ds_guid);
 	(void) printf("\t\tflags = %llx\n",
 	    (u_longlong_t)ds->ds_flags);
 	(void) printf("\t\tnext_clones_obj = %llu\n",
 	    (u_longlong_t)ds->ds_next_clones_obj);
 	(void) printf("\t\tprops_obj = %llu\n",
 	    (u_longlong_t)ds->ds_props_obj);
 
 	snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);
 	(void) printf("\t\tbp = %s\n", blkbuf);
 }
 
 /*
  * bptree_iterate() callback used by dump_bptree(): print the full text form
  * of bp unless its logical birth is zero.  Always returns 0.
  */
 static int
 dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	(void) arg;
 	(void) tx;
 	char blkbuf[BP_SPRINTF_LEN];
 
 	if (BP_GET_LOGICAL_BIRTH(bp) == 0)
 		return (0);
 
 	snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 	(void) printf("\t%s\n", blkbuf);
 	return (0);
 }
 
 /*
  * Print a summary of the bptree stored in object obj of os, labelled with
  * name.  Requires dump_opt['d'] >= 3; at >= 5 every block pointer in the
  * tree is printed as well.
  */
 static void
 dump_bptree(objset_t *os, uint64_t obj, const char *name)
 {
 	char bytes[32];
 	dmu_buf_t *bonus;
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
 
 	if (dump_opt['d'] < 3)
 		return;
 
 	/* The summary numbers come from the object's bonus buffer. */
 	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &bonus));
 	bptree_phys_t *bt = bonus->db_data;
 	zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes));
 	(void) printf("\n    %s: %llu datasets, %s\n",
 	    name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes);
 	dmu_buf_rele(bonus, FTAG);
 
 	if (dump_opt['d'] >= 5) {
 		(void) printf("\n");
 
 		(void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL,
 		    NULL);
 	}
 }
 
 /*
  * bpobj iteration callback used by dump_full_bpobj(): print bp in compact
  * form, marked as freed when bp_freed is set.  Always returns 0.
  */
 static int
 dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
 {
 	(void) arg;
 	(void) tx;
 	char line[BP_SPRINTF_LEN];
 
 	ASSERT(BP_GET_LOGICAL_BIRTH(bp) != 0);
 
 	snprintf_blkptr_compact(line, sizeof (line), bp, bp_freed);
 	(void) printf("\t%s\n", line);
 
 	return (0);
 }
 
 /*
  * Print a summary of bpobj bpo, labelled with name and right-aligned in a
  * field of indent * 8 columns.  When the bpobj has sub-objects, each one is
  * opened and dumped recursively with a deeper indent.  Requires
  * dump_opt['d'] >= 3; at >= 5 the top-level call (indent == 0) also lists
  * every block pointer.
  */
 static void
 dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
 {
 	char bytes[32];
 	char comp[32];
 	char uncomp[32];
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
 	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
 	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");
 
 	if (dump_opt['d'] < 3)
 		return;
 
 	boolean_t has_subobjs =
 	    (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0);
 
 	zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes));
 
 	if (!has_subobjs) {
 		/* Flat bpobj: only local block pointers to report. */
 		if (bpo->bpo_havefreed) {
 			(void) printf("    %*s: object %llu, %llu blkptrs, "
 			    "%llu freed, %s\n",
 			    indent * 8, name,
 			    (u_longlong_t)bpo->bpo_object,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
 			    bytes);
 		} else {
 			(void) printf("    %*s: object %llu, %llu blkptrs, "
 			    "%s\n",
 			    indent * 8, name,
 			    (u_longlong_t)bpo->bpo_object,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
 			    bytes);
 		}
 	} else {
 		zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp));
 		zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp));
 		if (bpo->bpo_havefreed) {
 			(void) printf("    %*s: object %llu, %llu local "
 			    "blkptrs, %llu freed, %llu subobjs in object %llu, "
 			    "%s (%s/%s comp)\n",
 			    indent * 8, name,
 			    (u_longlong_t)bpo->bpo_object,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
 			    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
 			    bytes, comp, uncomp);
 		} else {
 			(void) printf("    %*s: object %llu, %llu local "
 			    "blkptrs, %llu subobjs in object %llu, "
 			    "%s (%s/%s comp)\n",
 			    indent * 8, name,
 			    (u_longlong_t)bpo->bpo_object,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
 			    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
 			    bytes, comp, uncomp);
 		}
 
 		/* The sub-object ids are stored as an array of uint64_t. */
 		for (uint64_t n = 0; n < bpo->bpo_phys->bpo_num_subobjs; n++) {
 			uint64_t subobj;
 			bpobj_t subbpo;
 			int error;
 
 			VERIFY0(dmu_read(bpo->bpo_os,
 			    bpo->bpo_phys->bpo_subobjs,
 			    n * sizeof (subobj), sizeof (subobj), &subobj, 0));
 			error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
 			if (error == 0) {
 				dump_full_bpobj(&subbpo, "subobj", indent + 1);
 				bpobj_close(&subbpo);
 			} else {
 				(void) printf("ERROR %u while trying to open "
 				    "subobj id %llu\n",
 				    error, (u_longlong_t)subobj);
 			}
 		}
 	}
 
 	/* Only the outermost bpobj lists its individual block pointers. */
 	if (dump_opt['d'] >= 5 && indent == 0) {
 		(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
 		(void) printf("\n");
 	}
 }
 
 /*
  * Print the bookmark called name ("dataset#bookmark").
  *
  * The basic properties are always printed.  With print_redact set and a
  * redaction object present, the redaction progress, snapshot list and entry
  * count follow; with print_list also set, every redaction list entry is
  * printed.  print_list implies print_redact.
  *
  * Returns 0 on success, or the error from dsl_bookmark_lookup() or
  * dmu_read().
  */
 static int
 dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact,
     boolean_t print_list)
 {
 	zfs_bookmark_phys_t prop;
 	objset_t *mos = dp->dp_spa->spa_meta_objset;
 	int err = dsl_bookmark_lookup(dp, name, NULL, &prop);
 
 	if (err != 0)
 		return (err);
 
 	/* Only the part after '#' is shown as the bookmark's name. */
 	(void) printf("\t#%s: ", strchr(name, '#') + 1);
 	(void) printf("{guid: %llx creation_txg: %llu creation_time: "
 	    "%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid,
 	    (u_longlong_t)prop.zbm_creation_txg,
 	    (u_longlong_t)prop.zbm_creation_time,
 	    (u_longlong_t)prop.zbm_redaction_obj);
 
 	IMPLY(print_list, print_redact);
 	if (!print_redact || prop.zbm_redaction_obj == 0)
 		return (0);
 
 	redaction_list_t *rl;
 	VERIFY0(dsl_redaction_list_hold_obj(dp,
 	    prop.zbm_redaction_obj, FTAG, &rl));
 
 	redaction_list_phys_t *rlp = rl->rl_phys;
 	(void) printf("\tRedacted:\n\t\tProgress: ");
 	if (rlp->rlp_last_object == UINT64_MAX &&
 	    rlp->rlp_last_blkid == UINT64_MAX) {
 		(void) printf("complete\n");
 	} else {
 		(void) printf("%llu %llu (incomplete)\n",
 		    (u_longlong_t)rlp->rlp_last_object,
 		    (u_longlong_t)rlp->rlp_last_blkid);
 	}
 
 	(void) printf("\t\tSnapshots: [");
 	for (unsigned int snap = 0; snap < rlp->rlp_num_snaps; snap++) {
 		if (snap > 0)
 			(void) printf(", ");
 		(void) printf("%0llu",
 		    (u_longlong_t)rlp->rlp_snaps[snap]);
 	}
 	(void) printf("]\n\t\tLength: %llu\n",
 	    (u_longlong_t)rlp->rlp_num_entries);
 
 	if (!print_list) {
 		dsl_redaction_list_rele(rl, FTAG);
 		return (0);
 	}
 
 	if (rlp->rlp_num_entries == 0) {
 		dsl_redaction_list_rele(rl, FTAG);
 		(void) printf("\t\tRedaction List: []\n\n");
 		return (0);
 	}
 
 	/* Read the whole redaction object into memory in one go. */
 	dmu_object_info_t doi;
 	VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi));
 
 	uint64_t size = doi.doi_max_offset;
 	redact_block_phys_t *entries = kmem_alloc(size, KM_SLEEP);
 
 	err = dmu_read(mos, prop.zbm_redaction_obj, 0, size,
 	    entries, 0);
 	if (err != 0) {
 		dsl_redaction_list_rele(rl, FTAG);
 		kmem_free(entries, size);
 		return (err);
 	}
 
 	/*
 	 * The first entry opens the list; the remaining ones are separated
 	 * by ",\n".  At least one entry exists at this point.
 	 */
 	for (size_t i = 0; i < rlp->rlp_num_entries; i++) {
 		if (i == 0) {
 			(void) printf("\t\tRedaction List: [{object: %llx, "
 			    "offset: %llx, blksz: %x, count: %llx}",
 			    (u_longlong_t)entries[i].rbp_object,
 			    (u_longlong_t)entries[i].rbp_blkid,
 			    (uint_t)(redact_block_get_size(&entries[i])),
 			    (u_longlong_t)redact_block_get_count(&entries[i]));
 		} else {
 			(void) printf(",\n\t\t{object: %llx, offset: %llx, "
 			    "blksz: %x, count: %llx}",
 			    (u_longlong_t)entries[i].rbp_object,
 			    (u_longlong_t)entries[i].rbp_blkid,
 			    (uint_t)(redact_block_get_size(&entries[i])),
 			    (u_longlong_t)redact_block_get_count(&entries[i]));
 		}
 	}
 	dsl_redaction_list_rele(rl, FTAG);
 	kmem_free(entries, size);
 	(void) printf("]\n\n");
 	return (0);
 }
 
 /*
  * Print every bookmark of the dataset behind os.  Nothing is printed below
  * verbosity 4; redaction details are requested from dump_bookmark() at
  * verbosity >= 5 and the full redaction list at >= 6.
  */
 static void
 dump_bookmarks(objset_t *os, int verbosity)
 {
 	zap_cursor_t cursor;
 	zap_attribute_t *attr;
 	dsl_dataset_t *ds = dmu_objset_ds(os);
 	dsl_pool_t *dp = spa_get_dsl(os->os_spa);
 	objset_t *mos = os->os_spa->spa_meta_objset;
 
 	if (verbosity < 4)
 		return;
 
 	attr = zap_attribute_alloc();
 	dsl_pool_config_enter(dp, FTAG);
 
 	/* Each ZAP entry name is a bookmark's short name. */
 	zap_cursor_init(&cursor, mos, ds->ds_bookmarks_obj);
 	while (zap_cursor_retrieve(&cursor, attr) == 0) {
 		char osname[ZFS_MAX_DATASET_NAME_LEN];
 		char fullname[ZFS_MAX_DATASET_NAME_LEN];
 		int len;
 
 		dmu_objset_name(os, osname);
 		len = snprintf(fullname, sizeof (fullname), "%s#%s", osname,
 		    attr->za_name);
 		VERIFY3S(len, <, ZFS_MAX_DATASET_NAME_LEN);
 		(void) dump_bookmark(dp, fullname, verbosity >= 5,
 		    verbosity >= 6);
 
 		zap_cursor_advance(&cursor);
 	}
 	zap_cursor_fini(&cursor);
 
 	dsl_pool_config_exit(dp, FTAG);
 	zap_attribute_free(attr);
 }
 
 /*
  * Report the MOS objects that make up bpobj bpo to mos_obj_refd(): the
  * bpobj itself, its sub-object array if it has one, and, recursively,
  * every sub-bpobj that can be opened.
  */
 static void
 bpobj_count_refd(bpobj_t *bpo)
 {
 	mos_obj_refd(bpo->bpo_object);
 
 	/* Without a sub-object array there is nothing further to report. */
 	if (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
 		return;
 
 	mos_obj_refd(bpo->bpo_phys->bpo_subobjs);
 
 	for (uint64_t n = 0; n < bpo->bpo_phys->bpo_num_subobjs; n++) {
 		uint64_t subobj;
 		bpobj_t subbpo;
 		int error;
 
 		VERIFY0(dmu_read(bpo->bpo_os,
 		    bpo->bpo_phys->bpo_subobjs,
 		    n * sizeof (subobj), sizeof (subobj), &subobj, 0));
 		error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
 		if (error == 0) {
 			bpobj_count_refd(&subbpo);
 			bpobj_close(&subbpo);
 		} else {
 			(void) printf("ERROR %u while trying to open "
 			    "subobj id %llu\n",
 			    error, (u_longlong_t)subobj);
 		}
 	}
 }
 
 /*
  * dsl_deadlist_iterate() callback used by dump_blkptr_list(): count the
  * objects of the entry's bpobj unless it is the pool's shared empty bpobj.
  * arg is the spa_t.  Always returns 0.
  */
 static int
 dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle)
 {
 	spa_t *spa = arg;
 
 	if (dle->dle_bpobj.bpo_object == spa->spa_dsl_pool->dp_empty_bpobj)
 		return (0);
 
 	bpobj_count_refd(&dle->dle_bpobj);
 	return (0);
 }
 
 /*
  * dsl_deadlist_iterate() callback used by dump_blkptr_list(): print one
  * deadlist entry as "mintxg N -> obj M".  With dump_opt['d'] >= 5 that text
  * becomes the label of a full bpobj dump of the entry instead.  arg must be
  * NULL.  Always returns 0.
  */
 static int
 dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle)
 {
 	ASSERT(arg == NULL);
 	if (dump_opt['d'] >= 5) {
 		char buf[128];
 		/* %llu takes an unsigned argument, as elsewhere in this file. */
 		(void) snprintf(buf, sizeof (buf),
 		    "mintxg %llu -> obj %llu",
 		    (u_longlong_t)dle->dle_mintxg,
 		    (u_longlong_t)dle->dle_bpobj.bpo_object);
 
 		dump_full_bpobj(&dle->dle_bpobj, buf, 0);
 	} else {
 		(void) printf("mintxg %llu -> obj %llu\n",
 		    (u_longlong_t)dle->dle_mintxg,
 		    (u_longlong_t)dle->dle_bpobj.bpo_object);
 	}
 	return (0);
 }
 
 /*
  * Account for and print deadlist dl, labelled with name.
  *
  * The MOS objects backing the list are always reported as referenced,
  * whatever the verbosity.  Printing starts at dump_opt['d'] >= 3: an
  * old-format list is handed to dump_full_bpobj(); otherwise a summary line
  * is printed and, at >= 4, each entry is dumped.
  */
 static void
 dump_blkptr_list(dsl_deadlist_t *dl, const char *name)
 {
 	char bytes[32];
 	char comp[32];
 	char uncomp[32];
 	char entries[32];
 	spa_t *spa = dmu_objset_spa(dl->dl_os);
 	uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
 	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
 	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");
 	_Static_assert(sizeof (entries) >= NN_NUMBUF_SZ, "entries truncated");
 
 	/* Reference accounting; the shared empty bpobj is not counted here. */
 	if (!dl->dl_oldfmt) {
 		mos_obj_refd(dl->dl_object);
 		dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa);
 	} else if (dl->dl_bpobj.bpo_object != empty_bpobj) {
 		bpobj_count_refd(&dl->dl_bpobj);
 	}
 
 	if (dump_opt['d'] < 3)
 		return;
 
 	if (dl->dl_oldfmt) {
 		dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
 		return;
 	}
 
 	zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
 	zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
 	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
 	zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries));
 	(void) printf("\n    %s: %s (%s/%s comp), %s entries\n",
 	    name, bytes, comp, uncomp, entries);
 
 	if (dump_opt['d'] >= 4) {
 		(void) putchar('\n');
 
 		dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL);
 	}
 }
 
 /*
  * Cross-check the livelist of the dataset behind os against the space
  * written since its origin, as reported by dsl_dataset_space_written().
  * Returns 0 when the livelist is not open or the numbers agree, and 1
  * after printing both sets of numbers when they do not.
  */
 static int
 verify_dd_livelist(objset_t *os)
 {
 	uint64_t list_used, list_comp, list_uncomp;
 	uint64_t ds_used, ds_comp, ds_uncomp;
 	dsl_pool_t *dp = spa_get_dsl(os->os_spa);
 	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
 	dsl_dataset_t *origin_ds;
 
 	ASSERT(!dmu_objset_is_snapshot(os));
 	if (!dsl_deadlist_is_open(&dd->dd_livelist))
 		return (0);
 
 	/* Iterate through the livelist to check for duplicates */
 	dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight,
 	    NULL);
 
 	dsl_pool_config_enter(dp, FTAG);
 	dsl_deadlist_space(&dd->dd_livelist, &list_used,
 	    &list_comp, &list_uncomp);
 
 	ASSERT(dsl_pool_config_held(dp));
 	VERIFY0(dsl_dataset_hold_obj(dp,
 	    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds));
 	VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset,
 	    &ds_used, &ds_comp, &ds_uncomp));
 	dsl_dataset_rele(origin_ds, FTAG);
 	dsl_pool_config_exit(dp, FTAG);
 
 	/*
 	 *  The dataset's uncompressed space is allowed to exceed the
 	 *  livelist's, because livelists do not track embedded block
 	 *  pointers; used and compressed space must match exactly.
 	 */
 	if (ds_used == list_used && ds_comp == list_comp &&
 	    ds_uncomp >= list_uncomp)
 		return (0);
 
 	char nice_used[32], nice_comp[32], nice_uncomp[32];
 
 	(void) printf("Discrepancy in space accounting:\n");
 
 	zdb_nicenum(ds_used, nice_used, sizeof (nice_used));
 	zdb_nicenum(ds_comp, nice_comp, sizeof (nice_comp));
 	zdb_nicenum(ds_uncomp, nice_uncomp, sizeof (nice_uncomp));
 	(void) printf("dir: used %s, comp %s, uncomp %s\n",
 	    nice_used, nice_comp, nice_uncomp);
 
 	zdb_nicenum(list_used, nice_used, sizeof (nice_used));
 	zdb_nicenum(list_comp, nice_comp, sizeof (nice_comp));
 	zdb_nicenum(list_uncomp, nice_uncomp, sizeof (nice_uncomp));
 	(void) printf("livelist: used %s, comp %s, uncomp %s\n",
 	    nice_used, nice_comp, nice_uncomp);
 
 	return (1);
 }
 
 /*
  * Key material read by zdb_derive_key(), which treats it either as a hex
  * string or as a passphrase depending on the dataset's keyformat property.
  */
 static char *key_material = NULL;
 
 /*
  * Derive a wrapping key for dd from the global key_material and store
  * WRAPPING_KEY_LEN bytes in key_out.
  *
  * The key format is looked up in dd's crypto object:
  *  - ZFS_KEYFORMAT_HEX: key_material is decoded two hex digits per byte.
  *  - ZFS_KEYFORMAT_PASSPHRASE: key_material is run through
  *    PKCS5_PBKDF2_HMAC_SHA1() with the salt and iteration count stored in
  *    the same crypto object.
  *
  * Returns B_TRUE on success and B_FALSE if the hex string is malformed or
  * the PBKDF2 call fails.  Any other key format is fatal.
  */
 static boolean_t
 zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out)
 {
 	uint64_t keyformat, salt, iters;
 	int i;
 	unsigned char c;
 
 	VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
 	    zfs_prop_to_name(ZFS_PROP_KEYFORMAT), sizeof (uint64_t),
 	    1, &keyformat));
 
 	switch (keyformat) {
 	case ZFS_KEYFORMAT_HEX:
 		for (i = 0; i < WRAPPING_KEY_LEN * 2; i += 2) {
 			/*
 			 * The <ctype.h> classifiers require a value
 			 * representable as unsigned char (or EOF), so cast
 			 * before calling them; a plain char with the high
 			 * bit set would otherwise be passed as a negative
 			 * int.  A too-short string is rejected here as well:
 			 * its terminating NUL is not a hex digit, and the
 			 * short-circuit stops us reading past it.
 			 */
 			if (!isxdigit((unsigned char)key_material[i]) ||
 			    !isxdigit((unsigned char)key_material[i + 1]))
 				return (B_FALSE);
 			if (sscanf(&key_material[i], "%02hhx", &c) != 1)
 				return (B_FALSE);
 			key_out[i / 2] = c;
 		}
 		break;
 
 	case ZFS_KEYFORMAT_PASSPHRASE:
 		VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset,
 		    dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT),
 		    sizeof (uint64_t), 1, &salt));
 		VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset,
 		    dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS),
 		    sizeof (uint64_t), 1, &iters));
 
 		/* The 8 raw bytes of the stored salt value are the salt. */
 		if (PKCS5_PBKDF2_HMAC_SHA1(key_material, strlen(key_material),
 		    ((uint8_t *)&salt), sizeof (uint64_t), iters,
 		    WRAPPING_KEY_LEN, key_out) != 1)
 			return (B_FALSE);
 
 		break;
 
 	default:
 		fatal("no support for key format %u\n",
 		    (unsigned int) keyformat);
 	}
 
 	return (B_TRUE);
 }
 
 /* Name of the encryption root, filled in by zdb_load_key(). */
 static char encroot[ZFS_MAX_DATASET_NAME_LEN];
 /* Set by zdb_load_key() once a key is loaded; cleared by zdb_unload_key(). */
 static boolean_t key_loaded = B_FALSE;
 
 /*
  * Load the wrapping key for os's encryption root into the SPA keystore.
  *
  * The encryption root's name is resolved into the global encroot, a key is
  * derived with zdb_derive_key(), and the result is handed to
  * spa_keystore_load_wkey().  Any failure is fatal; on success key_loaded is
  * set so that later holds can request decryption.
  */
 static void
 zdb_load_key(objset_t *os)
 {
 	dsl_pool_t *dp = spa_get_dsl(os->os_spa);
 	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
 	dsl_dir_t *rdd;
 	dsl_crypto_params_t *dcp;
 	nvlist_t *crypto_args;
 	uint64_t rddobj;
 	uint8_t key[WRAPPING_KEY_LEN];
 	int err;
 
 	dsl_pool_config_enter(dp, FTAG);
 
 	/* Find the encryption root's dsl_dir and remember its name. */
 	VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
 	    DSL_CRYPTO_KEY_ROOT_DDOBJ, sizeof (uint64_t), 1, &rddobj));
 	VERIFY0(dsl_dir_hold_obj(dd->dd_pool, rddobj, NULL, FTAG, &rdd));
 	dsl_dir_name(rdd, encroot);
 	dsl_dir_rele(rdd, FTAG);
 
 	if (zdb_derive_key(dd, key) == B_FALSE)
 		fatal("couldn't derive encryption key");
 
 	dsl_pool_config_exit(dp, FTAG);
 
 	ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_UNAVAILABLE);
 
 	/* Wrap the derived key in the nvlist form the keystore expects. */
 	crypto_args = fnvlist_alloc();
 	fnvlist_add_uint8_array(crypto_args, "wkeydata",
 	    (uint8_t *)key, WRAPPING_KEY_LEN);
 	VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE,
 	    NULL, crypto_args, &dcp));
 
 	err = spa_keystore_load_wkey(encroot, dcp, B_FALSE);
 
 	dsl_crypto_params_free(dcp, (err != 0));
 	fnvlist_free(crypto_args);
 
 	if (err != 0) {
 		const char *why;
 
 		if (err == ZFS_ERR_CRYPTO_NOTSUP)
 			why = "crypto params not supported";
 		else
 			why = strerror(err);
 		fatal("couldn't load encryption key for %s: %s",
 		    encroot, why);
 	}
 
 	ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_AVAILABLE);
 
 	printf("Unlocked encryption root: %s\n", encroot);
 	key_loaded = B_TRUE;
 }
 
 /*
  * Drop the wrapping key previously loaded by zdb_load_key(), if there is
  * one.  Does nothing when no key is loaded.
  */
 static void
 zdb_unload_key(void)
 {
 	if (key_loaded) {
 		VERIFY0(spa_keystore_unload_wkey(encroot));
 		key_loaded = B_FALSE;
 	}
 }
 
 /*
  * FUID index and domain trees.  dump_uidgid() creates and loads them on
  * first need; fuid_table_destroy() tears them down.
  */
 static avl_tree_t idx_tree;
 static avl_tree_t domain_tree;
 /* B_TRUE once dump_uidgid() has set up the two trees above. */
 static boolean_t fuid_table_loaded;
 /* Objset most recently opened by open_objset(); NULL when none is open. */
 static objset_t *sa_os = NULL;
 /* Attribute table handed back by sa_setup() in open_objset(). */
 static sa_attr_type_t *sa_attr_table = NULL;
 
 /*
  * Hold the dataset named by path on behalf of tag and return its objset in
  * *osp.
  *
  * If dump_opt['K'] is set, the dataset is first held temporarily so that
  * zdb_load_key() can load its encryption key; that hold is dropped again
  * before the real hold is taken.  The real hold requests decryption only
  * when a key was loaded.  For ZPL objsets that are readable (unencrypted,
  * or key loaded) the SA layer is set up as well.
  *
  * Returns 0 on success, leaving a dataset hold and a long hold that
  * close_objset() releases.  On failure an error message is printed, the
  * error is returned, and no hold is left behind.
  */
 static int
 open_objset(const char *path, const void *tag, objset_t **osp)
 {
 	int err;
 	uint64_t sa_attrs = 0;
 	uint64_t version = 0;
 
 	/* Only one objset may be open through this interface at a time. */
 	VERIFY3P(sa_os, ==, NULL);
 
 	/*
 	 * We can't own an objset if it's redacted.  Therefore, we do this
 	 * dance: hold the objset, then acquire a long hold on its dataset, then
 	 * release the pool (which is held as part of holding the objset).
 	 */
 
 	if (dump_opt['K']) {
 		/* decryption requested, try to load keys */
 		err = dmu_objset_hold(path, tag, osp);
 		if (err != 0) {
 			(void) fprintf(stderr, "failed to hold dataset "
 			    "'%s': %s\n",
 			    path, strerror(err));
 			return (err);
 		}
 		dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);
 		dsl_pool_rele(dmu_objset_pool(*osp), tag);
 
 		/* succeeds or dies */
 		zdb_load_key(*osp);
 
 		/* release it all */
 		dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);
 		dsl_dataset_rele(dmu_objset_ds(*osp), tag);
 	}
 
 	/* Ask for decryption only if zdb_load_key() loaded a key above. */
 	int ds_hold_flags = key_loaded ? DS_HOLD_FLAG_DECRYPT : 0;
 
 	err = dmu_objset_hold_flags(path, ds_hold_flags, tag, osp);
 	if (err != 0) {
 		(void) fprintf(stderr, "failed to hold dataset '%s': %s\n",
 		    path, strerror(err));
 		return (err);
 	}
 	dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);
 	dsl_pool_rele(dmu_objset_pool(*osp), tag);
 
 	if (dmu_objset_type(*osp) == DMU_OST_ZFS &&
 	    (key_loaded || !(*osp)->os_encrypted)) {
 		/*
 		 * Lookup failures are ignored on purpose: version and
 		 * sa_attrs then keep their zero defaults.
 		 */
 		(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR,
 		    8, 1, &version);
 		if (version >= ZPL_VERSION_SA) {
 			(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
 			    8, 1, &sa_attrs);
 		}
 		err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END,
 		    &sa_attr_table);
 		if (err != 0) {
 			/* Undo both holds so the caller has nothing to close. */
 			(void) fprintf(stderr, "sa_setup failed: %s\n",
 			    strerror(err));
 			dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);
 			dsl_dataset_rele_flags(dmu_objset_ds(*osp),
 			    ds_hold_flags, tag);
 			*osp = NULL;
 		}
 	}
 	/* NULL here if sa_setup() failed above. */
 	sa_os = *osp;
 
 	return (err);
 }
 
 /*
  * Release an objset obtained from open_objset(): tear down its SA state if
  * any, drop the long hold and the dataset hold taken under tag, reset the
  * SA globals, and unload any encryption key that was loaded.
  */
 static void
 close_objset(objset_t *os, const void *tag)
 {
 	int hold_flags;
 
 	VERIFY3P(os, ==, sa_os);
 
 	if (os->os_sa != NULL)
 		sa_tear_down(os);
 
 	dsl_dataset_long_rele(dmu_objset_ds(os), tag);
 
 	/* The hold was taken with the decrypt flag only if a key was loaded. */
 	hold_flags = 0;
 	if (key_loaded)
 		hold_flags = DS_HOLD_FLAG_DECRYPT;
 	dsl_dataset_rele_flags(dmu_objset_ds(os), hold_flags, tag);
 
 	sa_attr_table = NULL;
 	sa_os = NULL;
 
 	zdb_unload_key();
 }
 
 /* Destroy the FUID trees if dump_uidgid() has loaded them. */
 static void
 fuid_table_destroy(void)
 {
 	if (!fuid_table_loaded)
 		return;
 
 	zfs_fuid_table_destroy(&idx_tree, &domain_tree);
 	fuid_table_loaded = B_FALSE;
 }
 
 /*
  * Empty every DDT's in-core ddt_tree before shutdown.
  *
  * ddt_lookup() leaves entries on ddt_tree.  A live pool clears them in
  * ddt_sync(), which zdb cannot (and would not want to) run, and leftover
  * entries trip asserts in ddt_table_free().  So the entries are removed
  * by hand here.
  *
  * Removing them one at a time is not efficient, but ddt_remove() is the
  * only public routine that does the job and it needs the proper locks to
  * be held.  This runs only while zdb is shutting down, so speed is not a
  * concern.
  */
 static void
 zdb_ddt_cleanup(spa_t *spa)
 {
 	for (enum zio_checksum cksum = 0; cksum < ZIO_CHECKSUM_FUNCTIONS;
 	    cksum++) {
 		ddt_t *ddt = spa->spa_ddt[cksum];
 		ddt_entry_t *entry, *following;
 
 		if (ddt == NULL)
 			continue;
 
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		ddt_enter(ddt);
 
 		/* Fetch the successor first; ddt_remove() unlinks entry. */
 		for (entry = avl_first(&ddt->ddt_tree); entry != NULL;
 		    entry = following) {
 			following = AVL_NEXT(&ddt->ddt_tree, entry);
 			entry->dde_io = NULL;
 			ddt_remove(ddt, entry);
 		}
 
 		ddt_exit(ddt);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 	}
 }
 
 /*
  * Common exit path: clean up DDT state, close the open objset (or, failing
  * that, the pool), free the FUID tables, shut down the kernel emulation if
  * it was initialised, and exit with the given status.
  */
 static void
 zdb_exit(int reason)
 {
 	if (spa != NULL)
 		zdb_ddt_cleanup(spa);
 
 	if (os == NULL) {
 		if (spa != NULL)
 			spa_close(spa, FTAG);
 	} else {
 		close_objset(os, FTAG);
 	}
 
 	fuid_table_destroy();
 
 	if (kernel_init_done)
 		kernel_fini();
 
 	exit(reason);
 }
 
 /*
  * print uid or gid information.
  * For normal POSIX id just the id is printed in decimal format.
  * For CIFS files with FUID the fuid is printed in hex followed by
  * the domain-rid string.
  */
 static void
 print_idstr(uint64_t id, const char *id_type)
 {
 	const char *domain;
 
 	/* No FUID index: print the id as a plain decimal number. */
 	if (!FUID_INDEX(id)) {
 		(void) printf("\t%s     %llu\n", id_type, (u_longlong_t)id);
 		return;
 	}
 
 	/* FUID: print the raw value in hex followed by domain and rid. */
 	domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
 	(void) printf("\t%s     %llx [%s-%d]\n", id_type,
 	    (u_longlong_t)id, domain, (int)FUID_RID(id));
 }
 
 /*
  * Print the uid and gid lines for an object.  If either id carries a FUID
  * index and the FUID tables are not loaded yet, load them from os first.
  */
 static void
 dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
 {
 	uint32_t uidx = FUID_INDEX(uid);
 	uint32_t gidx = FUID_INDEX(gid);
 
 	if ((uidx || gidx) && !fuid_table_loaded) {
 		uint64_t table_obj;
 
 		/* The FUID table object is recorded in the master node. */
 		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
 		    8, 1, &table_obj) == 0);
 		zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
 		(void) zfs_fuid_table_load(os, table_obj,
 		    &idx_tree, &domain_tree);
 		fuid_table_loaded = B_TRUE;
 	}
 
 	print_idstr(uid, "uid");
 	print_idstr(gid, "gid");
 }
 
 /*
  * Print the extended attributes stored in a znode's ZPL_DXATTR system
  * attribute.
  *
  * The attribute is a packed nvlist.  It is read, unpacked, and each pair is
  * printed as "name = value".  A value is printed verbatim only when
  * dump_opt['P'] is unset and every byte is printable; otherwise every byte
  * is printed as a three-digit octal escape.  Returns silently if the
  * attribute is absent, empty, or cannot be read or unpacked.
  */
 static void
 dump_znode_sa_xattr(sa_handle_t *hdl)
 {
 	nvlist_t *sa_xattr;
 	nvpair_t *elem = NULL;
 	int sa_xattr_size = 0;
 	int sa_xattr_entries = 0;
 	int error;
 	char *sa_xattr_packed;
 
 	error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size);
 	if (error || sa_xattr_size == 0)
 		return;
 
 	sa_xattr_packed = malloc(sa_xattr_size);
 	if (sa_xattr_packed == NULL)
 		return;
 
 	error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR],
 	    sa_xattr_packed, sa_xattr_size);
 	if (error) {
 		free(sa_xattr_packed);
 		return;
 	}
 
 	error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0);
 	if (error) {
 		free(sa_xattr_packed);
 		return;
 	}
 
 	/* First pass: count the pairs for the summary line. */
 	while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL)
 		sa_xattr_entries++;
 
 	(void) printf("\tSA xattrs: %d bytes, %d entries\n\n",
 	    sa_xattr_size, sa_xattr_entries);
 
 	/*
 	 * Second pass: elem is NULL again after the loop above, so the
 	 * iteration restarts from the first pair.
 	 */
 	while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) {
 		boolean_t can_print = !dump_opt['P'];
 		uchar_t *value;
 		uint_t cnt, idx;
 
 		(void) printf("\t\t%s = ", nvpair_name(elem));
 
 		/*
 		 * A pair of any other type leaves value and cnt unset, so
 		 * do not fall through and read them.
 		 */
 		if (nvpair_value_byte_array(elem, &value, &cnt) != 0) {
 			(void) printf("<not a byte array>\n");
 			continue;
 		}
 
 		for (idx = 0; idx < cnt; ++idx) {
 			if (!isprint(value[idx])) {
 				can_print = B_FALSE;
 				break;
 			}
 		}
 
 		for (idx = 0; idx < cnt; ++idx) {
 			if (can_print)
 				(void) putchar(value[idx]);
 			else
 				(void) printf("\\%3.3o", value[idx]);
 		}
 		(void) putchar('\n');
 	}
 
 	nvlist_free(sa_xattr);
 	free(sa_xattr_packed);
 }
 
 /*
  * Print the target of a symlink znode from its ZPL_SYMLINK system
  * attribute.  Prints nothing if the attribute is missing or empty, and a
  * diagnostic if the target would not fit in a MAXPATHLEN buffer.
  */
 static void
 dump_znode_symlink(sa_handle_t *hdl)
 {
 	char linktarget[MAXPATHLEN];
 	int target_len = 0;
 
 	if (sa_size(hdl, sa_attr_table[ZPL_SYMLINK], &target_len) != 0 ||
 	    target_len == 0)
 		return;
 
 	if (target_len >= sizeof (linktarget)) {
 		(void) printf("symlink size %d is too large\n",
 		    target_len);
 		return;
 	}
 
 	/* Only target_len bytes are read below, so terminate by hand. */
 	linktarget[target_len] = '\0';
 
 	if (sa_lookup(hdl, sa_attr_table[ZPL_SYMLINK],
 	    linktarget, target_len) != 0)
 		return;
 
 	(void) printf("\ttarget	%s\n", linktarget);
 }
 
 /*
  * Object viewer for znodes: print the system attributes of object in os.
  *
  * The data and size arguments are unused; everything is read through a
  * private SA handle.  os must be the objset opened by open_objset().
  * Prints a message and returns if the handle cannot be obtained, and
  * returns silently if the bulk attribute lookup fails.
  */
 static void
 dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) data, (void) size;
 	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
 	sa_handle_t *hdl;
 	uint64_t xattr, rdev, gen;
 	uint64_t uid, gid, mode, fsize, parent, links;
 	uint64_t pflags;
 	uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
 	time_t z_crtime, z_atime, z_mtime, z_ctime;
 	sa_bulk_attr_t bulk[12];	/* one slot per attribute added below */
 	int idx = 0;
 	int error;
 
 	VERIFY3P(os, ==, sa_os);
 	if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
 		(void) printf("Failed to get handle for SA znode\n");
 		return;
 	}
 
 	/* Queue up every attribute so they are fetched in one lookup. */
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
 	    &links, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
 	    &mode, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
 	    NULL, &parent, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
 	    &fsize, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
 	    acctm, 16);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
 	    modtm, 16);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
 	    crtm, 16);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
 	    chgtm, 16);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
 	    &pflags, 8);
 
 	if (sa_bulk_lookup(hdl, bulk, idx)) {
 		(void) sa_handle_destroy(hdl);
 		return;
 	}
 
 	/* Only the first word of each timestamp pair is printed. */
 	z_crtime = (time_t)crtm[0];
 	z_atime = (time_t)acctm[0];
 	z_mtime = (time_t)modtm[0];
 	z_ctime = (time_t)chgtm[0];
 
 	if (dump_opt['d'] > 4) {
 		error = zfs_obj_to_path(os, object, path, sizeof (path));
 		if (error == ESTALE) {
 			(void) snprintf(path, sizeof (path), "on delete queue");
 		} else if (error != 0) {
 			/* Counted here; reported later by dump_objset(). */
 			leaked_objects++;
 			(void) snprintf(path, sizeof (path),
 			    "path not found, possibly leaked");
 		}
 		(void) printf("\tpath	%s\n", path);
 	}
 
 	if (S_ISLNK(mode))
 		dump_znode_symlink(hdl);
 	dump_uidgid(os, uid, gid);
 	(void) printf("\tatime	%s", ctime(&z_atime));
 	(void) printf("\tmtime	%s", ctime(&z_mtime));
 	(void) printf("\tctime	%s", ctime(&z_ctime));
 	(void) printf("\tcrtime	%s", ctime(&z_crtime));
 	(void) printf("\tgen	%llu\n", (u_longlong_t)gen);
 	(void) printf("\tmode	%llo\n", (u_longlong_t)mode);
 	(void) printf("\tsize	%llu\n", (u_longlong_t)fsize);
 	(void) printf("\tparent	%llu\n", (u_longlong_t)parent);
 	(void) printf("\tlinks	%llu\n", (u_longlong_t)links);
 	(void) printf("\tpflags	%llx\n", (u_longlong_t)pflags);
 	if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) {
 		uint64_t projid;
 
 		if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid,
 		    sizeof (uint64_t)) == 0)
 			(void) printf("\tprojid	%llu\n", (u_longlong_t)projid);
 	}
 	/* The remaining attributes are optional; print those that exist. */
 	if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
 	    sizeof (uint64_t)) == 0)
 		(void) printf("\txattr	%llu\n", (u_longlong_t)xattr);
 	if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
 	    sizeof (uint64_t)) == 0)
 		(void) printf("\trdev	0x%016llx\n", (u_longlong_t)rdev);
 	dump_znode_sa_xattr(hdl);
 	sa_handle_destroy(hdl);
 }
 
 /* Object viewer for ACL objects: intentionally prints nothing. */
 static void
 dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os;
 	(void) object;
 	(void) data;
 	(void) size;
 }
 
 /* Object viewer for DMU objset objects: intentionally prints nothing. */
 static void
 dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os;
 	(void) object;
 	(void) data;
 	(void) size;
 }
 
 /*
  * Per-object-type dump routines.  dump_object() indexes this table with
  * ZDB_OT_TYPE(type) for both the bonus type and the object type.  The
  * table has DMU_OT_NUMTYPES + 1 slots; the extra, final slot holds
  * dump_unknown.
  */
 static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
 	dump_none,		/* unallocated			*/
 	dump_zap,		/* object directory		*/
 	dump_uint64,		/* object array			*/
 	dump_none,		/* packed nvlist		*/
 	dump_packed_nvlist,	/* packed nvlist size		*/
 	dump_none,		/* bpobj			*/
 	dump_bpobj,		/* bpobj header			*/
 	dump_none,		/* SPA space map header		*/
 	dump_none,		/* SPA space map		*/
 	dump_none,		/* ZIL intent log		*/
 	dump_dnode,		/* DMU dnode			*/
 	dump_dmu_objset,	/* DMU objset			*/
 	dump_dsl_dir,		/* DSL directory		*/
 	dump_zap,		/* DSL directory child map	*/
 	dump_zap,		/* DSL dataset snap map		*/
 	dump_zap,		/* DSL props			*/
 	dump_dsl_dataset,	/* DSL dataset			*/
 	dump_znode,		/* ZFS znode			*/
 	dump_acl,		/* ZFS V0 ACL			*/
 	dump_uint8,		/* ZFS plain file		*/
 	dump_zpldir,		/* ZFS directory		*/
 	dump_zap,		/* ZFS master node		*/
 	dump_zap,		/* ZFS delete queue		*/
 	dump_uint8,		/* zvol object			*/
 	dump_zap,		/* zvol prop			*/
 	dump_uint8,		/* other uint8[]		*/
 	dump_uint64,		/* other uint64[]		*/
 	dump_zap,		/* other ZAP			*/
 	dump_zap,		/* persistent error log		*/
 	dump_uint8,		/* SPA history			*/
 	dump_history_offsets,	/* SPA history offsets		*/
 	dump_zap,		/* Pool properties		*/
 	dump_zap,		/* DSL permissions		*/
 	dump_acl,		/* ZFS ACL			*/
 	dump_uint8,		/* ZFS SYSACL			*/
 	dump_none,		/* FUID nvlist			*/
 	dump_packed_nvlist,	/* FUID nvlist size		*/
 	dump_zap,		/* DSL dataset next clones	*/
 	dump_zap,		/* DSL scrub queue		*/
 	dump_zap,		/* ZFS user/group/project used	*/
 	dump_zap,		/* ZFS user/group/project quota	*/
 	dump_zap,		/* snapshot refcount tags	*/
 	dump_ddt_zap,		/* DDT ZAP object		*/
 	dump_zap,		/* DDT statistics		*/
 	dump_znode,		/* SA object			*/
 	dump_zap,		/* SA Master Node		*/
 	dump_sa_attrs,		/* SA attribute registration	*/
 	dump_sa_layouts,	/* SA attribute layouts		*/
 	dump_zap,		/* DSL scrub translations	*/
 	dump_none,		/* fake dedup BP		*/
 	dump_zap,		/* deadlist			*/
 	dump_none,		/* deadlist hdr			*/
 	dump_zap,		/* dsl clones			*/
 	dump_bpobj_subobjs,	/* bpobj subobjs		*/
 	dump_unknown,		/* Unknown type, must be last	*/
 };
 
 /*
  * Decide whether an object of type obj_type is selected by the ZOR_FLAG_*
  * bits in flags.  Directories, plain files and space maps each have their
  * own flag, as does any type whose zdb name is "zap".  Every other type
  * matches only when flags amounts to "all types" once the supported flag
  * bits are OR-ed in.
  */
 static boolean_t
 match_object_type(dmu_object_type_t obj_type, uint64_t flags)
 {
 	if (obj_type == DMU_OT_DIRECTORY_CONTENTS)
 		return ((flags & ZOR_FLAG_DIRECTORY) ? B_TRUE : B_FALSE);
 
 	if (obj_type == DMU_OT_PLAIN_FILE_CONTENTS)
 		return ((flags & ZOR_FLAG_PLAIN_FILE) ? B_TRUE : B_FALSE);
 
 	if (obj_type == DMU_OT_SPACE_MAP)
 		return ((flags & ZOR_FLAG_SPACE_MAP) ? B_TRUE : B_FALSE);
 
 	if (strcmp(zdb_ot_name(obj_type), "zap") == 0)
 		return ((flags & ZOR_FLAG_ZAP) ? B_TRUE : B_FALSE);
 
 	/*
 	 * When every bit outside the supported flags is set, the user
 	 * combined the all-types flag (A) with negated flags to exclude
 	 * some types (e.g. A-f shows everything except plain files).
 	 */
 	if ((flags | ZOR_SUPPORTED_FLAGS) == ZOR_FLAG_ALL_TYPES)
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * Print one object of os.
  *
  * os               objset containing the object
  * object           object number; 0 selects the meta dnode
  * verbosity        detail level: >= 4 adds dnode flags and the per-type
  *                  viewers, >= 5 adds the spill block, indirect blocks
  *                  and the segment list
  * print_header     in/out: when set, the column header is printed first
  *                  and the flag cleared; set again after verbose output
  * dnode_slots_used optional out: dnode size in DNODE_MIN_SIZE units
  * flags            ZOR_FLAG_* type filter; 0 or ZOR_FLAG_ALL_TYPES
  *                  disables filtering
  *
  * Failure to look up or hold the object is fatal.
  */
 static void
 dump_object(objset_t *os, uint64_t object, int verbosity,
     boolean_t *print_header, uint64_t *dnode_slots_used, uint64_t flags)
 {
 	dmu_buf_t *db = NULL;
 	dmu_object_info_t doi;
 	dnode_t *dn;
 	boolean_t dnode_held = B_FALSE;
 	void *bonus = NULL;
 	size_t bsize = 0;
 	char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32];
 	char bonus_size[32];
 	char aux[50];
 	int error;
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (iblk) >= NN_NUMBUF_SZ, "iblk truncated");
 	_Static_assert(sizeof (dblk) >= NN_NUMBUF_SZ, "dblk truncated");
 	_Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ, "lsize truncated");
 	_Static_assert(sizeof (asize) >= NN_NUMBUF_SZ, "asize truncated");
 	_Static_assert(sizeof (bonus_size) >= NN_NUMBUF_SZ,
 	    "bonus_size truncated");
 
 	if (*print_header) {
 		(void) printf("\n%10s  %3s  %5s  %5s  %5s  %6s  %5s  %6s  %s\n",
 		    "Object", "lvl", "iblk", "dblk", "dsize", "dnsize",
 		    "lsize", "%full", "type");
 		*print_header = B_FALSE;
 	}
 
 	if (object == 0) {
 		/* The meta dnode needs no hold of its own here. */
 		dn = DMU_META_DNODE(os);
 		dmu_object_info_from_dnode(dn, &doi);
 	} else {
 		/*
 		 * Encrypted datasets will have sensitive bonus buffers
 		 * encrypted. Therefore we cannot hold the bonus buffer and
 		 * must hold the dnode itself instead.
 		 */
 		error = dmu_object_info(os, object, &doi);
 		if (error)
 			fatal("dmu_object_info() failed, errno %u", error);
 
 		if (!key_loaded && os->os_encrypted &&
 		    DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) {
 			error = dnode_hold(os, object, FTAG, &dn);
 			if (error)
 				fatal("dnode_hold() failed, errno %u", error);
 			dnode_held = B_TRUE;
 		} else {
 			error = dmu_bonus_hold(os, object, FTAG, &db);
 			if (error)
 				fatal("dmu_bonus_hold(%llu) failed, errno %u",
 				    (u_longlong_t)object, error);
 			bonus = db->db_data;
 			bsize = db->db_size;
 			dn = DB_DNODE((dmu_buf_impl_t *)db);
 		}
 	}
 
 	/*
 	 * Default to showing all object types if no flags were specified.
 	 */
 	if (flags != 0 && flags != ZOR_FLAG_ALL_TYPES &&
 	    !match_object_type(doi.doi_type, flags))
 		goto out;
 
 	if (dnode_slots_used)
 		*dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE;
 
 	zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk));
 	zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk));
 	zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize));
 	zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize));
 	zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size));
 	zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize));
 	(void) snprintf(fill, sizeof (fill), "%6.2f", 100.0 *
 	    doi.doi_fill_count * doi.doi_data_block_size / (object == 0 ?
 	    DNODES_PER_BLOCK : 1) / doi.doi_max_offset);
 
 	/* aux collects the optional " (K=...)" and " (Z=...)" suffixes. */
 	aux[0] = '\0';
 
 	if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
 		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
 		    " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum));
 	}
 
 	if (doi.doi_compress == ZIO_COMPRESS_INHERIT &&
 	    ZIO_COMPRESS_HASLEVEL(os->os_compress) && verbosity >= 6) {
 		const char *compname = NULL;
 		if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION,
 		    ZIO_COMPRESS_RAW(os->os_compress, os->os_complevel),
 		    &compname) == 0) {
 			(void) snprintf(aux + strlen(aux),
 			    sizeof (aux) - strlen(aux), " (Z=inherit=%s)",
 			    compname);
 		} else {
 			(void) snprintf(aux + strlen(aux),
 			    sizeof (aux) - strlen(aux),
 			    " (Z=inherit=%s-unknown)",
 			    ZDB_COMPRESS_NAME(os->os_compress));
 		}
 	} else if (doi.doi_compress == ZIO_COMPRESS_INHERIT && verbosity >= 6) {
 		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
 		    " (Z=inherit=%s)", ZDB_COMPRESS_NAME(os->os_compress));
 	} else if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
 		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
 		    " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress));
 	}
 
 	/*
 	 * The object number is deliberately printed as signed so that the
 	 * special accounting objects show up as small negative numbers.
 	 */
 	(void) printf("%10lld  %3u  %5s  %5s  %5s  %6s  %5s  %6s  %s%s\n",
 	    (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
 	    asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux);
 
 	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
 		(void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %5s  %6s  %s\n",
 		    "", "", "", "", "", "", bonus_size, "bonus",
 		    zdb_ot_name(doi.doi_bonus_type));
 	}
 
 	if (verbosity >= 4) {
 		(void) printf("\tdnode flags: %s%s%s%s\n",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
 		    "USED_BYTES " : "",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
 		    "USERUSED_ACCOUNTED " : "",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ?
 		    "USEROBJUSED_ACCOUNTED " : "",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
 		    "SPILL_BLKPTR" : "");
 		(void) printf("\tdnode maxblkid: %llu\n",
 		    (u_longlong_t)dn->dn_phys->dn_maxblkid);
 
 		/* Bonus viewer: skipped when only the dnode could be held. */
 		if (!dnode_held) {
 			object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os,
 			    object, bonus, bsize);
 		} else {
 			(void) printf("\t\t(bonus encrypted)\n");
 		}
 
 		/* Object viewer: skipped when the contents are unreadable. */
 		if (key_loaded ||
 		    (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type))) {
 			object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object,
 			    NULL, 0);
 		} else {
 			(void) printf("\t\t(object encrypted)\n");
 		}
 
 		*print_header = B_TRUE;
 	}
 
 	if (verbosity >= 5) {
 		if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 			char blkbuf[BP_SPRINTF_LEN];
 			snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),
 			    DN_SPILL_BLKPTR(dn->dn_phys), B_FALSE);
 			(void) printf("\nSpill block: %s\n", blkbuf);
 		}
 		dump_indirect(dn);
 	}
 
 	if (verbosity >= 5) {
 		/*
 		 * Report the list of segments that comprise the object.
 		 */
 		uint64_t start = 0;
 		uint64_t end;
 		uint64_t blkfill = 1;
 		int minlvl = 1;
 
 		if (dn->dn_type == DMU_OT_DNODE) {
 			minlvl = 0;
 			blkfill = DNODES_PER_BLOCK;
 		}
 
 		for (;;) {
 			char segsize[32];
 			/* make sure nicenum has enough space */
 			_Static_assert(sizeof (segsize) >= NN_NUMBUF_SZ,
 			    "segsize truncated");
 			/* Find the next data offset, then the hole after it. */
 			error = dnode_next_offset(dn,
 			    0, &start, minlvl, blkfill, 0);
 			if (error)
 				break;
 			end = start;
 			error = dnode_next_offset(dn,
 			    DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
 			zdb_nicenum(end - start, segsize, sizeof (segsize));
 			(void) printf("\t\tsegment [%016llx, %016llx)"
 			    " size %5s\n", (u_longlong_t)start,
 			    (u_longlong_t)end, segsize);
 			if (error)
 				break;
 			start = end;
 		}
 	}
 
 out:
 	/* Release whichever hold was taken above (neither for object 0). */
 	if (db != NULL)
 		dmu_buf_rele(db, FTAG);
 	if (dnode_held)
 		dnode_rele(dn, FTAG);
 }
 
 /*
  * Mark the MOS objects belonging to a dsl_dir as referenced: the dir
  * object itself, its child-dir, delegation and props ZAPs, its clones
  * object, and its crypto object.
  */
 static void
 count_dir_mos_objects(dsl_dir_t *dd)
 {
 	const uint64_t dir_objs[] = {
 		dd->dd_object,
 		dsl_dir_phys(dd)->dd_child_dir_zapobj,
 		dsl_dir_phys(dd)->dd_deleg_zapobj,
 		dsl_dir_phys(dd)->dd_props_zapobj,
 		dsl_dir_phys(dd)->dd_clones,
 	};
 
 	for (size_t n = 0; n < sizeof (dir_objs) / sizeof (dir_objs[0]); n++)
 		mos_obj_refd(dir_objs[n]);
 
 	/*
 	 * Several dsl_dirs may share one dd_crypto_obj, so references
 	 * after the first are ignored.
 	 */
 	mos_obj_refd_multiple(dd->dd_crypto_obj);
 }
 
 /*
  * Mark the MOS objects belonging to a dataset as referenced.  For a
  * dataset that is not a snapshot, the objects of its dsl_dir are marked
  * as well.
  */
 static void
 count_ds_mos_objects(dsl_dataset_t *ds)
 {
 	const uint64_t ds_objs[] = {
 		ds->ds_object,
 		dsl_dataset_phys(ds)->ds_next_clones_obj,
 		dsl_dataset_phys(ds)->ds_props_obj,
 		dsl_dataset_phys(ds)->ds_userrefs_obj,
 		dsl_dataset_phys(ds)->ds_snapnames_zapobj,
 		ds->ds_bookmarks_obj,
 	};
 
 	for (size_t n = 0; n < sizeof (ds_objs) / sizeof (ds_objs[0]); n++)
 		mos_obj_refd(ds_objs[n]);
 
 	if (dsl_dataset_is_snapshot(ds))
 		return;
 
 	count_dir_mos_objects(ds->ds_dir);
 }
 
 /* Display names for objset types, indexed by dds_type in dump_objset(). */
 static const char *const objset_types[DMU_OST_NUMTYPES] = {
 	"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
 
 /*
  * Parse a string denoting a range of object IDs of the form
  * <start>[:<end>[:flags]], and store the results in zor.
  *
  * A string without a colon is a single object ID and sets both ends of
  * the range to it; zor_flags is left untouched in that case.  With a
  * colon, <end> is required and <start> may not exceed it.  Without a
  * flags field zor_flags is set to ZOR_FLAG_ALL_TYPES; otherwise each flag
  * character is looked up in flagbits[], and a leading '-' clears the bit
  * instead of setting it.  Object IDs are passed through
  * ZDB_MAP_OBJECT_ID() in every successful case.
  *
  * Return 0 on success. On error, return 1 and update the msg
  * pointer to point to a descriptive error message.
  */
 static int
 parse_object_range(char *range, zopt_object_range_t *zor, const char **msg)
 {
 	uint64_t flags = 0;
 	char *p, *s, *dup, *flagstr, *tmp = NULL;
 	size_t len;
 	int i;
 	int rc = 0;
 
 	if (strchr(range, ':') == NULL) {
 		zor->zor_obj_start = strtoull(range, &p, 0);
 		if (*p != '\0') {
 			*msg = "Invalid characters in object ID";
 			rc = 1;
 		}
 		zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start);
 		zor->zor_obj_end = zor->zor_obj_start;
 		return (rc);
 	}
 
 	if (strchr(range, ':') == range) {
 		*msg = "Invalid leading colon";
 		rc = 1;
 		return (rc);
 	}
 
 	len = strlen(range);
 	if (range[len - 1] == ':') {
 		*msg = "Invalid trailing colon";
 		rc = 1;
 		return (rc);
 	}
 
 	/* strtok_r() modifies its input, so tokenize a private copy. */
 	dup = strdup(range);
 	if (dup == NULL) {
 		*msg = "Out of memory";
 		rc = 1;
 		return (rc);
 	}
 
 	/*
 	 * The checks above guarantee at least two non-empty fields, so
 	 * neither of the first two strtok_r() calls can return NULL.
 	 */
 	s = strtok_r(dup, ":", &tmp);
 	zor->zor_obj_start = strtoull(s, &p, 0);
 
 	if (*p != '\0') {
 		*msg = "Invalid characters in start object ID";
 		rc = 1;
 		goto out;
 	}
 
 	s = strtok_r(NULL, ":", &tmp);
 	zor->zor_obj_end = strtoull(s, &p, 0);
 
 	if (*p != '\0') {
 		*msg = "Invalid characters in end object ID";
 		rc = 1;
 		goto out;
 	}
 
 	if (zor->zor_obj_start > zor->zor_obj_end) {
 		*msg = "Start object ID may not exceed end object ID";
 		rc = 1;
 		goto out;
 	}
 
 	s = strtok_r(NULL, ":", &tmp);
 	if (s == NULL) {
 		/* No flags field: select everything, but still map the IDs. */
 		zor->zor_flags = ZOR_FLAG_ALL_TYPES;
 		goto map_ids;
 	} else if (strtok_r(NULL, ":", &tmp) != NULL) {
 		*msg = "Invalid colon-delimited field after flags";
 		rc = 1;
 		goto out;
 	}
 
 	flagstr = s;
 	for (i = 0; flagstr[i]; i++) {
 		int bit;
 		boolean_t negation = (flagstr[i] == '-');
 
 		if (negation) {
 			/* The character after '-' names the flag to clear. */
 			i++;
 			if (flagstr[i] == '\0') {
 				*msg = "Invalid trailing negation operator";
 				rc = 1;
 				goto out;
 			}
 		}
 		bit = flagbits[(uchar_t)flagstr[i]];
 		if (bit == 0) {
 			*msg = "Invalid flag";
 			rc = 1;
 			goto out;
 		}
 		if (negation)
 			flags &= ~bit;
 		else
 			flags |= bit;
 	}
 	zor->zor_flags = flags;
 
 map_ids:
 	zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start);
 	zor->zor_obj_end = ZDB_MAP_OBJECT_ID(zor->zor_obj_end);
 
 out:
 	free(dup);
 	return (rc);
 }
 
 /*
  * Print a summary line for an objset followed by its contents.
  *
  * If object ranges were given (zopt_object_args > 0), only those objects
  * are dumped and the function returns.  Otherwise the intent log,
  * deadlists, livelist and bookmarks are dumped as applicable, and at
  * verbosity >= 2 every object is dumped followed by dnode slot statistics.
  * A livelist that fails verification is fatal, and an unexpected error
  * from dmu_object_next() aborts.
  */
 static void
 dump_objset(objset_t *os)
 {
 	dmu_objset_stats_t dds = { 0 };
 	uint64_t object, object_count;
 	uint64_t refdbytes, usedobjs, scratch;
 	char numbuf[32];
 	char blkbuf[BP_SPRINTF_LEN + 20];
 	char osname[ZFS_MAX_DATASET_NAME_LEN];
 	const char *type = "UNKNOWN";
 	int verbosity = dump_opt['d'];
 	boolean_t print_header;
 	unsigned i;
 	int error;
 	uint64_t total_slots_used = 0;
 	uint64_t max_slot_used = 0;
 	uint64_t dnode_slots;
 	uint64_t obj_start;
 	uint64_t obj_end;
 	uint64_t flags;
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (numbuf) >= NN_NUMBUF_SZ, "numbuf truncated");
 
 	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 	dmu_objset_fast_stat(os, &dds);
 	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 
 	print_header = B_TRUE;
 
 	if (dds.dds_type < DMU_OST_NUMTYPES)
 		type = objset_types[dds.dds_type];
 
 	if (dds.dds_type == DMU_OST_META) {
 		/* The meta objset takes its figures from the MOS dir. */
 		dds.dds_creation_txg = TXG_INITIAL;
 		usedobjs = BP_GET_FILL(os->os_rootbp);
 		refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
 		    dd_used_bytes;
 	} else {
 		dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
 	}
 
 	ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));
 
 	zdb_nicenum(refdbytes, numbuf, sizeof (numbuf));
 
 	if (verbosity >= 4) {
 		(void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
 		(void) snprintf_blkptr(blkbuf + strlen(blkbuf),
 		    sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
 	} else {
 		blkbuf[0] = '\0';
 	}
 
 	dmu_objset_name(os, osname);
 
 	(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
 	    "%s, %llu objects%s%s\n",
 	    osname, type, (u_longlong_t)dmu_objset_id(os),
 	    (u_longlong_t)dds.dds_creation_txg,
 	    numbuf, (u_longlong_t)usedobjs, blkbuf,
 	    (dds.dds_inconsistent) ? " (inconsistent)" : "");
 
 	/* Dump only the objects in the requested ranges, if any. */
 	for (i = 0; i < zopt_object_args; i++) {
 		obj_start = zopt_object_ranges[i].zor_obj_start;
 		obj_end = zopt_object_ranges[i].zor_obj_end;
 		flags = zopt_object_ranges[i].zor_flags;
 
 		/*
 		 * Object 0 and single-object requests are dumped directly.
 		 * Otherwise step back one so that dmu_object_next() can
 		 * return obj_start itself.
 		 */
 		object = obj_start;
 		if (object == 0 || obj_start == obj_end)
 			dump_object(os, object, verbosity, &print_header, NULL,
 			    flags);
 		else
 			object--;
 
 		while ((dmu_object_next(os, &object, B_FALSE, 0) == 0) &&
 		    object <= obj_end) {
 			dump_object(os, object, verbosity, &print_header, NULL,
 			    flags);
 		}
 	}
 
 	if (zopt_object_args > 0) {
 		(void) printf("\n");
 		return;
 	}
 
 	if (dump_opt['i'] != 0 || verbosity >= 2)
 		dump_intent_log(dmu_objset_zil(os));
 
 	if (dmu_objset_ds(os) != NULL) {
 		dsl_dataset_t *ds = dmu_objset_ds(os);
 		dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
 		if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
 		    !dmu_objset_is_snapshot(os)) {
 			dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist");
 			if (verify_dd_livelist(os) != 0)
 				fatal("livelist is incorrect");
 		}
 
 		if (dsl_dataset_remap_deadlist_exists(ds)) {
 			(void) printf("ds_remap_deadlist:\n");
 			dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist");
 		}
 		count_ds_mos_objects(ds);
 	}
 
 	if (dmu_objset_ds(os) != NULL)
 		dump_bookmarks(os, verbosity);
 
 	if (verbosity < 2)
 		return;
 
 	if (BP_IS_HOLE(os->os_rootbp))
 		return;
 
 	/* Meta dnode first, then the accounting objects that exist. */
 	dump_object(os, 0, verbosity, &print_header, NULL, 0);
 	object_count = 0;
 	if (DMU_USERUSED_DNODE(os) != NULL &&
 	    DMU_USERUSED_DNODE(os)->dn_type != 0) {
 		dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header,
 		    NULL, 0);
 		dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header,
 		    NULL, 0);
 	}
 
 	if (DMU_PROJECTUSED_DNODE(os) != NULL &&
 	    DMU_PROJECTUSED_DNODE(os)->dn_type != 0)
 		dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity,
 		    &print_header, NULL, 0);
 
 	/* Walk every object, accumulating dnode slot usage as we go. */
 	object = 0;
 	while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
 		dump_object(os, object, verbosity, &print_header, &dnode_slots,
 		    0);
 		object_count++;
 		total_slots_used += dnode_slots;
 		max_slot_used = object + dnode_slots - 1;
 	}
 
 	(void) printf("\n");
 
 	(void) printf("    Dnode slots:\n");
 	(void) printf("\tTotal used:    %10llu\n",
 	    (u_longlong_t)total_slots_used);
 	(void) printf("\tMax used:      %10llu\n",
 	    (u_longlong_t)max_slot_used);
 	(void) printf("\tPercent empty: %10lf\n",
 	    (double)(max_slot_used - total_slots_used)*100 /
 	    (double)max_slot_used);
 	(void) printf("\n");
 
 	/* ESRCH is the only error accepted as the end of the walk. */
 	if (error != ESRCH) {
 		(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
 		abort();
 	}
 
 	ASSERT3U(object_count, ==, usedobjs);
 
 	/* leaked_objects is incremented by dump_znode(); reset once reported. */
 	if (leaked_objects != 0) {
 		(void) printf("%d potentially leaked objects detected\n",
 		    leaked_objects);
 		leaked_objects = 0;
 	}
 }
 
 /*
  * Print the fields of an uberblock, one per line.
  *
  * header and footer, when non-NULL, are printed verbatim before and after
  * the fields.  The MMP fields are printed only when MMP_VALID(ub) holds,
  * and the root block pointer is printed a second time as "rootbp" when
  * dump_opt['u'] >= 4.
  */
 static void
 dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
 {
 	time_t timestamp = ub->ub_timestamp;
 	char blkbuf[BP_SPRINTF_LEN];
 
 	(void) printf("%s", header ? header : "");
 	(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
 	(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
 	(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
 	(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
 	/* ctime() supplies the trailing newline for this line. */
 	(void) printf("\ttimestamp = %llu UTC = %s",
 	    (u_longlong_t)ub->ub_timestamp, ctime(&timestamp));
 
 	snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
 	(void) printf("\tbp = %s\n", blkbuf);
 
 	(void) printf("\tmmp_magic = %016llx\n",
 	    (u_longlong_t)ub->ub_mmp_magic);
 	if (MMP_VALID(ub)) {
 		(void) printf("\tmmp_delay = %0llu\n",
 		    (u_longlong_t)ub->ub_mmp_delay);
 		if (MMP_SEQ_VALID(ub))
 			(void) printf("\tmmp_seq = %u\n",
 			    (unsigned int) MMP_SEQ(ub));
 		if (MMP_FAIL_INT_VALID(ub))
 			(void) printf("\tmmp_fail = %u\n",
 			    (unsigned int) MMP_FAIL_INT(ub));
 		if (MMP_INTERVAL_VALID(ub))
 			(void) printf("\tmmp_write = %u\n",
 			    (unsigned int) MMP_INTERVAL(ub));
 		/* After MMP_* to make summarize_uberblock_mmp cleaner */
 		(void) printf("\tmmp_valid = %x\n",
 		    (unsigned int) ub->ub_mmp_config & 0xFF);
 	}
 
 	/*
 	 * blkbuf still holds the formatted root block pointer from above,
 	 * so reuse it rather than formatting into a second, shadowing
 	 * buffer.
 	 */
 	if (dump_opt['u'] >= 4)
 		(void) printf("\trootbp = %s\n", blkbuf);
 
 	(void) printf("\tcheckpoint_txg = %llu\n",
 	    (u_longlong_t)ub->ub_checkpoint_txg);
 
 	(void) printf("\traidz_reflow state=%u off=%llu\n",
 	    (unsigned int)RRSS_GET_STATE(ub),
 	    (u_longlong_t)RRSS_GET_OFFSET(ub));
 
 	(void) printf("%s", footer ? footer : "");
 }
 
 /*
  * Print the pool configuration stored in the MOS.
  *
  * The size of the packed configuration is read from the bonus buffer of
  * spa->spa_config_object and handed, together with the object itself, to
  * dump_packed_nvlist().  If the bonus buffer cannot be held, a diagnostic
  * is written to stderr and nothing is dumped.
  */
 static void
 dump_config(spa_t *spa)
 {
 	dmu_buf_t *db;
 	/*
 	 * The bonus buffer stores the size as a 64-bit value; keep it that
 	 * wide so the pointer handed to dump_packed_nvlist() refers to a full
 	 * 64-bit object even where size_t is narrower.
 	 */
 	uint64_t nvsize;
 	int error;
 
 	error = dmu_bonus_hold(spa->spa_meta_objset,
 	    spa->spa_config_object, FTAG, &db);
 	if (error != 0) {
 		/* The trailing newline keeps later output on its own line. */
 		(void) fprintf(stderr,
 		    "dmu_bonus_hold(%llu) failed, errno %d\n",
 		    (u_longlong_t)spa->spa_config_object, error);
 		return;
 	}
 
 	nvsize = *(uint64_t *)db->db_data;
 	dmu_buf_rele(db, FTAG);
 
 	(void) printf("\nMOS Configuration:\n");
 	dump_packed_nvlist(spa->spa_meta_objset,
 	    spa->spa_config_object, (void *)&nvsize, 1);
 }
 
 /*
  * Read the file named by cachefile, unpack it as an nvlist and print it.
  * Any failure (open, stat, allocation, read, unpack) prints a message and
  * terminates via zdb_exit(1).
  */
 static void
 dump_cachefile(const char *cachefile)
 {
 	struct stat64 statbuf;
 	nvlist_t *config;
 	char *packed;
 	int fd;
 
 	fd = open64(cachefile, O_RDONLY);
 	if (fd < 0) {
 		(void) printf("cannot open '%s': %s\n", cachefile,
 		    strerror(errno));
 		zdb_exit(1);
 	}
 
 	if (fstat64(fd, &statbuf) != 0) {
 		(void) printf("failed to stat '%s': %s\n", cachefile,
 		    strerror(errno));
 		zdb_exit(1);
 	}
 
 	/* The whole file is read into a single buffer. */
 	packed = malloc(statbuf.st_size);
 	if (packed == NULL) {
 		(void) fprintf(stderr, "failed to allocate %llu bytes\n",
 		    (u_longlong_t)statbuf.st_size);
 		zdb_exit(1);
 	}
 
 	if (read(fd, packed, statbuf.st_size) != statbuf.st_size) {
 		(void) fprintf(stderr, "failed to read %llu bytes\n",
 		    (u_longlong_t)statbuf.st_size);
 		zdb_exit(1);
 	}
 
 	(void) close(fd);
 
 	if (nvlist_unpack(packed, statbuf.st_size, &config, 0) != 0) {
 		(void) fprintf(stderr, "failed to unpack nvlist\n");
 		zdb_exit(1);
 	}
 
 	/* The packed form is no longer needed once unpacking succeeded. */
 	free(packed);
 
 	dump_nvlist(config, 0);
 
 	nvlist_free(config);
 }
 
 /*
  * ZFS label nvlist stats
  *
  * Accumulator filled in by collect_nvlist_stats() and reported by
  * dump_nvlist_stats().
  */
 typedef struct zdb_nvl_stats {
 	/* number of nvlists visited, including nested ones */
 	int		zns_list_count;
 	/* number of elements seen in nvlist arrays named "children" */
 	int		zns_leaf_count;
 	/* largest XDR-encoded size among those elements */
 	size_t		zns_leaf_largest;
 	/* sum of the XDR-encoded sizes of those elements */
 	size_t		zns_leaf_total;
 	/* every string pair encountered */
 	nvlist_t	*zns_string;
 	/* every uint64 pair encountered */
 	nvlist_t	*zns_uint64;
 	/* every boolean pair encountered */
 	nvlist_t	*zns_boolean;
 } zdb_nvl_stats_t;
 
 /*
  * Recursively walk nvl and accumulate statistics in stats.
  *
  * String, uint64 and boolean pairs are copied into the matching nvlist in
  * stats; nested nvlists and nvlist arrays are descended into.  Each element
  * of an array named "children" is additionally counted as a leaf vdev and
  * its XDR-encoded size recorded.  Other pair types are reported on stdout
  * and skipped.
  */
 static void
 collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats)
 {
 	nvpair_t *pair;
 
 	stats->zns_list_count++;
 
 	for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL;
 	    pair = nvlist_next_nvpair(nvl, pair)) {
 		const char *name = nvpair_name(pair);
 
 		switch (nvpair_type(pair)) {
 		case DATA_TYPE_STRING:
 			fnvlist_add_string(stats->zns_string, name,
 			    fnvpair_value_string(pair));
 			break;
 		case DATA_TYPE_UINT64:
 			fnvlist_add_uint64(stats->zns_uint64, name,
 			    fnvpair_value_uint64(pair));
 			break;
 		case DATA_TYPE_BOOLEAN:
 			fnvlist_add_boolean(stats->zns_boolean, name);
 			break;
 		case DATA_TYPE_NVLIST: {
 			nvlist_t *child;
 
 			if (nvpair_value_nvlist(pair, &child) == 0)
 				collect_nvlist_stats(child, stats);
 			break;
 		}
 		case DATA_TYPE_NVLIST_ARRAY: {
 			nvlist_t **children;
 			uint_t nchildren;
 			int is_leaf_array;
 
 			if (nvpair_value_nvlist_array(pair, &children,
 			    &nchildren) != 0)
 				break;
 
 			/* The pair name is the same for every element. */
 			is_leaf_array = (strcmp(name, "children") == 0);
 
 			for (uint_t c = 0; c < nchildren; c++) {
 				size_t size;
 
 				collect_nvlist_stats(children[c], stats);
 
 				if (!is_leaf_array)
 					continue;
 
 				/* collect stats on leaf vdev */
 				(void) nvlist_size(children[c], &size,
 				    NV_ENCODE_XDR);
 				stats->zns_leaf_total += size;
 				if (size > stats->zns_leaf_largest)
 					stats->zns_leaf_largest = size;
 				stats->zns_leaf_count++;
 			}
 			break;
 		}
 		default:
 			(void) printf("skip type %d!\n",
 			    (int)nvpair_type(pair));
 			break;
 		}
 	}
 }
 
 /*
  * Print one per-type line of the label statistics: the number of pairs in
  * nvl and its XDR-encoded size less the empty-list overhead (noise), also
  * expressed as a percentage of total.  Returns the adjusted size.
  */
 static size_t
 dump_nvlist_stats_line(const char *title, nvlist_t *nvl, size_t noise,
     size_t total)
 {
 	size_t bytes;
 
 	VERIFY0(nvlist_size(nvl, &bytes, NV_ENCODE_XDR));
 	bytes -= noise;
 	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n", title,
 	    (int)fnvlist_num_pairs(nvl), (int)bytes, 100.0 * bytes / total);
 
 	return (bytes);
 }
 
 /*
  * Print a breakdown of how the space in a label's config nvlist is used.
  *
  *	nvl	the unpacked label configuration
  *	cap	capacity, in bytes, of the buffer the packed config lives in
  */
 static void
 dump_nvlist_stats(nvlist_t *nvl, size_t cap)
 {
 	zdb_nvl_stats_t stats = { 0 };
 	size_t total, noise, overhead;
 	size_t accounted = 0;
 
 	/* requires nvlist with non-unique names for stat collection */
 	VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0));
 	VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0));
 	VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0));
 
 	/* Size of a still-empty list; subtracted from each per-type size. */
 	VERIFY0(nvlist_size(stats.zns_boolean, &noise, NV_ENCODE_XDR));
 
 	(void) printf("\n\nZFS Label NVList Config Stats:\n");
 
 	VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR));
 	(void) printf("  %d bytes used, %d bytes free (using %4.1f%%)\n\n",
 	    (int)total, (int)(cap - total), 100.0 * total / cap);
 
 	collect_nvlist_stats(nvl, &stats);
 
 	accounted += dump_nvlist_stats_line("integers:", stats.zns_uint64,
 	    noise, total);
 	accounted += dump_nvlist_stats_line("strings:", stats.zns_string,
 	    noise, total);
 	accounted += dump_nvlist_stats_line("booleans:", stats.zns_boolean,
 	    noise, total);
 
 	/* treat remainder as nvlist overhead */
 	overhead = total - accounted;
 	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:",
 	    stats.zns_list_count, (int)overhead, 100.0 * overhead / total);
 
 	if (stats.zns_leaf_count > 0) {
 		size_t average = stats.zns_leaf_total / stats.zns_leaf_count;
 
 		(void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:",
 		    stats.zns_leaf_count, (int)average);
 		(void) printf("%24d bytes largest\n",
 		    (int)stats.zns_leaf_largest);
 
 		if (dump_opt['l'] >= 3 && average > 0)
 			(void) printf("  space for %d additional leaf vdevs\n",
 			    (int)((cap - total) / average));
 	}
 	(void) printf("\n");
 
 	nvlist_free(stats.zns_string);
 	nvlist_free(stats.zns_uint64);
 	nvlist_free(stats.zns_boolean);
 }
 
 /*
  * AVL tree node keyed by checksum (see cksum_record_compare()).  It records
  * which of the VDEV_LABELS labels produced content with this checksum.
  */
 typedef struct cksum_record {
 	zio_cksum_t cksum;		/* key */
 	boolean_t labels[VDEV_LABELS];	/* B_TRUE for each label seen */
 	avl_node_t link;		/* AVL linkage */
 } cksum_record_t;
 
 /*
  * AVL comparator for cksum_record_t: orders records by their checksum
  * words, compared one at a time starting from zc_word[0].
  */
 static int
 cksum_record_compare(const void *x1, const void *x2)
 {
 	const cksum_record_t *lhs = x1;
 	const cksum_record_t *rhs = x2;
 
 	for (size_t w = 0; w < ARRAY_SIZE(lhs->cksum.zc_word); w++) {
 		int cmp = TREE_CMP(lhs->cksum.zc_word[w],
 		    rhs->cksum.zc_word[w]);
 
 		/* The first differing word decides the order. */
 		if (cmp != 0)
 			return (cmp);
 	}
 
 	return (0);
 }
 
 /*
  * Allocate a zeroed record for *cksum with label l marked as seen.
  */
 static cksum_record_t *
 cksum_record_alloc(zio_cksum_t *cksum, int l)
 {
 	cksum_record_t *fresh = umem_zalloc(sizeof (*fresh), UMEM_NOFAIL);
 
 	fresh->labels[l] = B_TRUE;
 	fresh->cksum = *cksum;
 
 	return (fresh);
 }
 
 /*
  * Look up the record for *cksum in tree; returns whatever avl_find()
  * returns for a key with that checksum.
  */
 static cksum_record_t *
 cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum)
 {
 	avl_index_t unused_where;
 	/* Only the checksum takes part in the comparison. */
 	cksum_record_t key = { .cksum = *cksum };
 
 	return (avl_find(tree, &key, &unused_where));
 }
 
 /*
  * Note that label l carries content with checksum *cksum.  An existing
  * record in tree is updated; otherwise a new one is allocated and added.
  * Returns the record in either case.
  */
 static cksum_record_t *
 cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l)
 {
 	cksum_record_t *found = cksum_record_lookup(tree, cksum);
 
 	if (found != NULL) {
 		found->labels[l] = B_TRUE;
 		return (found);
 	}
 
 	found = cksum_record_alloc(cksum, l);
 	avl_add(tree, found);
 
 	return (found);
 }
 
 /*
  * Return the lowest label index marked in rec, or -1 if none is marked.
  */
 static int
 first_label(cksum_record_t *rec)
 {
 	int idx = 0;
 
 	while (idx < VDEV_LABELS && !rec->labels[idx])
 		idx++;
 
 	return (idx < VDEV_LABELS ? idx : -1);
 }
 
 /*
  * Print prefix followed by the index of every label marked in rec, each
  * followed by a space, and terminate the line.
  */
 static void
 print_label_numbers(const char *prefix, const cksum_record_t *rec)
 {
 	int n;
 
 	(void) printf("%s", prefix);
 	for (n = 0; n < VDEV_LABELS; n++) {
 		if (rec->labels[n] != B_TRUE)
 			continue;
 		(void) printf("%d ", n);
 	}
 	(void) printf("\n");
 }
 
 /*
  * Number of uberblock slots in the ring when each slot occupies
  * (1 << UBERBLOCK_SHIFT) bytes; used to size per-slot arrays.
  */
 #define	MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT)
 
 /*
  * Per-label working state used by dump_label().
  */
 typedef struct zdb_label {
 	vdev_label_t label;		/* raw label as read from the device */
 	uint64_t label_offset;		/* device offset the label was read at */
 	nvlist_t *config_nv;		/* unpacked config, NULL if unpack failed */
 	cksum_record_t *config;		/* config-tree record for this config */
 	/* per-slot uberblock-tree record; NULL if the slot failed to verify */
 	cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT];
 	boolean_t header_printed;	/* set by print_label_header() */
 	boolean_t read_failed;		/* the label could not be read */
 	boolean_t cksum_valid;		/* result of label_cksum_valid() */
 } zdb_label_t;
 
 /*
  * Print the "LABEL n" banner for a label, at most once per label and never
  * when -q was given.  A label whose checksum did not validate is flagged
  * in the banner.
  */
 static void
 print_label_header(zdb_label_t *label, int l)
 {
 	static const char rule[] = "------------------------------------\n";
 
 	if (dump_opt['q'] || label->header_printed == B_TRUE)
 		return;
 
 	(void) printf("%sLABEL %d %s\n%s", rule, l,
 	    label->cksum_valid ? "" : "(Bad label cksum)", rule);
 
 	label->header_printed = B_TRUE;
 }
 
 /*
  * Print the banner that introduces the L2ARC device header section.
  */
 static void
 print_l2arc_header(void)
 {
 	(void) printf("------------------------------------\n"
 	    "L2ARC device header\n"
 	    "------------------------------------\n");
 }
 
 /*
  * Print the banner that introduces the L2ARC log block section.
  */
 static void
 print_l2arc_log_blocks(void)
 {
 	(void) printf("------------------------------------\n"
 	    "L2ARC device log blocks\n"
 	    "------------------------------------\n");
 }
 
 /*
  * Print every entry of one L2ARC log block.
  *
  *	log_entries	number of entries of le[] to print
  *	le		array of log entries
  *	i		log block number, used only to label the output
  *
  * NOTE(review): log_entries is not checked here against the capacity of
  * le[]; callers are presumably expected to pass a sane count -- confirm.
  */
 static void
 dump_l2arc_log_entries(uint64_t log_entries,
     l2arc_log_ent_phys_t *le, uint64_t i)
 {
 	/*
 	 * The index has the same type as log_entries so that the loop
 	 * condition compares like with like and a large count cannot
 	 * overflow a signed index.
 	 */
 	for (uint64_t j = 0; j < log_entries; j++) {
 		l2arc_log_ent_phys_t *ent = &le[j];
 		dva_t dva = ent->le_dva;
 
 		(void) printf("lb[%4llu]\tle[%4llu]\tDVA asize: %llu, "
 		    "vdev: %llu, offset: %llu\n",
 		    (u_longlong_t)i, (u_longlong_t)(j + 1),
 		    (u_longlong_t)DVA_GET_ASIZE(&dva),
 		    (u_longlong_t)DVA_GET_VDEV(&dva),
 		    (u_longlong_t)DVA_GET_OFFSET(&dva));
 		(void) printf("|\t\t\t\tbirth: %llu\n",
 		    (u_longlong_t)ent->le_birth);
 		(void) printf("|\t\t\t\tlsize: %llu\n",
 		    (u_longlong_t)L2BLK_GET_LSIZE(ent->le_prop));
 		(void) printf("|\t\t\t\tpsize: %llu\n",
 		    (u_longlong_t)L2BLK_GET_PSIZE(ent->le_prop));
 		(void) printf("|\t\t\t\tcompr: %llu\n",
 		    (u_longlong_t)L2BLK_GET_COMPRESS(ent->le_prop));
 		(void) printf("|\t\t\t\tcomplevel: %llu\n",
 		    (u_longlong_t)ent->le_complevel);
 		(void) printf("|\t\t\t\ttype: %llu\n",
 		    (u_longlong_t)L2BLK_GET_TYPE(ent->le_prop));
 		(void) printf("|\t\t\t\tprotected: %llu\n",
 		    (u_longlong_t)L2BLK_GET_PROTECTED(ent->le_prop));
 		(void) printf("|\t\t\t\tprefetch: %llu\n",
 		    (u_longlong_t)L2BLK_GET_PREFETCH(ent->le_prop));
 		(void) printf("|\t\t\t\taddress: %llu\n",
 		    (u_longlong_t)ent->le_daddr);
 		(void) printf("|\t\t\t\tARC state: %llu\n",
 		    (u_longlong_t)L2BLK_GET_STATE(ent->le_prop));
 		(void) printf("|\n");
 	}
 	(void) printf("\n");
 }
 
 static void
 dump_l2arc_log_blkptr(const l2arc_log_blkptr_t *lbps)
 {
 	(void) printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps->lbp_daddr);
 	(void) printf("|\t\tpayload_asize: %llu\n",
 	    (u_longlong_t)lbps->lbp_payload_asize);
 	(void) printf("|\t\tpayload_start: %llu\n",
 	    (u_longlong_t)lbps->lbp_payload_start);
 	(void) printf("|\t\tlsize: %llu\n",
 	    (u_longlong_t)L2BLK_GET_LSIZE(lbps->lbp_prop));
 	(void) printf("|\t\tasize: %llu\n",
 	    (u_longlong_t)L2BLK_GET_PSIZE(lbps->lbp_prop));
 	(void) printf("|\t\tcompralgo: %llu\n",
 	    (u_longlong_t)L2BLK_GET_COMPRESS(lbps->lbp_prop));
 	(void) printf("|\t\tcksumalgo: %llu\n",
 	    (u_longlong_t)L2BLK_GET_CHECKSUM(lbps->lbp_prop));
 	(void) printf("|\n\n");
 }
 
 /*
  * Walk the chain of L2ARC log blocks on the device open at fd, starting
  * from the pointers in l2dhdr->dh_start_lbps, and optionally print them.
  *
  * Each block is read, checked against the fletcher-4 checksum stored in
  * its pointer, decompressed if its pointer says it is compressed, and
  * checked for the log block magic.  Every block that passes is counted in
  * rebuild->dh_lb_count and its aligned size added to rebuild->dh_lb_asize.
  * The walk ends at the first block that fails any of these checks, or when
  * the pointer is rejected by l2arc_log_blkptr_valid() or the overlap test.
  *
  * Output is suppressed by -q; block pointers are printed when
  * dump_opt['l'] > 1 and individual entries when dump_opt['l'] > 2.
  */
 static void
 dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr,
     l2arc_dev_hdr_phys_t *rebuild)
 {
 	l2arc_log_blk_phys_t this_lb;
 	uint64_t asize;
 	l2arc_log_blkptr_t lbps[2];
 	zio_cksum_t cksum;
 	int failed = 0;
 	l2arc_dev_t dev;
 
 	if (!dump_opt['q'])
 		print_l2arc_log_blocks();
 	memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps));
 
 	/*
 	 * Only the fields assigned here and below are initialized in dev;
 	 * it exists solely to be passed to l2arc_log_blkptr_valid().
 	 */
 	dev.l2ad_evict = l2dhdr->dh_evict;
 	dev.l2ad_start = l2dhdr->dh_start;
 	dev.l2ad_end = l2dhdr->dh_end;
 
 	if (l2dhdr->dh_start_lbps[0].lbp_daddr == 0) {
 		/* no log blocks to read */
 		if (!dump_opt['q']) {
 			(void) printf("No log blocks to read\n");
 			(void) printf("\n");
 		}
 		return;
 	} else {
 		/* The hand is placed just past the newest log block. */
 		dev.l2ad_hand = lbps[0].lbp_daddr +
 		    L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
 	}
 
 	dev.l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
 
 	for (;;) {
 		if (!l2arc_log_blkptr_valid(&dev, &lbps[0]))
 			break;
 
 		/* L2BLK_GET_PSIZE returns aligned size for log blocks */
 		/*
 		 * NOTE(review): asize bytes are read into this_lb; this
 		 * presumably relies on l2arc_log_blkptr_valid() having
 		 * bounded the size by sizeof (this_lb) -- confirm.
 		 */
 		asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
 		if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) {
 			if (!dump_opt['q']) {
 				(void) printf("Error while reading next log "
 				    "block\n\n");
 			}
 			break;
 		}
 
 		/* The checksum covers the block as stored on the device. */
 		fletcher_4_native_varsize(&this_lb, asize, &cksum);
 		if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) {
 			failed++;
 			if (!dump_opt['q']) {
 				(void) printf("Invalid cksum\n");
 				dump_l2arc_log_blkptr(&lbps[0]);
 			}
 			break;
 		}
 
 		switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) {
 		case ZIO_COMPRESS_OFF:
 			break;
 		default: {
 			/*
 			 * Copy the compressed bytes aside, then decompress
 			 * from the copy back into this_lb.
 			 */
 			abd_t *abd = abd_alloc_linear(asize, B_TRUE);
 			abd_copy_from_buf_off(abd, &this_lb, 0, asize);
 			abd_t dabd;
 			abd_get_from_buf_struct(&dabd, &this_lb,
 			    sizeof (this_lb));
 			int err = zio_decompress_data(L2BLK_GET_COMPRESS(
 			    (&lbps[0])->lbp_prop), abd, &dabd,
 			    asize, sizeof (this_lb), NULL);
 			abd_free(&dabd);
 			abd_free(abd);
 			if (err != 0) {
 				(void) printf("L2ARC block decompression "
 				    "failed\n");
 				goto out;
 			}
 			break;
 		}
 		}
 
 		/* Accept a block written with the opposite byte order. */
 		if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
 			byteswap_uint64_array(&this_lb, sizeof (this_lb));
 		if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) {
 			if (!dump_opt['q'])
 				(void) printf("Invalid log block magic\n\n");
 			break;
 		}
 
 		rebuild->dh_lb_count++;
 		rebuild->dh_lb_asize += asize;
 		if (dump_opt['l'] > 1 && !dump_opt['q']) {
 			(void) printf("lb[%4llu]\tmagic: %llu\n",
 			    (u_longlong_t)rebuild->dh_lb_count,
 			    (u_longlong_t)this_lb.lb_magic);
 			dump_l2arc_log_blkptr(&lbps[0]);
 		}
 
 		/*
 		 * NOTE(review): the entry count comes from the device header
 		 * and is passed through unchecked -- confirm it cannot exceed
 		 * the capacity of lb_entries.
 		 */
 		if (dump_opt['l'] > 2 && !dump_opt['q'])
 			dump_l2arc_log_entries(l2dhdr->dh_log_entries,
 			    this_lb.lb_entries,
 			    rebuild->dh_lb_count);
 
 		if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
 		    lbps[0].lbp_payload_start, dev.l2ad_evict) &&
 		    !dev.l2ad_first)
 			break;
 
 		/* Step to the previous block in the chain. */
 		lbps[0] = lbps[1];
 		lbps[1] = this_lb.lb_prev_lbp;
 	}
 out:
 	if (!dump_opt['q']) {
 		(void) printf("log_blk_count:\t %llu with valid cksum\n",
 		    (u_longlong_t)rebuild->dh_lb_count);
 		(void) printf("\t\t %d with invalid cksum\n", failed);
 		(void) printf("log_blk_asize:\t %llu\n\n",
 		    (u_longlong_t)rebuild->dh_lb_asize);
 	}
 }
 
 static int
 dump_l2arc_header(int fd)
 {
 	l2arc_dev_hdr_phys_t l2dhdr = {0}, rebuild = {0};
 	int error = B_FALSE;
 
 	if (pread64(fd, &l2dhdr, sizeof (l2dhdr),
 	    VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) {
 		error = B_TRUE;
 	} else {
 		if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
 			byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr));
 
 		if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC)
 			error = B_TRUE;
 	}
 
 	if (error) {
 		(void) printf("L2ARC device header not found\n\n");
 		/* Do not return an error here for backward compatibility */
 		return (0);
 	} else if (!dump_opt['q']) {
 		print_l2arc_header();
 
 		(void) printf("    magic: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_magic);
 		(void) printf("    version: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_version);
 		(void) printf("    pool_guid: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_spa_guid);
 		(void) printf("    flags: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_flags);
 		(void) printf("    start_lbps[0]: %llu\n",
 		    (u_longlong_t)
 		    l2dhdr.dh_start_lbps[0].lbp_daddr);
 		(void) printf("    start_lbps[1]: %llu\n",
 		    (u_longlong_t)
 		    l2dhdr.dh_start_lbps[1].lbp_daddr);
 		(void) printf("    log_blk_ent: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_log_entries);
 		(void) printf("    start: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_start);
 		(void) printf("    end: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_end);
 		(void) printf("    evict: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_evict);
 		(void) printf("    lb_asize_refcount: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_lb_asize);
 		(void) printf("    lb_count_refcount: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_lb_count);
 		(void) printf("    trim_action_time: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_trim_action_time);
 		(void) printf("    trim_state: %llu\n\n",
 		    (u_longlong_t)l2dhdr.dh_trim_state);
 	}
 
 	dump_l2arc_log_blocks(fd, &l2dhdr, &rebuild);
 	/*
 	 * The total aligned size of log blocks and the number of log blocks
 	 * reported in the header of the device may be less than what zdb
 	 * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild().
 	 * This happens because dump_l2arc_log_blocks() lacks the memory
 	 * pressure valve that l2arc_rebuild() has. Thus, if we are on a system
 	 * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize
 	 * and dh_lb_count will be lower to begin with than what exists on the
 	 * device. This is normal and zdb should not exit with an error. The
 	 * opposite case should never happen though, the values reported in the
 	 * header should never be higher than what dump_l2arc_log_blocks() and
 	 * l2arc_rebuild() report. If this happens there is a leak in the
 	 * accounting of log blocks.
 	 */
 	if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize ||
 	    l2dhdr.dh_lb_count > rebuild.dh_lb_count)
 		return (1);
 
 	return (0);
 }
 
 /*
  * Print the configuration nvlist of label number l.
  *
  * Nothing is printed with -q.  Unless dump_opt['l'] is at least 3, a
  * config shared by several labels is printed only for the first label that
  * carries it.  With dump_opt['l'] of 2 or more, nvlist statistics follow;
  * buflen is the capacity reported there.
  */
 static void
 dump_config_from_label(zdb_label_t *label, size_t buflen, int l)
 {
 	int every_label = (dump_opt['l'] >= 3);
 
 	if (dump_opt['q'] ||
 	    (!every_label && first_label(label->config) != l))
 		return;
 
 	print_label_header(label, l);
 	dump_nvlist(label->config_nv, 4);
 	print_label_numbers("    labels = ", label->config);
 
 	if (dump_opt['l'] < 2)
 		return;
 
 	dump_nvlist_stats(label->config_nv, buflen);
 }
 
 /* Size of the buffer holding the "    Uberblock[n]\n" heading. */
 #define	ZDB_MAX_UB_HEADER_SIZE 32
 
 /*
  * Print the uberblocks of one label.
  *
  *	label		label state, including the per-slot checksum records
  *	ashift		ashift used to locate the uberblock slots
  *	label_num	index of this label, for the banner and de-duplication
  *
  * Verbosity is controlled by dump_opt['u']: slots without a record are
  * mentioned only at 2 or above; below 3 an uberblock shared by several
  * labels is printed only for the first label carrying it; below 4 the
  * trailing MMP slots holding an MMP uberblock with a non-zero delay are
  * skipped.
  */
 static void
 dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num)
 {
 	char title[ZDB_MAX_UB_HEADER_SIZE];
 	vdev_t vd;
 
 	/* A minimal stand-in vdev for the VDEV_UBERBLOCK_* macros. */
 	vd.vdev_top = &vd;
 	vd.vdev_ashift = ashift;
 
 	for (int slot = 0; slot < VDEV_UBERBLOCK_COUNT(&vd); slot++) {
 		uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, slot);
 		uberblock_t *ub = (void *)((char *)&label->label + uoff);
 		cksum_record_t *rec = label->uberblocks[slot];
 
 		if (rec == NULL) {
 			if (dump_opt['u'] < 2)
 				continue;
 			print_label_header(label, label_num);
 			(void) printf("    Uberblock[%d] invalid\n", slot);
 			continue;
 		}
 
 		if (dump_opt['u'] < 3 && first_label(rec) != label_num)
 			continue;
 
 		if (dump_opt['u'] < 4 &&
 		    ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
 		    slot >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL)
 			continue;
 
 		print_label_header(label, label_num);
 		(void) snprintf(title, sizeof (title),
 		    "    Uberblock[%d]\n", slot);
 		dump_uberblock(ub, title, "");
 		print_label_numbers("        labels = ", rec);
 	}
 }
 
 static char curpath[PATH_MAX];
 
 /*
  * Iterate through the path components, recursively passing
  * current one's obj and remaining path until we find the obj
  * for the last one.
  *
  *	os	objset to search
  *	obj	directory object in which the first component is looked up
  *	name	remaining path; modified in place (the first '/' is
  *		overwritten with a NUL)
  *	retobj	if non-NULL, receives the object number found; if NULL, the
  *		object is dumped instead
  *
  * Each component is appended to the file-scope curpath buffer so that
  * messages can show how far the walk got.  Returns 0 on success, the
  * zap_lookup() error if a component is not found, or EINVAL for any other
  * failure.
  */
 static int
 dump_path_impl(objset_t *os, uint64_t obj, char *name, uint64_t *retobj)
 {
 	int err;
 	boolean_t header = B_TRUE;
 	uint64_t child_obj;
 	char *s;
 	dmu_buf_t *db;
 	dmu_object_info_t doi;
 
 	/* Isolate the first component; s marks where the rest begins. */
 	if ((s = strchr(name, '/')) != NULL)
 		*s = '\0';
 	err = zap_lookup(os, obj, name, 8, 1, &child_obj);
 
 	(void) strlcat(curpath, name, sizeof (curpath));
 
 	if (err != 0) {
 		(void) fprintf(stderr, "failed to lookup %s: %s\n",
 		    curpath, strerror(err));
 		return (err);
 	}
 
 	/* The directory entry value encodes more than the object number. */
 	child_obj = ZFS_DIRENT_OBJ(child_obj);
 	err = sa_buf_hold(os, child_obj, FTAG, &db);
 	if (err != 0) {
 		(void) fprintf(stderr,
 		    "failed to get SA dbuf for obj %llu: %s\n",
 		    (u_longlong_t)child_obj, strerror(err));
 		return (EINVAL);
 	}
 	dmu_object_info_from_db(db, &doi);
 	sa_buf_rele(db, FTAG);
 
 	if (doi.doi_bonus_type != DMU_OT_SA &&
 	    doi.doi_bonus_type != DMU_OT_ZNODE) {
 		(void) fprintf(stderr, "invalid bonus type %d for obj %llu\n",
 		    doi.doi_bonus_type, (u_longlong_t)child_obj);
 		return (EINVAL);
 	}
 
 	if (dump_opt['v'] > 6) {
 		(void) printf("obj=%llu %s type=%d bonustype=%d\n",
 		    (u_longlong_t)child_obj, curpath, doi.doi_type,
 		    doi.doi_bonus_type);
 	}
 
 	(void) strlcat(curpath, "/", sizeof (curpath));
 
 	switch (doi.doi_type) {
 	case DMU_OT_DIRECTORY_CONTENTS:
 		/* Descend only if something follows the separator. */
 		if (s != NULL && *(s + 1) != '\0')
 			return (dump_path_impl(os, child_obj, s + 1, retobj));
 		zfs_fallthrough;
 	case DMU_OT_PLAIN_FILE_CONTENTS:
 		if (retobj != NULL) {
 			*retobj = child_obj;
 		} else {
 			dump_object(os, child_obj, dump_opt['v'], &header,
 			    NULL, 0);
 		}
 		return (0);
 	default:
 		/*
 		 * doi describes child_obj, so that is the object to name
 		 * here, not the parent directory it was looked up in.
 		 */
 		(void) fprintf(stderr, "object %llu has non-file/directory "
 		    "type %d\n", (u_longlong_t)child_obj, doi.doi_type);
 		break;
 	}
 
 	return (EINVAL);
 }
 
 /*
  * Dump the blocks for the object specified by path inside the dataset.
  *
  * The dataset ds is opened, its root directory object looked up, and the
  * path resolved from there by dump_path_impl(); retobj is passed through
  * unchanged.  Returns 0 on success or an errno value.
  */
 static int
 dump_path(char *ds, char *path, uint64_t *retobj)
 {
 	objset_t *os;
 	uint64_t root_obj;
 	int err;
 
 	err = open_objset(ds, FTAG, &os);
 	if (err != 0)
 		return (err);
 
 	err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj);
 	if (err == 0) {
 		/* Seed the path shown in messages emitted during the walk. */
 		(void) snprintf(curpath, sizeof (curpath),
 		    "dataset=%s path=/", ds);
 		err = dump_path_impl(os, root_obj, path, retobj);
 	} else {
 		(void) fprintf(stderr, "can't lookup root znode: %s\n",
 		    strerror(err));
 		err = EINVAL;
 	}
 
 	close_objset(os, FTAG);
 	return (err);
 }
 
 /*
  * Output callback that writes len bytes from buf to standard output.
  * os and arg are unused.  Returns 0 once everything has been written, or
  * the errno of a failed write().
  */
 static int
 dump_backup_bytes(objset_t *os, void *buf, int len, void *arg)
 {
 	const char *cursor = buf;
 
 	(void) os;
 	(void) arg;
 
 	/* Write the data out, handling short writes and signals. */
 	for (;;) {
 		ssize_t done = write(STDOUT_FILENO, cursor, len);
 
 		/* Everything that was left has been written. */
 		if (done >= len)
 			break;
 
 		if (done < 0) {
 			/* Interrupted before writing anything: retry. */
 			if (errno == EINTR)
 				continue;
 			return (errno);
 		}
 
 		/* Short write: advance past what was accepted. */
 		cursor += done;
 		len -= done;
 	}
 
 	return (0);
 }
 
 /*
  * Write a send stream for objset objset_id of pool to standard output.
  *
  * flagstr selects stream options, one character each: 'e' (embed),
  * 'L' (large_block), 'c' (compress), 'w' (raw).  NULL or "" selects none.
  * An unknown flag, a terminal on stdout, or a dmu_send_obj() failure is
  * reported on stderr.
  */
 static void
 dump_backup(const char *pool, uint64_t objset_id, const char *flagstr)
 {
 	boolean_t embed = B_FALSE;
 	boolean_t large_block = B_FALSE;
 	boolean_t compress = B_FALSE;
 	boolean_t raw = B_FALSE;
 
 	if (flagstr != NULL) {
 		for (const char *flag = flagstr; *flag != '\0'; flag++) {
 			switch (*flag) {
 			case 'e':
 				embed = B_TRUE;
 				break;
 			case 'L':
 				large_block = B_TRUE;
 				break;
 			case 'c':
 				compress = B_TRUE;
 				break;
 			case 'w':
 				raw = B_TRUE;
 				break;
 			default:
 				fprintf(stderr, "dump_backup: invalid flag "
 				    "'%c'\n", *flag);
 				return;
 			}
 		}
 	}
 
 	/* A send stream is binary; refuse to write it to a terminal. */
 	if (isatty(STDOUT_FILENO)) {
 		fprintf(stderr, "dump_backup: stream cannot be written "
 		    "to a terminal\n");
 		return;
 	}
 
 	offset_t off = 0;
 	dmu_send_outparams_t out = {
 	    .dso_outfunc = dump_backup_bytes,
 	    .dso_dryrun  = B_FALSE,
 	};
 
 	int err = dmu_send_obj(pool, objset_id, /* fromsnap */0, embed,
 	    large_block, compress, raw, /* saved */ B_FALSE, STDOUT_FILENO,
 	    &off, &out);
 	if (err != 0)
 		fprintf(stderr, "dump_backup: dmu_send_obj: %s\n",
 		    strerror(err));
 }
 
 /*
  * Copy the contents of object srcobj in os to the file destfile.
  *
  * The object's size is taken from its ZPL_SIZE system attribute and the
  * data copied in chunks of at most 1 MiB.  destfile is created, or
  * truncated if it already exists.
  *
  * Returns 0 on success.  On failure returns an errno value: the error from
  * the SA or DMU layer, EINVAL if the object is empty, ENOMEM if the copy
  * buffer cannot be allocated, the errno of a failed open()/write(), or EIO
  * if write() accepted only part of a chunk.
  */
 static int
 zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile)
 {
 	int err = 0;
 	uint64_t size, readsize, oursize, offset;
 	ssize_t writesize;
 	sa_handle_t *hdl;
 
 	(void) printf("Copying object %" PRIu64 " to file %s\n", srcobj,
 	    destfile);
 
 	VERIFY3P(os, ==, sa_os);
 	if ((err = sa_handle_get(os, srcobj, NULL, SA_HDL_PRIVATE, &hdl))) {
 		(void) printf("Failed to get handle for SA znode\n");
 		return (err);
 	}
 	if ((err = sa_lookup(hdl, sa_attr_table[ZPL_SIZE], &size, 8))) {
 		(void) sa_handle_destroy(hdl);
 		return (err);
 	}
 	(void) sa_handle_destroy(hdl);
 
 	(void) printf("Object %" PRIu64 " is %" PRIu64 " bytes\n", srcobj,
 	    size);
 	if (size == 0) {
 		return (EINVAL);
 	}
 
 	int fd = open(destfile, O_WRONLY | O_CREAT | O_TRUNC, 0644);
 	if (fd == -1)
 		return (errno);
 	/*
 	 * We cap the size at 1 mebibyte here to prevent
 	 * allocation failures and nigh-infinite printing if the
 	 * object is extremely large.
 	 */
 	oursize = MIN(size, 1 << 20);
 	offset = 0;
 	char *buf = kmem_alloc(oursize, KM_NOSLEEP);
 	if (buf == NULL) {
 		(void) close(fd);
 		return (ENOMEM);
 	}
 
 	while (offset < size) {
 		/* Never larger than oursize, the size of buf. */
 		readsize = MIN(size - offset, 1 << 20);
 		err = dmu_read(os, srcobj, offset, readsize, buf, 0);
 		if (err != 0) {
 			(void) printf("got error %d from dmu_read\n", err);
 			break;
 		}
 		if (dump_opt['v'] > 3) {
 			(void) printf("Read offset=%" PRIu64 " size=%" PRIu64
 			    " error=%d\n", offset, readsize, err);
 		}
 
 		writesize = write(fd, buf, readsize);
 		if (writesize < 0) {
 			err = errno;
 			break;
 		} else if ((uint64_t)writesize != readsize) {
 			/*
 			 * Incomplete write.  The copy is truncated, so this
 			 * must be reported to the caller as a failure rather
 			 * than returning the 0 left behind by dmu_read().
 			 */
 			(void) fprintf(stderr, "Short write, only wrote %llu of"
 			    " %" PRIu64 " bytes, exiting...\n",
 			    (u_longlong_t)writesize, readsize);
 			err = EIO;
 			break;
 		}
 
 		offset += readsize;
 	}
 
 	/* Common cleanup for both the success and the failure paths. */
 	(void) close(fd);
 	kmem_free(buf, oursize);
 
 	return (err);
 }
 
 /*
  * Check the embedded checksum of the vl_vdev_phys region of a label.
  *
  *	label	label as read from the device
  *	offset	device offset the label was read from
  *
  * Returns B_TRUE if the checksum computed over the region matches the one
  * stored in its trailing zio_eck_t, B_FALSE otherwise.
  *
  * Side effect: the stored checksum inside *label is overwritten with the
  * verifier value and is not restored before returning.
  */
 static boolean_t
 label_cksum_valid(vdev_label_t *label, uint64_t offset)
 {
 	zio_checksum_info_t *ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL];
 	zio_cksum_t expected_cksum;
 	zio_cksum_t actual_cksum;
 	zio_cksum_t verifier;
 	zio_eck_t *eck;
 	int byteswap;
 
 	/* The zio_eck_t occupies the tail end of the VDEV_PHYS_SIZE region. */
 	void *data = (char *)label + offsetof(vdev_label_t, vl_vdev_phys);
 	eck = (zio_eck_t *)((char *)(data) + VDEV_PHYS_SIZE) - 1;
 
 	/* The verifier is derived from the region's offset on the device. */
 	offset += offsetof(vdev_label_t, vl_vdev_phys);
 	ZIO_SET_CHECKSUM(&verifier, offset, 0, 0, 0);
 
 	/* A byte-swapped magic means the label has the opposite endianness. */
 	byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
 	if (byteswap)
 		byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
 
 	/*
 	 * Save the stored checksum and put the verifier in its place, so the
 	 * checksum below is computed over the region with the verifier
 	 * embedded.
 	 */
 	expected_cksum = eck->zec_cksum;
 	eck->zec_cksum = verifier;
 
 	abd_t *abd = abd_get_from_buf(data, VDEV_PHYS_SIZE);
 	ci->ci_func[byteswap](abd, VDEV_PHYS_SIZE, NULL, &actual_cksum);
 	abd_free(abd);
 
 	if (byteswap)
 		byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t));
 
 	if (ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * Read, verify and print the VDEV_LABELS labels of the device named by dev,
  * their uberblocks when -u is given, and the L2ARC header if a label
  * identifies the device as a cache device.
  *
  * dev is either a path or a short device name; a name that is not absolute
  * and does not stat() is resolved with zfs_resolve_shortname().
  *
  * Identical configs and identical uberblocks found in several labels are
  * tracked by checksum in two AVL trees so they can be printed once along
  * with the list of labels that carry them.
  *
  * Returns 2 if no label config could be unpacked, 1 if any other error was
  * noted, 0 otherwise.  Failure to open or stat the device exits the
  * program via zdb_exit(1).
  */
 static int
 dump_label(const char *dev)
 {
 	char path[MAXPATHLEN];
 	zdb_label_t labels[VDEV_LABELS] = {{{{0}}}};
 	uint64_t psize, ashift, l2cache;
 	struct stat64 statbuf;
 	boolean_t config_found = B_FALSE;
 	boolean_t error = B_FALSE;
 	boolean_t read_l2arc_header = B_FALSE;
 	avl_tree_t config_tree;
 	avl_tree_t uberblock_tree;
 	void *node, *cookie;
 	int fd;
 
 	/*
 	 * Check if we were given absolute path and use it as is.
 	 * Otherwise if the provided vdev name doesn't point to a file,
 	 * try prepending expected disk paths and partition numbers.
 	 */
 	(void) strlcpy(path, dev, sizeof (path));
 	if (dev[0] != '/' && stat64(path, &statbuf) != 0) {
 		/* Local to this block; shadows the function-level flag. */
 		int error;
 
 		error = zfs_resolve_shortname(dev, path, MAXPATHLEN);
 		if (error == 0 && zfs_dev_is_whole_disk(path)) {
 			if (zfs_append_partition(path, MAXPATHLEN) == -1)
 				error = ENOENT;
 		}
 
 		if (error || (stat64(path, &statbuf) != 0)) {
 			(void) printf("failed to find device %s, try "
 			    "specifying absolute path instead\n", dev);
 			return (1);
 		}
 	}
 
 	if ((fd = open64(path, O_RDONLY)) < 0) {
 		(void) printf("cannot open '%s': %s\n", path, strerror(errno));
 		zdb_exit(1);
 	}
 
 	if (fstat64_blk(fd, &statbuf) != 0) {
 		(void) printf("failed to stat '%s': %s\n", path,
 		    strerror(errno));
 		(void) close(fd);
 		zdb_exit(1);
 	}
 
 	/* A failed flush is reported but is not fatal. */
 	if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0)
 		(void) printf("failed to invalidate cache '%s' : %s\n", path,
 		    strerror(errno));
 
 	avl_create(&config_tree, cksum_record_compare,
 	    sizeof (cksum_record_t), offsetof(cksum_record_t, link));
 	avl_create(&uberblock_tree, cksum_record_compare,
 	    sizeof (cksum_record_t), offsetof(cksum_record_t, link));
 
 	/* Label offsets are computed from the size aligned down to a label. */
 	psize = statbuf.st_size;
 	psize = P2ALIGN_TYPED(psize, sizeof (vdev_label_t), uint64_t);
 	ashift = SPA_MINBLOCKSHIFT;
 
 	/*
 	 * 1. Read the label from disk
 	 * 2. Verify label cksum
 	 * 3. Unpack the configuration and insert in config tree.
 	 * 4. Traverse all uberblocks and insert in uberblock tree.
 	 */
 	for (int l = 0; l < VDEV_LABELS; l++) {
 		zdb_label_t *label = &labels[l];
 		char *buf = label->label.vl_vdev_phys.vp_nvlist;
 		size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);
 		nvlist_t *config;
 		cksum_record_t *rec;
 		zio_cksum_t cksum;
 		vdev_t vd;
 
 		label->label_offset = vdev_label_offset(psize, l, 0);
 
 		if (pread64(fd, &label->label, sizeof (label->label),
 		    label->label_offset) != sizeof (label->label)) {
 			if (!dump_opt['q'])
 				(void) printf("failed to read label %d\n", l);
 			label->read_failed = B_TRUE;
 			error = B_TRUE;
 			continue;
 		}
 
 		label->read_failed = B_FALSE;
 		label->cksum_valid = label_cksum_valid(&label->label,
 		    label->label_offset);
 
 		if (nvlist_unpack(buf, buflen, &config, 0) == 0) {
 			nvlist_t *vdev_tree = NULL;
 			size_t size;
 
 			/* Fall back to the minimum ashift if none is recorded. */
 			if ((nvlist_lookup_nvlist(config,
 			    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
 			    (nvlist_lookup_uint64(vdev_tree,
 			    ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
 				ashift = SPA_MINBLOCKSHIFT;
 
 			/* Checksum the whole buffer if the size is unknown. */
 			if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0)
 				size = buflen;
 
 			/* If the device is a cache device read the header. */
 			if (!read_l2arc_header) {
 				if (nvlist_lookup_uint64(config,
 				    ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 &&
 				    l2cache == POOL_STATE_L2CACHE) {
 					read_l2arc_header = B_TRUE;
 				}
 			}
 
 			fletcher_4_native_varsize(buf, size, &cksum);
 			rec = cksum_record_insert(&config_tree, &cksum, l);
 
 			label->config = rec;
 			label->config_nv = config;
 			config_found = B_TRUE;
 		} else {
 			error = B_TRUE;
 		}
 
 		/* A minimal stand-in vdev for the VDEV_UBERBLOCK_* macros. */
 		vd.vdev_ashift = ashift;
 		vd.vdev_top = &vd;
 
 		for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) {
 			uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i);
 			uberblock_t *ub = (void *)((char *)label + uoff);
 
 			/* Slots that fail verification keep a NULL record. */
 			if (uberblock_verify(ub))
 				continue;
 
 			fletcher_4_native_varsize(ub, sizeof (*ub), &cksum);
 			rec = cksum_record_insert(&uberblock_tree, &cksum, l);
 
 			label->uberblocks[i] = rec;
 		}
 	}
 
 	/*
 	 * Dump the label and uberblocks.
 	 */
 	for (int l = 0; l < VDEV_LABELS; l++) {
 		zdb_label_t *label = &labels[l];
 		size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);
 
 		if (label->read_failed == B_TRUE)
 			continue;
 
 		if (label->config_nv) {
 			dump_config_from_label(label, buflen, l);
 		} else {
 			if (!dump_opt['q'])
 				(void) printf("failed to unpack label %d\n", l);
 		}
 
 		/*
 		 * NOTE(review): ashift here is whatever the read loop above
 		 * left behind, i.e. the value from the last label processed,
 		 * not necessarily this label's own -- confirm this is
 		 * intended.
 		 */
 		if (dump_opt['u'])
 			dump_label_uberblocks(label, ashift, l);
 
 		nvlist_free(label->config_nv);
 	}
 
 	/*
 	 * Dump the L2ARC header, if existent.
 	 */
 	if (read_l2arc_header)
 		error |= dump_l2arc_header(fd);
 
 	/* Free every record before destroying the now-empty trees. */
 	cookie = NULL;
 	while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL)
 		umem_free(node, sizeof (cksum_record_t));
 
 	cookie = NULL;
 	while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL)
 		umem_free(node, sizeof (cksum_record_t));
 
 	avl_destroy(&config_tree);
 	avl_destroy(&uberblock_tree);
 
 	(void) close(fd);
 
 	return (config_found == B_FALSE ? 2 :
 	    (error == B_TRUE ? 1 : 0));
 }
 
 /*
  * Feature-usage tallies accumulated by dump_one_objset():
  *  - dataset_feature_count[f] is bumped once for every dataset on which
  *    feature f is active;
  *  - global_feature_count[f] is bumped for features detected while walking
  *    a dataset's bookmarks and livelist (redaction bookmarks, redaction
  *    list spill, bookmark_written, livelist);
  *  - remap_deadlist_count is bumped for every dataset that has a remap
  *    deadlist.
  */
 static uint64_t dataset_feature_count[SPA_FEATURES];
 static uint64_t global_feature_count[SPA_FEATURES];
 static uint64_t remap_deadlist_count = 0;
 
 /*
  * Per-dataset callback: open the named dataset, tally which features it
  * (and its bookmarks and livelist) use, dump it, and close it again.
  *
  * dsname: name of the dataset to open.
  * arg:    unused.
  *
  * Always returns 0.  A dataset that cannot be opened is silently skipped,
  * presumably so that one bad dataset does not stop the caller's walk.
  */
 static int
 dump_one_objset(const char *dsname, void *arg)
 {
 	(void) arg;
 	int error;
 	objset_t *os;
 	spa_feature_t f;
 
 	error = open_objset(dsname, FTAG, &os);
 	if (error != 0)
 		return (0);
 
 	/* Count every feature that is active on this dataset. */
 	for (f = 0; f < SPA_FEATURES; f++) {
 		if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f))
 			continue;
 		ASSERT(spa_feature_table[f].fi_flags &
 		    ZFEATURE_FLAG_PER_DATASET);
 		dataset_feature_count[f]++;
 	}
 
 	if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) {
 		remap_deadlist_count++;
 	}
 
 	/* Walk the dataset's bookmarks looking for bookmark features. */
 	for (dsl_bookmark_node_t *dbn =
 	    avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL;
 	    dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) {
 		mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj);
 		if (dbn->dbn_phys.zbm_redaction_obj != 0) {
 			global_feature_count[
 			    SPA_FEATURE_REDACTION_BOOKMARKS]++;
 			objset_t *mos = os->os_spa->spa_meta_objset;
 			dnode_t *rl;
 			VERIFY0(dnode_hold(mos,
 			    dbn->dbn_phys.zbm_redaction_obj, FTAG, &rl));
 			if (rl->dn_have_spill) {
 				global_feature_count[
 				    SPA_FEATURE_REDACTION_LIST_SPILL]++;
 			}
 			/*
 			 * The dnode was only held to inspect dn_have_spill;
 			 * drop the hold so it is not leaked once per
 			 * redaction bookmark.
 			 */
 			dnode_rele(rl, FTAG);
 		}
 		if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)
 			global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++;
 	}
 
 	/* An open livelist on a non-snapshot dataset counts as one use. */
 	if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) &&
 	    !dmu_objset_is_snapshot(os)) {
 		global_feature_count[SPA_FEATURE_LIVELIST]++;
 	}
 
 	dump_objset(os);
 	close_objset(os, FTAG);
 	fuid_table_destroy();
 	return (0);
 }
 
 /*
  * Block statistics.
  *
  * PSIZE_HISTO_SIZE is the number of buckets in zb_psize_histogram: the
  * bucket index is the physical size in SPA_MINBLOCKSIZE units, and anything
  * larger than SPA_OLD_MAXBLOCKSIZE is folded into the final bucket (see
  * zdb_count_block()).
  */
 #define	PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
 typedef struct zdb_blkstats {
 	uint64_t zb_asize;		/* sum of BP_GET_ASIZE() */
 	uint64_t zb_lsize;		/* sum of BP_GET_LSIZE() */
 	uint64_t zb_psize;		/* sum of BP_GET_PSIZE() */
 	uint64_t zb_count;		/* number of blocks counted */
 	uint64_t zb_gangs;		/* sum of BP_COUNT_GANG() */
 	/* blocks with at least two DVAs on the same vdev */
 	uint64_t zb_ditto_samevdev;
 	/* ... of which two such DVAs also share a metaslab */
 	uint64_t zb_ditto_same_ms;
 	/* block counts bucketed by physical size, see above */
 	uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
 } zdb_blkstats_t;
 
 /*
  * Extended object types to report deferred frees and dedup auto-ditto blocks.
  * They are numbered directly after the real DMU object types, so they can
  * share the type dimension of zdb_cb_t's zcb_type[][] array; ZDB_OT_TOTAL is
  * the slot in which totals across all types are accumulated.
  */
 #define	ZDB_OT_DEFERRED	(DMU_OT_NUMTYPES + 0)
 #define	ZDB_OT_DITTO	(DMU_OT_NUMTYPES + 1)
 #define	ZDB_OT_OTHER	(DMU_OT_NUMTYPES + 2)
 #define	ZDB_OT_TOTAL	(DMU_OT_NUMTYPES + 3)
 
 /*
  * Display names for the extended types, listed in the same order as the
  * ZDB_OT_* values above (presumably indexed by type - DMU_OT_NUMTYPES;
  * confirm against the code that prints them).
  */
 static const char *zdb_ot_extname[] = {
 	"deferred free",
 	"dedup ditto",
 	"other",
 	"Total",
 };
 
 /*
  * ZB_TOTAL is the extra "level" index of zcb_type[][] under which totals
  * across all indirection levels are accumulated.
  *
  * SPA_MAX_FOR_16M is the number of entries in the power-of-two size
  * histograms; one more than SPA_MAXBLOCKSHIFT so that a size of exactly
  * 2^SPA_MAXBLOCKSHIFT has a bin of its own.
  */
 #define	ZB_TOTAL	DN_MAX_LEVELS
 #define	SPA_MAX_FOR_16M	(SPA_MAXBLOCKSHIFT+1)
 
 /*
  * One cloned block remembered in zdb's private, in-memory copy of the BRT
  * (the zcb_brt AVL tree), see zdb_count_block().
  */
 typedef struct zdb_brt_entry {
 	dva_t		zbre_dva;	/* first DVA of the block (lookup key) */
 	uint64_t	zbre_refcount;	/* BRT references not yet seen again */
 	avl_node_t	zbre_node;	/* linkage in zcb_brt */
 } zdb_brt_entry_t;
 
 /*
  * State shared by the block traversal callbacks (zdb_blkptr_cb(),
  * zdb_count_block(), zdb_blkptr_done()) and the leak-detection helpers.
  */
 typedef struct zdb_cb {
 	/* per [level][type] statistics; ZB_TOTAL/ZDB_OT_TOTAL hold totals */
 	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
 	/* allocated space claimed on behalf of an in-progress vdev removal */
 	uint64_t	zcb_removing_size;
 	/* space excluded because it is recorded in checkpoint space maps */
 	uint64_t	zcb_checkpoint_size;
 	/* asize/count of second-or-later sightings of dedup'd blocks */
 	uint64_t	zcb_dedup_asize;
 	uint64_t	zcb_dedup_blocks;
 	/* asize/count of second-or-later sightings of cloned blocks */
 	uint64_t	zcb_clone_asize;
 	uint64_t	zcb_clone_blocks;
 	/* power-of-two histograms, indexed by highbit64(size) - 1 */
 	uint64_t	zcb_psize_count[SPA_MAX_FOR_16M];
 	uint64_t	zcb_lsize_count[SPA_MAX_FOR_16M];
 	uint64_t	zcb_asize_count[SPA_MAX_FOR_16M];
 	uint64_t	zcb_psize_len[SPA_MAX_FOR_16M];
 	uint64_t	zcb_lsize_len[SPA_MAX_FOR_16M];
 	uint64_t	zcb_asize_len[SPA_MAX_FOR_16M];
 	uint64_t	zcb_psize_total;
 	uint64_t	zcb_lsize_total;
 	uint64_t	zcb_asize_total;
 	/* embedded block pointers, by embedded type and by payload size */
 	uint64_t	zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
 	uint64_t	zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
 	    [BPE_PAYLOAD_SIZE + 1];
 	/* progress reporting: start time, last print time, expected total */
 	uint64_t	zcb_start;
 	hrtime_t	zcb_lastprint;
 	uint64_t	zcb_totalasize;
 	/* read errors seen by zdb_blkptr_done(), indexed by error number */
 	uint64_t	zcb_errors[256];
 	int		zcb_readfails;
 	/* set to 1 once any non-speculative read has failed */
 	int		zcb_haderrors;
 	spa_t		*zcb_spa;
 	/* per-vdev obsolete count arrays, indexed by vdev id */
 	uint32_t	**zcb_vd_obsolete_counts;
 	/* tree of zdb_brt_entry_t; only consulted when zcb_brt_is_active */
 	avl_tree_t	zcb_brt;
 	boolean_t	zcb_brt_is_active;
 } zdb_cb_t;
 
 /*
  * Report whether two offsets on the same top-level vdev fall inside the
  * same metaslab, i.e. whether they are equal once shifted right by that
  * vdev's metaslab shift.
  */
 static boolean_t
 same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2)
 {
 	const vdev_t *top = vdev_lookup_top(spa, vdev);
 	const uint64_t ms1 = off1 >> top->vdev_ms_shift;
 	const uint64_t ms2 = off2 >> top->vdev_ms_shift;
 
 	return (ms1 == ms2);
 }
 
 /*
  * Used to simplify reporting of the histogram data: one of these describes
  * a single size histogram (psize, lsize or asize) to dump_size_histograms().
  */
 typedef struct one_histo {
 	const char *name;	/* column group title */
 	uint64_t *count;	/* per-bin block counts */
 	uint64_t *len;		/* per-bin byte totals */
 	uint64_t cumulative;	/* running sum of len[] while printing */
 } one_histo_t;
 
 /*
  * The number of separate histograms processed for psize, lsize and asize.
  */
 #define	NUM_HISTO 3
 
 /*
  * Print one fixed-layout table holding the psize, lsize and asize
  * histograms side by side.  There is one row per power-of-two block size
  * from 2^SPA_MINBLOCKSHIFT up to 2^(SPA_MAX_FOR_16M - 1), and for each
  * histogram the row shows the block count, the total length and the
  * cumulative length so far.
  *
  * Numbers go through zdb_nicenum(), and the table is laid out in
  * fixed-width columns unless '-P' was given, in which case the columns
  * are tab separated so that the output is easy to parse.
  */
 static void
 dump_size_histograms(zdb_cb_t *zcb)
 {
 	/* Scratch buffer receiving each zdb_nicenum()-formatted cell. */
 	char cell[32];
 
 	/* Non-zero when '-P' selected the tab-separated layout. */
 	const int parseable = dump_opt['P'];
 
 	/* The three histograms, in the order they appear on each row. */
 	one_histo_t histo[NUM_HISTO] = {
 		{
 			.name = "psize",
 			.count = zcb->zcb_psize_count,
 			.len = zcb->zcb_psize_len,
 			.cumulative = 0,
 		},
 		{
 			.name = "lsize",
 			.count = zcb->zcb_lsize_count,
 			.len = zcb->zcb_lsize_len,
 			.cumulative = 0,
 		},
 		{
 			.name = "asize",
 			.count = zcb->zcb_asize_count,
 			.len = zcb->zcb_asize_len,
 			.cumulative = 0,
 		},
 	};
 
 	(void) printf("\nBlock Size Histogram\n");
 
 	/* First title line: "block" followed by the histogram names. */
 	if (parseable)
 		(void) printf("\n%s\t", "block");
 	else
 		(void) printf("\n%7s   ", "block");
 
 	for (int h = 0; h < NUM_HISTO; h++) {
 		const char *name = histo[h].name;
 		const int last = (h == NUM_HISTO - 1);
 
 		/* The last name is printed without trailing padding. */
 		if (parseable && last)
 			(void) printf("  %s", name);
 		else if (parseable)
 			(void) printf("%s\t\t\t", name);
 		else if (last)
 			(void) printf("%s", name);
 		else
 			(void) printf("%-7s              ", name);
 	}
 	(void) printf("\n");
 
 	/* Second title line: "size" followed by the per-histogram columns. */
 	if (parseable)
 		(void) printf("%s\t", "size");
 	else
 		(void) printf("%7s ", "size");
 
 	for (int h = 0; h < NUM_HISTO; h++) {
 		if (parseable) {
 			(void) printf("%s\t%s\t%s\t",
 			    "Count", "Size", "Cum.");
 		} else {
 			(void) printf("%7s%7s%7s",
 			    "Count", "Size", "Cum.");
 		}
 	}
 	(void) printf("\n");
 
 	/* One row per power-of-two block size. */
 	for (int shift = SPA_MINBLOCKSHIFT; shift < SPA_MAX_FOR_16M; shift++) {
 		/* Leading column: the block size itself. */
 		zdb_nicenum((1ULL << shift), cell, sizeof (cell));
 		if (parseable)
 			(void) printf("%s", cell);
 		else
 			(void) printf("%7s:", cell);
 
 		for (int h = 0; h < NUM_HISTO; h++) {
 			one_histo_t *oh = &histo[h];
 
 			oh->cumulative += oh->len[shift];
 
 			/* Count, length and cumulative length, in order. */
 			const uint64_t cols[] = {
 				oh->count[shift],
 				oh->len[shift],
 				oh->cumulative,
 			};
 			const size_t ncols = sizeof (cols) / sizeof (cols[0]);
 
 			for (size_t c = 0; c < ncols; c++) {
 				zdb_nicenum(cols[c], cell, sizeof (cell));
 				if (parseable)
 					(void) printf("\t%s", cell);
 				else
 					(void) printf("%7s", cell);
 			}
 		}
 		(void) printf("\n");
 	}
 }
 
 /*
  * Account for one block pointer in the traversal statistics and, unless
  * leak tracking is disabled (-L) or the block was already claimed, claim
  * its space.
  *
  * zcb:   traversal state that receives the statistics.
  * zilog: if non-NULL, the block is first added to the ZIL's bp tree and is
  *        ignored here when zil_bp_tree_add() returns non-zero.
  * bp:    the block pointer to count.
  * type:  object type to account the block under; must be below
  *        ZDB_OT_TOTAL.
  *
  * Dedup'd and cloned blocks are counted every time they are seen but are
  * claimed only on their first sighting.
  */
 static void
 zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
     dmu_object_type_t type)
 {
 	int i;
 
 	ASSERT(type < ZDB_OT_TOTAL);
 
 	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
 		return;
 
 	/*
 	 * This flag controls if we will issue a claim for the block while
 	 * counting it, to ensure that all blocks are referenced in space maps.
 	 * We don't issue claims if we're not doing leak tracking, because it's
 	 * expensive if the user isn't interested. We also don't claim the
 	 * second or later occurrences of cloned or dedup'd blocks, because we
 	 * already claimed them the first time.
 	 */
 	boolean_t do_claim = !dump_opt['L'];
 
 	spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
 
 	blkptr_t tempbp;
 	if (BP_GET_DEDUP(bp)) {
 		/*
 		 * Dedup'd blocks are special. We need to count them, so we can
 		 * later uncount them when reporting leaked space, and we must
 		 * only claim them once.
 		 *
 		 * We use the existing dedup system to track what we've seen.
 		 * The first time we see a block, we do a ddt_lookup() to see
 		 * if it exists in the DDT. If we're doing leak tracking, we
 		 * claim the block at this time.
 		 *
 		 * Each time we see a block, we reduce the refcount in the
 		 * entry by one, and add to the size and count of dedup'd
 		 * blocks to report at the end.
 		 */
 
 		ddt_t *ddt = ddt_select(zcb->zcb_spa, bp);
 
 		ddt_enter(ddt);
 
 		/*
 		 * Find the block. This will create the entry in memory, but
 		 * we'll know if that happened by its refcount.
 		 */
 		ddt_entry_t *dde = ddt_lookup(ddt, bp);
 
 		/*
 		 * ddt_lookup() can return NULL if this block didn't exist
 		 * in the DDT and creating it would take the DDT over its
 		 * quota. Since we got the block from disk, it must exist in
 		 * the DDT, so this can't happen. However, when unique entries
 		 * are pruned, the dedup bit can be set with no corresponding
 		 * entry in the DDT.
 		 */
 		if (dde == NULL) {
 			ddt_exit(ddt);
 			goto skipped;
 		}
 
 		/* Get the phys for this variant */
 		ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
 
 		/*
 		 * This entry may have multiple sets of DVAs. We must claim
 		 * each set the first time we see them in a real block on disk,
 		 * or count them on subsequent occurrences. We don't have a
 		 * convenient way to track the first time we see each variant,
 		 * so we repurpose dde_io as a set of "seen" flag bits. We can
 		 * do this safely in zdb because it never writes, so it will
 		 * never have a writing zio for this block in that pointer.
 		 */
 		boolean_t seen = !!(((uintptr_t)dde->dde_io) & (1 << v));
 		if (!seen)
 			dde->dde_io =
 			    (void *)(((uintptr_t)dde->dde_io) | (1 << v));
 
 		/* Consume a reference for this block. */
 		if (ddt_phys_total_refcnt(ddt, dde->dde_phys) > 0)
 			ddt_phys_decref(dde->dde_phys, v);
 
 		/*
 		 * If this entry has a single flat phys, it may have been
 		 * extended with additional DVAs at some time in its life.
 		 * This block might be from before it was fully extended, and
 		 * so have fewer DVAs.
 		 *
 		 * If this is the first time we've seen this block, and we
 		 * claimed it as-is, then we would miss the claim on some
 		 * number of DVAs, which would then be seen as leaked.
 		 *
 		 * In all cases, if we've had fewer DVAs, then the asize would
 		 * be too small, and would lead to the pool apparently using
 		 * more space than allocated.
 		 *
 		 * To handle this, we copy the canonical set of DVAs from the
 		 * entry back to the block pointer before we claim it.
 		 */
 		if (v == DDT_PHYS_FLAT) {
 			ASSERT3U(BP_GET_BIRTH(bp), ==,
 			    ddt_phys_birth(dde->dde_phys, v));
 			tempbp = *bp;
 			ddt_bp_fill(dde->dde_phys, v, &tempbp,
 			    BP_GET_BIRTH(bp));
 			bp = &tempbp;
 		}
 
 		if (seen) {
 			/*
 			 * The second or later time we see this block,
 			 * it's a duplicate and we count it.
 			 */
 			zcb->zcb_dedup_asize += BP_GET_ASIZE(bp);
 			zcb->zcb_dedup_blocks++;
 
 			/* Already claimed, don't do it again. */
 			do_claim = B_FALSE;
 		}
 
 		ddt_exit(ddt);
 	} else if (zcb->zcb_brt_is_active &&
 	    brt_maybe_exists(zcb->zcb_spa, bp)) {
 		/*
 		 * Cloned blocks are special. We need to count them, so we can
 		 * later uncount them when reporting leaked space, and we must
 		 * only claim them once.
 		 *
 		 * To do this, we keep our own in-memory BRT. For each block
 		 * we haven't seen before, we look it up in the real BRT and
 		 * if it's there, we note it and its refcount then proceed as
 		 * normal. If we see the block again, we count it as a clone
 		 * and then give it no further consideration.
 		 */
 		zdb_brt_entry_t zbre_search, *zbre;
 		avl_index_t where;
 
 		zbre_search.zbre_dva = bp->blk_dva[0];
 		zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
 		if (zbre == NULL) {
 			/* Not seen before; track it */
 			uint64_t refcnt =
 			    brt_entry_get_refcount(zcb->zcb_spa, bp);
 			if (refcnt > 0) {
 				zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
 				    UMEM_NOFAIL);
 				zbre->zbre_dva = bp->blk_dva[0];
 				zbre->zbre_refcount = refcnt;
 				avl_insert(&zcb->zcb_brt, zbre, where);
 			}
 		} else  {
 			/*
 			 * Second or later occurrence, count it and take a
 			 * refcount.
 			 */
 			zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
 			zcb->zcb_clone_blocks++;
 
 			zbre->zbre_refcount--;
 			if (zbre->zbre_refcount == 0) {
 				avl_remove(&zcb->zcb_brt, zbre);
 				umem_free(zbre, sizeof (zdb_brt_entry_t));
 			}
 
 			/* Already claimed, don't do it again. */
 			do_claim = B_FALSE;
 		}
 	}
 
 skipped:
 	/*
 	 * Update four slots of zcb_type[][]: this block's level and the
 	 * all-levels total, each under both the given type and the
 	 * all-types total.
 	 */
 	for (i = 0; i < 4; i++) {
 		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
 		int t = (i & 1) ? type : ZDB_OT_TOTAL;
 		int equal;
 		zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
 
 		zb->zb_asize += BP_GET_ASIZE(bp);
 		zb->zb_lsize += BP_GET_LSIZE(bp);
 		zb->zb_psize += BP_GET_PSIZE(bp);
 		zb->zb_count++;
 
 		/*
 		 * The histogram is only big enough to record blocks up to
 		 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
 		 * "other", bucket.
 		 */
 		unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
 		idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
 		zb->zb_psize_histogram[idx]++;
 
 		zb->zb_gangs += BP_COUNT_GANG(bp);
 
 		switch (BP_GET_NDVAS(bp)) {
 		case 2:
 			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[1])) {
 				zb->zb_ditto_samevdev++;
 
 				if (same_metaslab(zcb->zcb_spa,
 				    DVA_GET_VDEV(&bp->blk_dva[0]),
 				    DVA_GET_OFFSET(&bp->blk_dva[0]),
 				    DVA_GET_OFFSET(&bp->blk_dva[1])))
 					zb->zb_ditto_same_ms++;
 			}
 			break;
 		case 3:
 			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[1])) +
 			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[2])) +
 			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[2]));
 			if (equal != 0) {
 				zb->zb_ditto_samevdev++;
 
 				if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 				    DVA_GET_VDEV(&bp->blk_dva[1]) &&
 				    same_metaslab(zcb->zcb_spa,
 				    DVA_GET_VDEV(&bp->blk_dva[0]),
 				    DVA_GET_OFFSET(&bp->blk_dva[0]),
 				    DVA_GET_OFFSET(&bp->blk_dva[1])))
 					zb->zb_ditto_same_ms++;
 				else if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 				    DVA_GET_VDEV(&bp->blk_dva[2]) &&
 				    same_metaslab(zcb->zcb_spa,
 				    DVA_GET_VDEV(&bp->blk_dva[0]),
 				    DVA_GET_OFFSET(&bp->blk_dva[0]),
 				    DVA_GET_OFFSET(&bp->blk_dva[2])))
 					zb->zb_ditto_same_ms++;
 				else if (DVA_GET_VDEV(&bp->blk_dva[1]) ==
 				    DVA_GET_VDEV(&bp->blk_dva[2]) &&
 				    same_metaslab(zcb->zcb_spa,
 				    DVA_GET_VDEV(&bp->blk_dva[1]),
 				    DVA_GET_OFFSET(&bp->blk_dva[1]),
 				    DVA_GET_OFFSET(&bp->blk_dva[2])))
 					zb->zb_ditto_same_ms++;
 			}
 			break;
 		}
 	}
 
 	spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG);
 
 	if (BP_IS_EMBEDDED(bp)) {
 		zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
 		zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
 		    [BPE_GET_PSIZE(bp)]++;
 		return;
 	}
 	/*
 	 * The binning histogram bins by powers of two up to
 	 * SPA_MAXBLOCKSIZE rather than creating bins for
 	 * every possible blocksize found in the pool.
 	 *
 	 * The arrays have SPA_MAX_FOR_16M entries, so a size of
 	 * 2^SPA_MAX_FOR_16M or more would index past their end and
 	 * silently corrupt the neighbouring counters. The allocated size
 	 * in particular may be larger than the maximum block size, so
 	 * every bin index is clamped to the last bin.
 	 */
 	const int maxbin = SPA_MAX_FOR_16M - 1;
 	int bin = highbit64(BP_GET_PSIZE(bp)) - 1;
 	bin = MIN(bin, maxbin);
 
 	zcb->zcb_psize_count[bin]++;
 	zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp);
 	zcb->zcb_psize_total += BP_GET_PSIZE(bp);
 
 	bin = highbit64(BP_GET_LSIZE(bp)) - 1;
 	bin = MIN(bin, maxbin);
 
 	zcb->zcb_lsize_count[bin]++;
 	zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp);
 	zcb->zcb_lsize_total += BP_GET_LSIZE(bp);
 
 	bin = highbit64(BP_GET_ASIZE(bp)) - 1;
 	bin = MIN(bin, maxbin);
 
 	zcb->zcb_asize_count[bin]++;
 	zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
 	zcb->zcb_asize_total += BP_GET_ASIZE(bp);
 
 	if (!do_claim)
 		return;
 
 	VERIFY0(zio_wait(zio_claim(NULL, zcb->zcb_spa,
 	    spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL,
 	    ZIO_FLAG_CANFAIL)));
 }
 
 /*
  * Completion callback for the reads issued by zdb_blkptr_cb().
  *
  * Under spa_scrub_lock it gives the block's physical size back to
  * spa_load_verify_bytes and wakes anyone waiting on spa_scrub_io_cv.  A
  * failed read that was not speculative is recorded in the zdb_cb_t
  * (zio->io_private) and reported on stdout.  The data buffer is freed in
  * every case.
  */
 static void
 zdb_blkptr_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	int ioerr = zio->io_error;
 	zdb_cb_t *zcb = zio->io_private;
 	zbookmark_phys_t *zb = &zio->io_bookmark;
 
 	mutex_enter(&spa->spa_scrub_lock);
 	spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
 	cv_broadcast(&spa->spa_scrub_io_cv);
 
 	if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
 		char blkbuf[BP_SPRINTF_LEN];
 
 		zcb->zcb_haderrors = 1;
 		zcb->zcb_errors[ioerr]++;
 
 		/* Only spell out the block pointer at -bb and above. */
 		if (dump_opt['b'] >= 2)
 			snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 		else
 			blkbuf[0] = '\0';
 
 		/*
 		 * zb_level is printed with the signed %lld conversion, so it
 		 * is cast to the matching signed type, as in zdb_blkptr_cb().
 		 */
 		(void) printf("zdb_blkptr_cb: "
 		    "Got error %d reading "
 		    "<%llu, %llu, %lld, %llx> %s -- skipping\n",
 		    ioerr,
 		    (u_longlong_t)zb->zb_objset,
 		    (u_longlong_t)zb->zb_object,
 		    (longlong_t)zb->zb_level,
 		    (u_longlong_t)zb->zb_blkid,
 		    blkbuf);
 	}
 	mutex_exit(&spa->spa_scrub_lock);
 
 	abd_free(zio->io_abd);
 }
 
 /*
  * Traversal callback invoked for each block pointer.
  *
  * Holes, redacted blocks and ZB_DNODE_LEVEL bookmarks are ignored.  Every
  * other block is counted through zdb_count_block() and, depending on the
  * -c level, read back asynchronously (completion is handled by
  * zdb_blkptr_done()).  Roughly once per hundred counted blocks a progress
  * line may be written to stderr.
  *
  * arg is the zdb_cb_t holding the traversal state.  Always returns 0.
  */
 static int
 zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	zdb_cb_t *zcb = arg;
 
 	if (zb->zb_level == ZB_DNODE_LEVEL)
 		return (0);
 
 	/* At -bbbbb, print every block pointer that has a birth txg. */
 	if (dump_opt['b'] >= 5 && BP_GET_LOGICAL_BIRTH(bp) > 0) {
 		char blkbuf[BP_SPRINTF_LEN];
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 		(void) printf("objset %llu object %llu "
 		    "level %lld offset 0x%llx %s\n",
 		    (u_longlong_t)zb->zb_objset,
 		    (u_longlong_t)zb->zb_object,
 		    (longlong_t)zb->zb_level,
 		    (u_longlong_t)blkid2offset(dnp, bp, zb),
 		    blkbuf);
 	}
 
 	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))
 		return (0);
 
 	const dmu_object_type_t type = BP_GET_TYPE(bp);
 
 	/* Object types flagged DMU_OT_NEWTYPE are lumped under "other". */
 	dmu_object_type_t count_as = type;
 	if (type & DMU_OT_NEWTYPE)
 		count_as = ZDB_OT_OTHER;
 	zdb_count_block(zcb, zilog, bp, count_as);
 
 	const boolean_t is_metadata =
 	    (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
 
 	/* -c reads metadata back, -cc reads everything; never embedded. */
 	if (!BP_IS_EMBEDDED(bp) &&
 	    (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
 		const size_t psize = BP_GET_PSIZE(bp);
 		abd_t *data = abd_alloc(psize, B_FALSE);
 		int zio_flags =
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
 
 		/* If it's an intent log block, failure is expected. */
 		if (zb->zb_level == ZB_ZIL_LEVEL)
 			zio_flags |= ZIO_FLAG_SPECULATIVE;
 
 		/* Wait until the in-flight bytes drop below the limit. */
 		mutex_enter(&spa->spa_scrub_lock);
 		while (spa->spa_load_verify_bytes > max_inflight_bytes)
 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 		spa->spa_load_verify_bytes += psize;
 		mutex_exit(&spa->spa_scrub_lock);
 
 		zio_nowait(zio_read(NULL, spa, bp, data, psize,
 		    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, zio_flags,
 		    zb));
 	}
 
 	zcb->zcb_readfails = 0;
 
 	/* only call gethrtime() every 100 blocks */
 	static int iters;
 	if (++iters <= 100)
 		return (0);
 	iters = 0;
 
 	/* No progress line at -bbbbb, nor more than once a second. */
 	if (dump_opt['b'] >= 5 ||
 	    gethrtime() <= zcb->zcb_lastprint + NANOSEC)
 		return (0);
 
 	uint64_t now = gethrtime();
 	char buf[10];
 	uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
 	uint64_t kb_per_sec =
 	    1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
 	uint64_t sec_remaining =
 	    (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (buf) >= NN_NUMBUF_SZ, "buf truncated");
 
 	zfs_nicebytes(bytes, buf, sizeof (buf));
 	(void) fprintf(stderr,
 	    "\r%5s completed (%4"PRIu64"MB/s) "
 	    "estimated time remaining: "
 	    "%"PRIu64"hr %02"PRIu64"min %02"PRIu64"sec        ",
 	    buf, kb_per_sec / 1024,
 	    sec_remaining / 60 / 60,
 	    sec_remaining / 60 % 60,
 	    sec_remaining % 60);
 
 	zcb->zcb_lastprint = now;
 
 	return (0);
 }
 
 static void
 zdb_leak(void *arg, uint64_t start, uint64_t size)
 {
 	vdev_t *vd = arg;
 
 	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
 }
 
 /*
  * Metaslab ops table with no allocator callback.
  * NOTE(review): presumably installed because zdb never allocates space;
  * confirm against the code that assigns it.
  */
 static metaslab_ops_t zdb_metaslab_ops = {
 	NULL	/* alloc */
 };
 
 /*
  * Space map log callback that replays unflushed entries into the allocated
  * segment tree of an in-progress vdev removal (arg is the
  * spa_vdev_removal_t).  Entries for other vdevs, and entries older than the
  * metaslab's unflushed txg, are ignored.  Always returns 0.
  */
 static int
 load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme,
     uint64_t txg, void *arg)
 {
 	spa_vdev_removal_t *svr = arg;
 
 	/* skip vdevs we don't care about */
 	if (sme->sme_vdev != svr->svr_vdev_id)
 		return (0);
 
 	const uint64_t seg_start = sme->sme_offset;
 	const uint64_t seg_len = sme->sme_run;
 
 	vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev);
 	metaslab_t *ms = vd->vdev_ms[seg_start >> vd->vdev_ms_shift];
 	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
 
 	if (txg < metaslab_unflushed_txg(ms))
 		return (0);
 
 	if (sme->sme_type != SM_ALLOC) {
 		zfs_range_tree_remove(svr->svr_allocd_segs, seg_start,
 		    seg_len);
 	} else {
 		zfs_range_tree_add(svr->svr_allocd_segs, seg_start, seg_len);
 	}
 
 	return (0);
 }
 
 /*
  * Remap callback that claims the segment [offset, offset + size) on vd.
  * inner_offset and arg are unused.
  */
 static void
 claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	(void) inner_offset;
 	(void) arg;
 
 	/*
 	 * We only get here through a remap from a device that is being
 	 * removed, so the vdev we are handed is expected to be a concrete
 	 * one.
 	 */
 	ASSERT(vdev_is_concrete(vd));
 
 	VERIFY0(metaslab_claim_impl(vd, offset, size,
 	    spa_min_claim_txg(vd->vdev_spa)));
 }
 
 /*
  * Range callback: push one segment of the vdev passed as arg through the
  * indirect vdev remap operation, so that claim_segment_impl_cb() runs on
  * each piece the remap produces.
  */
 static void
 claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
 {
 	vdev_t *removing_vd = arg;
 
 	vdev_indirect_ops.vdev_op_remap(removing_vd, offset, size,
 	    claim_segment_impl_cb, NULL);
 }
 
 /*
  * Once every directly referenced block has been accounted for, blocks that
  * are only referenced through a partially complete (and therefore unused)
  * indirect mapping object may still be unclaimed.  Make a second pass over
  * the metaslabs of the vdev being removed and claim the destination blocks
  * of what has been mapped so far.
  *
  * Does nothing with -L or when no removal is in progress.
  */
 static void
 zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
 {
 	if (dump_opt['L'] || spa->spa_vdev_removal == NULL)
 		return;
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
 	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 
 	ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs));
 
 	/* Gather the allocations of every metaslab into svr_allocd_segs. */
 	zfs_range_tree_t *allocs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
 	    NULL, 0, 0);
 	uint64_t msi = 0;
 	while (msi < vd->vdev_ms_count) {
 		metaslab_t *msp = vd->vdev_ms[msi];
 
 		ASSERT0(zfs_range_tree_space(allocs));
 		if (msp->ms_sm != NULL)
 			VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC));
 		zfs_range_tree_vacate(allocs, zfs_range_tree_add,
 		    svr->svr_allocd_segs);
 		msi++;
 	}
 	zfs_range_tree_destroy(allocs);
 
 	/* Apply whatever is still sitting in the space map logs. */
 	iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr);
 
 	/*
 	 * Drop everything beyond what has been synced: no mappings have
 	 * been allocated for that part yet.
 	 */
 	zfs_range_tree_clear(svr->svr_allocd_segs,
 	    vdev_indirect_mapping_max_offset(vim),
 	    vd->vdev_asize - vdev_indirect_mapping_max_offset(vim));
 
 	/* Record how much is being claimed, then claim it. */
 	zcb->zcb_removing_size += zfs_range_tree_space(svr->svr_allocd_segs);
 	zfs_range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd);
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
 /*
  * Block pointer callback (arg is the zdb_cb_t) that adds the block's first
  * DVA to the obsolete counts kept for the vdev the DVA lives on.  The
  * block is expected to be live, to have exactly one DVA and to sit on a
  * vdev with an indirect mapping.  tx is unused.  Always returns 0.
  */
 static int
 increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	(void) tx;
 	zdb_cb_t *zcb = arg;
 	const dva_t *first_dva = &bp->blk_dva[0];
 	vdev_t *vd;
 
 	ASSERT(!bp_freed);
 	ASSERT(!dump_opt['L']);
 	ASSERT3U(BP_GET_NDVAS(bp), ==, 1);
 
 	/* The vdev lookup itself needs the vdev config lock. */
 	spa_config_enter(zcb->zcb_spa, SCL_VDEV, FTAG, RW_READER);
 	vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(first_dva));
 	ASSERT3P(vd, !=, NULL);
 	spa_config_exit(zcb->zcb_spa, SCL_VDEV, FTAG);
 
 	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
 	ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL);
 
 	uint32_t *vd_counts = zcb->zcb_vd_obsolete_counts[vd->vdev_id];
 	vdev_indirect_mapping_increment_obsolete_count(
 	    vd->vdev_indirect_mapping,
 	    DVA_GET_OFFSET(first_dva), DVA_GET_ASIZE(first_dva),
 	    vd_counts);
 
 	return (0);
 }
 
 /*
  * Build the obsolete counts array for an indirect vdev.  It starts from the
  * counts stored with the indirect mapping, then adds the vdev's obsolete
  * space map if there is one, and finally the previous obsolete space map
  * recorded in the pool's condensing state when that state refers to this
  * vdev.
  *
  * Returns the array produced by
  * vdev_indirect_mapping_load_obsolete_counts().
  */
 static uint32_t *
 zdb_load_obsolete_counts(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_indirect_mapping_t *mapping = vd->vdev_indirect_mapping;
 	spa_condensing_indirect_phys_t *scip =
 	    &spa->spa_condensing_indirect_phys;
 	uint64_t obsolete_sm_object;
 
 	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
 	EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL);
 
 	uint32_t *obsolete_counts =
 	    vdev_indirect_mapping_load_obsolete_counts(mapping);
 
 	if (vd->vdev_obsolete_sm != NULL) {
 		vdev_indirect_mapping_load_obsolete_spacemap(mapping,
 		    obsolete_counts, vd->vdev_obsolete_sm);
 	}
 
 	/* Is a condense of this very vdev recorded, with an older map? */
 	const boolean_t has_prev_sm = (scip->scip_vdev == vd->vdev_id &&
 	    scip->scip_prev_obsolete_sm_object != 0);
 
 	if (has_prev_sm) {
 		space_map_t *prev_obsolete_sm = NULL;
 
 		VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
 		    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
 		vdev_indirect_mapping_load_obsolete_spacemap(mapping,
 		    obsolete_counts, prev_obsolete_sm);
 		space_map_close(prev_obsolete_sm);
 	}
 
 	return (obsolete_counts);
 }
 
 /*
  * Argument handed to checkpoint_sm_exclude_entry_cb() while iterating over
  * a vdev's checkpoint space map.
  */
 typedef struct checkpoint_sm_exclude_entry_arg {
 	vdev_t *cseea_vd;		/* vdev that owns the space map */
 	uint64_t cseea_checkpoint_size;	/* sum of the entry lengths seen */
 } checkpoint_sm_exclude_entry_arg_t;
 
 /*
  * Space map iteration callback for a vdev's checkpoint space map.  Each
  * entry is removed from the ms_allocatable tree of the metaslab it falls
  * in, and its length is added to the running checkpoint size kept in arg
  * (a checkpoint_sm_exclude_entry_arg_t).  Always returns 0.
  */
 static int
 checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
 {
 	checkpoint_sm_exclude_entry_arg_t *cseea = arg;
 	vdev_t *vd = cseea->cseea_vd;
 	const uint64_t ms_index = sme->sme_offset >> vd->vdev_ms_shift;
 	metaslab_t *ms = vd->vdev_ms[ms_index];
 	uint64_t end = sme->sme_offset + sme->sme_run;
 
 	ASSERT(sme->sme_type == SM_FREE);
 
 	/*
 	 * vdev_checkpoint_sm lives at the vdev level while the ms_sm space
 	 * maps live at the metaslab level, so in theory a checkpoint entry
 	 * could straddle a metaslab boundary.
 	 *
 	 * With the way the checkpoint's space maps are currently populated
 	 * and manipulated that should never happen, which is what the two
 	 * checks below enforce.
 	 *
 	 * Nothing fundamental forbids such entries though, so support for
 	 * metaslab-crossing segments could be added here if it is ever
 	 * needed.
 	 */
 	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
 	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
 
 	/*
 	 * Removing the entry from the allocated segments doubles as a check
 	 * that it was there in the first place.
 	 */
 	mutex_enter(&ms->ms_lock);
 	zfs_range_tree_remove(ms->ms_allocatable, sme->sme_offset,
 	    sme->sme_run);
 	mutex_exit(&ms->ms_lock);
 
 	cseea->cseea_checkpoint_size += sme->sme_run;
 
 	return (0);
 }
 
 /*
  * Exclude the space recorded in one top-level vdev's checkpoint space map
  * from that vdev's ms_allocatable trees, and add the amount excluded to
  * zcb->zcb_checkpoint_size.  Returns without doing anything when the vdev
  * has no top-level ZAP or no checkpoint space map entry in it.
  */
 static void
 zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	/*
 	 * A vdev without a vdev_top_zap belongs to a pool whose version
 	 * predates the pool checkpoint feature.
 	 */
 	if (vd->vdev_top_zap == 0)
 		return;
 
 	/*
 	 * When the vdev_top_zap holds no reference to a
 	 * vdev_checkpoint_sm, one of the following is true:
 	 *
 	 * 1] There is no checkpoint
 	 * 2] There is a checkpoint, but no checkpointed blocks
 	 *    have been freed yet
 	 * 3] The current vdev is indirect
 	 *
 	 * Either way there is nothing to exclude.
 	 */
 	if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
 		return;
 
 	uint64_t checkpoint_sm_obj;
 	VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
 	    &checkpoint_sm_obj));
 
 	checkpoint_sm_exclude_entry_arg_t cseea = {
 		.cseea_vd = vd,
 		.cseea_checkpoint_size = 0,
 	};
 
 	/* Walk every entry of the checkpoint space map. */
 	space_map_t *checkpoint_sm = NULL;
 	VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
 	    checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
 
 	VERIFY0(space_map_iterate(checkpoint_sm,
 	    space_map_length(checkpoint_sm),
 	    checkpoint_sm_exclude_entry_cb, &cseea));
 	space_map_close(checkpoint_sm);
 
 	zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
 }
 
 /*
  * Run zdb_leak_init_vdev_exclude_checkpoint() on every child of the root
  * vdev.  Must not be called with -L.
  */
 static void
 zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	ASSERT(!dump_opt['L']);
 
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *top = rvd->vdev_child[c];
 
 		ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
 		zdb_leak_init_vdev_exclude_checkpoint(top, zcb);
 	}
 }
 
 /*
  * Space map log callback that keeps a signed running total (the int64_t
  * that arg points to) of unflushed space: allocations add their length,
  * frees subtract it.  Entries on non-concrete vdevs, and entries older
  * than the metaslab's unflushed txg, are skipped.  Always returns 0.
  */
 static int
 count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme,
     uint64_t txg, void *arg)
 {
 	int64_t *ualloc_space = arg;
 	vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev);
 
 	if (!vdev_is_concrete(vd))
 		return (0);
 
 	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
 	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
 
 	if (txg < metaslab_unflushed_txg(ms))
 		return (0);
 
 	if (sme->sme_type != SM_ALLOC)
 		*ualloc_space -= sme->sme_run;
 	else
 		*ualloc_space += sme->sme_run;
 
 	return (0);
 }
 
 /*
  * Return the net amount of space (allocations minus frees) found by
  * count_unflushed_space_cb() in the space map logs, or 0 when -L was given.
  */
 static int64_t
 get_unflushed_alloc_space(spa_t *spa)
 {
 	int64_t net_unflushed = 0;
 
 	if (!dump_opt['L']) {
 		iterate_through_spacemap_logs(spa, count_unflushed_space_cb,
 		    &net_unflushed);
 	}
 
 	return (net_unflushed);
 }
 
 /*
  * Space map log callback that replays one unflushed entry into the
  * ms_allocatable tree of the metaslab it belongs to.  arg points to the
  * maptype_t the trees were loaded with: entries of that type are added,
  * entries of the other type are removed.  Entries on non-concrete vdevs,
  * and entries older than the metaslab's unflushed txg, are skipped.
  * Always returns 0.
  */
 static int
 load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg)
 {
 	maptype_t *uic_maptype = arg;
 	vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev);
 
 	/* skip indirect vdevs */
 	if (!vdev_is_concrete(vd))
 		return (0);
 
 	const uint64_t seg_start = sme->sme_offset;
 	const uint64_t seg_len = sme->sme_run;
 	metaslab_t *ms = vd->vdev_ms[seg_start >> vd->vdev_ms_shift];
 
 	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
 	ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE);
 
 	if (txg < metaslab_unflushed_txg(ms))
 		return (0);
 
 	if (*uic_maptype != sme->sme_type)
 		zfs_range_tree_remove(ms->ms_allocatable, seg_start, seg_len);
 	else
 		zfs_range_tree_add(ms->ms_allocatable, seg_start, seg_len);
 
 	return (0);
 }
 
 /*
  * Replay the space map logs into the ms_allocatable trees, treating
  * entries of type maptype as additions (see load_unflushed_cb()).
  */
 static void
 load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype)
 {
 	maptype_t *wanted = &maptype;
 
 	iterate_through_spacemap_logs(spa, load_unflushed_cb, wanted);
 }
 
 /*
  * For every top-level vdev that is not an indirect vdev, empty each
  * metaslab's ms_allocatable tree and reload it from the metaslab's space
  * map with entries of the given maptype, then replay the unflushed space
  * map log entries on top.  Progress is reported on stderr.
  */
 static void
 load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
 		vdev_t *vd = rvd->vdev_child[i];
 
 		ASSERT3U(i, ==, vd->vdev_id);
 
 		if (vd->vdev_ops == &vdev_indirect_ops)
 			continue;
 
 		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 			metaslab_t *msp = vd->vdev_ms[m];
 
 			/*
 			 * The values are printed with the unsigned %llu
 			 * conversion, so cast them to the matching unsigned
 			 * type.
 			 */
 			(void) fprintf(stderr,
 			    "\rloading concrete vdev %llu, "
 			    "metaslab %llu of %llu ...",
 			    (u_longlong_t)vd->vdev_id,
 			    (u_longlong_t)msp->ms_id,
 			    (u_longlong_t)vd->vdev_ms_count);
 
 			mutex_enter(&msp->ms_lock);
 			zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL);
 
 			/*
 			 * We don't want to spend the CPU manipulating the
 			 * size-ordered tree, so clear the range_tree ops.
 			 */
 			msp->ms_allocatable->rt_ops = NULL;
 
 			if (msp->ms_sm != NULL) {
 				VERIFY0(space_map_load(msp->ms_sm,
 				    msp->ms_allocatable, maptype));
 			}
 			if (!msp->ms_loaded)
 				msp->ms_loaded = B_TRUE;
 			mutex_exit(&msp->ms_lock);
 		}
 	}
 
 	load_unflushed_to_ms_allocatables(spa, maptype);
 }
 
 /*
  * Rebuild msp's ms_allocatable tree from the indirect mapping of vd.
  *
  * vim_idxp is an in-out parameter: on entry it is the index in
  * vim_entries of the first mapping entry that belongs to this metaslab,
  * and on return it is the index of the first entry after this metaslab.
  */
 static void
 load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
     uint64_t *vim_idxp)
 {
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 
 	mutex_enter(&msp->ms_lock);
 	zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL);
 
 	/*
 	 * Clearing the range tree ops saves the CPU time that would
 	 * otherwise go into maintaining the size-ordered tree.
 	 */
 	msp->ms_allocatable->rt_ops = NULL;
 
 	const uint64_t ms_end = msp->ms_start + msp->ms_size;
 
 	while (*vim_idxp < vdev_indirect_mapping_num_entries(vim)) {
 		vdev_indirect_mapping_entry_phys_t *ent =
 		    &vim->vim_entries[*vim_idxp];
 		uint64_t src = DVA_MAPPING_GET_SRC_OFFSET(ent);
 		uint64_t asize = DVA_GET_ASIZE(&ent->vimep_dst);
 
 		ASSERT3U(src, >=, msp->ms_start);
 		if (src >= ms_end)
 			break;
 
 		/*
 		 * The mapping is created by walking the metaslabs, so an
 		 * entry never spans a metaslab boundary.
 		 */
 		ASSERT3U(src + asize, <=, ms_end);
 		zfs_range_tree_add(msp->ms_allocatable, src, asize);
 		(*vim_idxp)++;
 	}
 
 	msp->ms_loaded = B_TRUE;
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * For every indirect top-level vdev: load its obsolete counts into
  * zcb_vd_obsolete_counts, create metaslabs for it, and fill each
  * metaslab's ms_allocatable tree from the vdev's indirect mapping.
  * Must not be called when leak detection is disabled (-L).
  */
 static void
 zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
 {
 	vdev_t *root = spa->spa_root_vdev;
 
 	ASSERT(!dump_opt['L']);
 
 	for (uint64_t id = 0; id < root->vdev_children; id++) {
 		vdev_t *ivd = root->vdev_child[id];
 
 		ASSERT3U(id, ==, ivd->vdev_id);
 
 		if (ivd->vdev_ops != &vdev_indirect_ops)
 			continue;
 
 		/*
 		 * Note: mapping leaks are not checked on removing
 		 * vdevs, since their ms_allocatable's are used to
 		 * look for leaks in allocated space.
 		 */
 		zcb->zcb_vd_obsolete_counts[id] =
 		    zdb_load_obsolete_counts(ivd);
 
 		/*
 		 * Indirect vdevs normally have no metaslabs; create
 		 * them here so that zio_claim() can work on them.
 		 */
 		vdev_metaslab_group_create(ivd);
 		VERIFY0(vdev_metaslab_init(ivd, 0));
 
 		vdev_indirect_mapping_t *vim __maybe_unused =
 		    ivd->vdev_indirect_mapping;
 		uint64_t next_entry = 0;
 		for (uint64_t m = 0; m < ivd->vdev_ms_count; m++) {
 			metaslab_t *ms = ivd->vdev_ms[m];
 
 			(void) fprintf(stderr,
 			    "\rloading indirect vdev %llu, "
 			    "metaslab %llu of %llu ...",
 			    (longlong_t)ivd->vdev_id,
 			    (longlong_t)ms->ms_id,
 			    (longlong_t)ivd->vdev_ms_count);
 
 			load_indirect_ms_allocatable_tree(ivd, ms,
 			    &next_entry);
 		}
 		ASSERT3U(next_entry, ==,
 		    vdev_indirect_mapping_num_entries(vim));
 	}
 }
 
 /*
  * Set up leak detection.  zcb_spa is always recorded; everything else is
  * skipped when leak detection is disabled (-L).  Otherwise the metaslab
  * ms_allocatable trees are repurposed to hold allocated segments, the
  * checkpointed space is excluded from them, and the obsolete bpobj (if
  * open) is walked with increment_indirect_mapping_cb.
  */
 static void
 zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 {
 	zcb->zcb_spa = spa;
 
 	/* -L: no trees need to be prepared */
 	if (dump_opt['L'])
 		return;
 
 	vdev_t *root = spa->spa_root_vdev;
 	dsl_pool_t *pool = spa->spa_dsl_pool;
 
 	/*
 	 * The meaning of each metaslab's ms_allocatable is about to
 	 * change, so switch these classes to zdb's own metaslab ops to
 	 * keep the allocator away from the trees.
 	 */
 	spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
 	spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
 	spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops;
 
 	/* one obsolete-counts slot per top-level vdev, all initially NULL */
 	zcb->zcb_vd_obsolete_counts = umem_zalloc(
 	    root->vdev_children * sizeof (uint32_t *), UMEM_NOFAIL);
 
 	/*
 	 * Leak detection overloads the ms_allocatable trees so that
 	 * they hold allocated segments rather than free ones, which
 	 * rules out the regular metaslab_load/unload interfaces.
 	 */
 	zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
 	load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
 
 	/*
 	 * load_concrete_ms_allocatable_trees() put every allocated
 	 * entry of each metaslab's ms_sm into its ms_allocatable.  With
 	 * a checkpoint present (or being discarded), some of those
 	 * blocks may already be freed while the ms_sm still lists them
 	 * because the checkpoint references them.  To keep those from
 	 * showing up as false leaks, every entry of each vdev's
 	 * checkpoint space map is excluded from the matching
 	 * ms_allocatable.
 	 *
 	 * The space held by the checkpoint is summed into
 	 * zcb_checkpoint_size along the way.
 	 *
 	 * This step also verifies that each checkpoint_sm entry is
 	 * marked allocated in the ms_sm of its metaslab.
 	 * [see comment in checkpoint_sm_exclude_entry_cb()]
 	 */
 	zdb_leak_init_exclude_checkpoint(spa, zcb);
 	ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));
 
 	/* terminate the "\r"-style progress line */
 	(void) fprintf(stderr, "\n");
 
 	if (!bpobj_is_open(&pool->dp_obsolete_bpobj))
 		return;
 
 	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL));
 	(void) bpobj_iterate_nofree(&pool->dp_obsolete_bpobj,
 	    increment_indirect_mapping_cb, zcb, NULL);
 }
 
 /*
  * Check the obsolete counts of an indirect vdev.
  *
  * For every entry of vd's indirect mapping, the number of bytes of the
  * entry still present in the (overloaded) ms_allocatable tree is compared
  * with the obsolete count loaded for that entry in
  * zcb_vd_obsolete_counts.  Per-entry mismatches are printed when the
  * counts are precise or when -d is given at least five times.
  *
  * Returns B_TRUE only when a mismatch was found and the vdev's obsolete
  * counts are precise; with imprecise counts a mismatch is reported but is
  * not treated as a leak.  The vdev's obsolete counts array is freed and
  * its slot in zcb_vd_obsolete_counts is reset to NULL before returning.
  */
 static boolean_t
 zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
 {
 	boolean_t leaks = B_FALSE;
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 	uint64_t total_leaked = 0;
 	boolean_t are_precise = B_FALSE;
 
 	ASSERT(vim != NULL);
 
 	/*
 	 * Whether the counts are precise does not depend on the mapping
 	 * entry, so look it up once instead of once per entry.
 	 */
 	VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
 
 	for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
 		vdev_indirect_mapping_entry_phys_t *vimep =
 		    &vim->vim_entries[i];
 		uint64_t obsolete_bytes = 0;
 		uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
 		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 		/*
 		 * This is not very efficient but it's easy to
 		 * verify correctness: probe the tree one
 		 * (1 << ashift)-sized chunk at a time.
 		 */
 		for (uint64_t inner_offset = 0;
 		    inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
 		    inner_offset += 1ULL << vd->vdev_ashift) {
 			if (zfs_range_tree_contains(msp->ms_allocatable,
 			    offset + inner_offset, 1ULL << vd->vdev_ashift)) {
 				obsolete_bytes += 1ULL << vd->vdev_ashift;
 			}
 		}
 
 		int64_t bytes_leaked = obsolete_bytes -
 		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i];
 		ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=,
 		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]);
 
 		if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) {
 			(void) printf("obsolete indirect mapping count "
 			    "mismatch on %llu:%llx:%llx : %llx bytes leaked\n",
 			    (u_longlong_t)vd->vdev_id,
 			    (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
 			    (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
 			    (u_longlong_t)bytes_leaked);
 		}
 		/* mismatches in either direction count towards the total */
 		total_leaked += ABS(bytes_leaked);
 	}
 
 	if (!are_precise && total_leaked > 0) {
 		int pct_leaked = total_leaked * 100 /
 		    vdev_indirect_mapping_bytes_mapped(vim);
 		(void) printf("cannot verify obsolete indirect mapping "
 		    "counts of vdev %llu because precise feature was not "
 		    "enabled when it was removed: %d%% (%llx bytes) of mapping"
 		    " unreferenced\n",
 		    (u_longlong_t)vd->vdev_id, pct_leaked,
 		    (u_longlong_t)total_leaked);
 	} else if (total_leaked > 0) {
 		(void) printf("obsolete indirect mapping count mismatch "
 		    "for vdev %llu -- %llx total bytes mismatched\n",
 		    (u_longlong_t)vd->vdev_id,
 		    (u_longlong_t)total_leaked);
 		leaks = B_TRUE;
 	}
 
 	vdev_indirect_mapping_free_obsolete_counts(vim,
 	    zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
 	zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL;
 
 	return (leaks);
 }
 
 /*
  * Tear down leak detection after the traversal.  Every top-level vdev
  * with loaded obsolete counts is checked by
  * zdb_check_for_obsolete_leaks(), every metaslab's ms_allocatable tree is
  * emptied (handing leftover segments on concrete vdevs to zdb_leak), and
  * the zcb_vd_obsolete_counts array is freed.  Returns B_TRUE if any
  * obsolete-count check reported a leak; always B_FALSE when leak
  * detection is disabled (-L).
  */
 static boolean_t
 zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
 {
 	boolean_t found = B_FALSE;
 
 	if (dump_opt['L'])
 		return (B_FALSE);
 
 	vdev_t *root = spa->spa_root_vdev;
 
 	for (unsigned c = 0; c < root->vdev_children; c++) {
 		vdev_t *tvd = root->vdev_child[c];
 
 		if (zcb->zcb_vd_obsolete_counts[c] != NULL)
 			found |= zdb_check_for_obsolete_leaks(tvd, zcb);
 
 		for (uint64_t idx = 0; idx < tvd->vdev_ms_count; idx++) {
 			metaslab_t *ms = tvd->vdev_ms[idx];
 
 			ASSERT3P(ms->ms_group, ==, (ms->ms_group->mg_class ==
 			    spa_embedded_log_class(spa)) ?
 			    tvd->vdev_log_mg : tvd->vdev_mg);
 
 			/*
 			 * ms_allocatable was overloaded to hold
 			 * allocated segments, and claimed blocks were
 			 * taken out of it during the traversal.  What
 			 * is left now that the traversal is finished
 			 * is allocated space nobody claimed, which is
 			 * reported through zdb_leak.  On indirect
 			 * vdevs the leftovers are merely unreferenced
 			 * parts of the mapping, which is not a bug,
 			 * so they are dropped silently.
 			 */
 			if (tvd->vdev_ops != &vdev_indirect_ops) {
 				zfs_range_tree_vacate(ms->ms_allocatable,
 				    zdb_leak, tvd);
 			} else {
 				zfs_range_tree_vacate(ms->ms_allocatable,
 				    NULL, NULL);
 			}
 			ms->ms_loaded = B_FALSE;
 		}
 	}
 
 	umem_free(zcb->zcb_vd_obsolete_counts,
 	    root->vdev_children * sizeof (uint32_t *));
 	zcb->zcb_vd_obsolete_counts = NULL;
 
 	return (found);
 }
 
 /*
  * Block-pointer callback: count bp as a ZDB_OT_DEFERRED block in the
  * zdb_cb_t passed as arg, printing it first when -b is given at least
  * five times.  tx is unused.  Always returns 0.
  */
 static int
 count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	zdb_cb_t *zcb = arg;
 
 	(void) tx;
 
 	if (dump_opt['b'] >= 5) {
 		char desc[BP_SPRINTF_LEN];
 
 		snprintf_blkptr(desc, sizeof (desc), bp);
 		(void) printf("[%s] %s\n", "deferred free", desc);
 	}
 
 	zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
 
 	return (0);
 }
 
 /*
  * Iterate over livelists which have been destroyed by the user but
  * are still present in the MOS, waiting to be freed.
  *
  * func is called with arg for every deadlist referenced from the
  * DMU_POOL_DELETED_CLONES ZAP.  Nothing is done when the pool directory
  * has no such entry.  Any other lookup failure is reported on stderr
  * (and trips an assertion in debug builds) and no livelists are visited.
  */
 static void
 iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t zap_obj;
 	int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
 	if (err == ENOENT)
 		return;
 	if (err != 0) {
 		/*
 		 * zap_obj was not filled in; iterating from it would
 		 * read an uninitialized object number when assertions
 		 * are compiled out.
 		 */
 		(void) fprintf(stderr, "failed to look up %s: error %d\n",
 		    DMU_POOL_DELETED_CLONES, err);
 		ASSERT0(err);
 		return;
 	}
 
 	zap_cursor_t zc;
 	zap_attribute_t *attrp = zap_attribute_alloc();
 	dsl_deadlist_t ll;
 	/* NULL out os prior to dsl_deadlist_open in case it's garbage */
 	ll.dl_os = NULL;
 	for (zap_cursor_init(&zc, mos, zap_obj);
 	    zap_cursor_retrieve(&zc, attrp) == 0;
 	    (void) zap_cursor_advance(&zc)) {
 		/* each ZAP entry's first integer is a deadlist object */
 		VERIFY0(dsl_deadlist_open(&ll, mos, attrp->za_first_integer));
 		func(&ll, arg);
 		dsl_deadlist_close(&ll);
 	}
 	zap_cursor_fini(&zc);
 	zap_attribute_free(attrp);
 }
 
 /*
  * bpobj iteration callback: forward bp to count_block_cb().  The bpobjs
  * this is used on are not expected to contain freed entries.
  */
 static int
 bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	int rc;
 
 	ASSERT(!bp_freed);
 
 	rc = count_block_cb(arg, bp, tx);
 	return (rc);
 }
 
 /*
  * Deadlist-entry callback: count the blocks of one sub-livelist.  args is
  * the zdb_cb_t the blocks are counted into.  Always returns 0.
  */
 static int
 livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle)
 {
 	zdb_cb_t *zcb = args;
 	bplist_t live;
 
 	bplist_create(&live);
 
 	/* collect the blocks that were alloc'd but not freed ... */
 	VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &live, NULL, NULL));
 
 	/* ... and count each of them */
 	(void) bplist_iterate(&live, count_block_cb, zcb, NULL);
 
 	bplist_destroy(&live);
 
 	return (0);
 }
 
 /*
  * ll_iter_t callback: run livelist_entry_count_blocks_cb() on every entry
  * of the deadlist ll, passing arg through to it.
  */
 static void
 livelist_count_blocks(dsl_deadlist_t *ll, void *arg)
 {
 	void *zcb = arg;
 
 	dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, zcb);
 }
 
 /*
  * Count, into zbc, the blocks of every livelist that the user has
  * destroyed but that has not been freed yet.
  */
 static void
 deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc)
 {
 	void *cb_arg = zbc;
 
 	iterate_deleted_livelists(spa, livelist_count_blocks, cb_arg);
 }
 
 /*
  * ll_iter_t callback used when dumping the MOS: bump the livelist
  * feature count, print the deadlist as a "Deleted Livelist", and run
  * sublivelist_verify_lightweight over its entries.  arg must be NULL.
  */
 static void
 dump_livelist_cb(dsl_deadlist_t *ll, void *arg)
 {
 	ASSERT3P(arg, ==, NULL);
 
 	global_feature_count[SPA_FEATURE_LIVELIST] += 1;
 
 	dump_blkptr_list(ll, "Deleted Livelist");
 	dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL);
 }
 
 /*
  * Print out, register object references to, and increment feature counts for
  * livelists that have been destroyed by the user but haven't yet been freed.
  *
  * Nothing is done when the pool directory has no DMU_POOL_DELETED_CLONES
  * entry.
  */
 static void
 deleted_livelists_dump_mos(spa_t *spa)
 {
 	uint64_t zap_obj;
 	objset_t *mos = spa->spa_meta_objset;
 	int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
 	if (err == ENOENT)
 		return;
 
 	/*
 	 * zap_obj is only valid when the lookup succeeded.  For any other
 	 * error leave the handling to iterate_deleted_livelists(), which
 	 * repeats the lookup itself.
 	 */
 	if (err == 0)
 		mos_obj_refd(zap_obj);
 	iterate_deleted_livelists(spa, dump_livelist_cb, NULL);
 }
 
 static int
 zdb_brt_entry_compare(const void *zcn1, const void *zcn2)
 {
 	const dva_t *dva1 = &((const zdb_brt_entry_t *)zcn1)->zbre_dva;
 	const dva_t *dva2 = &((const zdb_brt_entry_t *)zcn2)->zbre_dva;
 	int cmp;
 
 	cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
 	if (cmp == 0)
 		cmp = TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2));
 
 	return (cmp);
 }
 
 /*
  * Traverse every block in the pool, accumulate per-type and per-level
  * block statistics and print them.  Unless leak detection is disabled
  * (-L), also check that the blocks found add up to the space reported as
  * allocated by the metaslab classes.
  *
  * Returns 0 on success, 2 if zdb_leak_fini() reported leaks or if no
  * blocks were counted at all, and 3 if errors were recorded during the
  * traversal.
  */
 static int
 dump_block_stats(spa_t *spa)
 {
 	zdb_cb_t *zcb;
 	zdb_blkstats_t *zb, *tzb;
 	uint64_t norm_alloc, norm_space, total_alloc, total_found;
 	int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
 	    TRAVERSE_NO_DECRYPT | TRAVERSE_HARD;
 	boolean_t leaks = B_FALSE;
 	int e, c, err;
 	bp_embedded_type_t i;
 	int ret = 0;
 
 	ddt_prefetch_all(spa);
 
 	zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL);
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
 		avl_create(&zcb->zcb_brt, zdb_brt_entry_compare,
 		    sizeof (zdb_brt_entry_t),
 		    offsetof(zdb_brt_entry_t, zbre_node));
 		zcb->zcb_brt_is_active = B_TRUE;
 	}
 
 	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
 	    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
 	    (dump_opt['c'] == 1) ? "metadata " : "",
 	    dump_opt['c'] ? "checksums " : "",
 	    (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
 	    !dump_opt['L'] ? "nothing leaked " : "");
 
 	/*
 	 * When leak detection is enabled we load all space maps as SM_ALLOC
 	 * maps, then traverse the pool claiming each block we discover. If
 	 * the pool is perfectly consistent, the segment trees will be empty
 	 * when we're done. Anything left over is a leak; any block we can't
 	 * claim (because it's not part of any space map) is a double
 	 * allocation, reference to a freed block, or an unclaimed log block.
 	 *
 	 * When leak detection is disabled (-L option) we still traverse the
 	 * pool claiming each block we discover, but we skip opening any space
 	 * maps.
 	 */
 	zdb_leak_init(spa, zcb);
 
 	/*
 	 * If there's a deferred-free bplist, process that first.
 	 */
 	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
 	    bpobj_count_block_cb, zcb, NULL);
 
 	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 		(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
 		    bpobj_count_block_cb, zcb, NULL);
 	}
 
 	zdb_claim_removing(spa, zcb);
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
 		VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
 		    spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
 		    zcb, NULL));
 	}
 
 	deleted_livelists_count_blocks(spa, zcb);
 
 	if (dump_opt['c'] > 1)
 		flags |= TRAVERSE_PREFETCH_DATA;
 
 	zcb->zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
 	zcb->zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa));
 	zcb->zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));
 	zcb->zcb_totalasize +=
 	    metaslab_class_get_alloc(spa_embedded_log_class(spa));
 	zcb->zcb_start = zcb->zcb_lastprint = gethrtime();
 	err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, zcb);
 
 	/*
 	 * If we've traversed the data blocks then we need to wait for those
 	 * I/Os to complete. We leverage "The Godfather" zio to wait on
 	 * all async I/Os to complete.
 	 */
 	if (dump_opt['c']) {
 		for (c = 0; c < max_ncpus; c++) {
 			(void) zio_wait(spa->spa_async_zio_root[c]);
 			spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 			    ZIO_FLAG_GODFATHER);
 		}
 	}
 	ASSERT0(spa->spa_load_verify_bytes);
 
 	/*
 	 * Done after zio_wait() since zcb_haderrors is modified in
 	 * zdb_blkptr_done()
 	 */
 	zcb->zcb_haderrors |= err;
 
 	if (zcb->zcb_haderrors) {
 		(void) printf("\nError counts:\n\n");
 		(void) printf("\t%5s  %s\n", "errno", "count");
 		for (e = 0; e < 256; e++) {
 			if (zcb->zcb_errors[e] != 0) {
 				(void) printf("\t%5d  %llu\n",
 				    e, (u_longlong_t)zcb->zcb_errors[e]);
 			}
 		}
 	}
 
 	/*
 	 * Report any leaked segments.
 	 */
 	leaks |= zdb_leak_fini(spa, zcb);
 
 	tzb = &zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
 
 	norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 	norm_space = metaslab_class_get_space(spa_normal_class(spa));
 
 	total_alloc = norm_alloc +
 	    metaslab_class_get_alloc(spa_log_class(spa)) +
 	    metaslab_class_get_alloc(spa_embedded_log_class(spa)) +
 	    metaslab_class_get_alloc(spa_special_class(spa)) +
 	    metaslab_class_get_alloc(spa_dedup_class(spa)) +
 	    get_unflushed_alloc_space(spa);
 	total_found =
 	    tzb->zb_asize - zcb->zcb_dedup_asize - zcb->zcb_clone_asize +
 	    zcb->zcb_removing_size + zcb->zcb_checkpoint_size;
 
 	/* The space comparison is only meaningful with leak detection on. */
 	if (total_found == total_alloc && !dump_opt['L']) {
 		(void) printf("\n\tNo leaks (block sum matches space"
 		    " maps exactly)\n");
 	} else if (!dump_opt['L']) {
 		(void) printf("block traversal size %llu != alloc %llu "
 		    "(%s %lld)\n",
 		    (u_longlong_t)total_found,
 		    (u_longlong_t)total_alloc,
 		    "leaked",
 		    (longlong_t)(total_alloc - total_found));
 	}
 
 	/* Without any blocks the averages below would divide by zero. */
 	if (tzb->zb_count == 0) {
 		umem_free(zcb, sizeof (zdb_cb_t));
 		return (2);
 	}
 
 	(void) printf("\n");
 	(void) printf("\t%-16s %14llu\n", "bp count:",
 	    (u_longlong_t)tzb->zb_count);
 	(void) printf("\t%-16s %14llu\n", "ganged count:",
 	    (u_longlong_t)tzb->zb_gangs);
 	(void) printf("\t%-16s %14llu      avg: %6llu\n", "bp logical:",
 	    (u_longlong_t)tzb->zb_lsize,
 	    (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
 	(void) printf("\t%-16s %14llu      avg: %6llu     compression: %6.2f\n",
 	    "bp physical:", (u_longlong_t)tzb->zb_psize,
 	    (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
 	    (double)tzb->zb_lsize / tzb->zb_psize);
 	(void) printf("\t%-16s %14llu      avg: %6llu     compression: %6.2f\n",
 	    "bp allocated:", (u_longlong_t)tzb->zb_asize,
 	    (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
 	    (double)tzb->zb_lsize / tzb->zb_asize);
 	(void) printf("\t%-16s %14llu    ref>1: %6llu   deduplication: %6.2f\n",
 	    "bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize,
 	    (u_longlong_t)zcb->zcb_dedup_blocks,
 	    (double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0);
 	(void) printf("\t%-16s %14llu    count: %6llu\n",
 	    "bp cloned:", (u_longlong_t)zcb->zcb_clone_asize,
 	    (u_longlong_t)zcb->zcb_clone_blocks);
 	(void) printf("\t%-16s %14llu     used: %5.2f%%\n", "Normal class:",
 	    (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
 
 	if (spa_special_class(spa)->mc_allocator[0].mca_rotor != NULL) {
 		uint64_t alloc = metaslab_class_get_alloc(
 		    spa_special_class(spa));
 		uint64_t space = metaslab_class_get_space(
 		    spa_special_class(spa));
 
 		(void) printf("\t%-16s %14llu     used: %5.2f%%\n",
 		    "Special class", (u_longlong_t)alloc,
 		    100.0 * alloc / space);
 	}
 
 	if (spa_dedup_class(spa)->mc_allocator[0].mca_rotor != NULL) {
 		uint64_t alloc = metaslab_class_get_alloc(
 		    spa_dedup_class(spa));
 		uint64_t space = metaslab_class_get_space(
 		    spa_dedup_class(spa));
 
 		(void) printf("\t%-16s %14llu     used: %5.2f%%\n",
 		    "Dedup class", (u_longlong_t)alloc,
 		    100.0 * alloc / space);
 	}
 
 	if (spa_embedded_log_class(spa)->mc_allocator[0].mca_rotor != NULL) {
 		uint64_t alloc = metaslab_class_get_alloc(
 		    spa_embedded_log_class(spa));
 		uint64_t space = metaslab_class_get_space(
 		    spa_embedded_log_class(spa));
 
 		(void) printf("\t%-16s %14llu     used: %5.2f%%\n",
 		    "Embedded log class", (u_longlong_t)alloc,
 		    100.0 * alloc / space);
 	}
 
 	for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
 		if (zcb->zcb_embedded_blocks[i] == 0)
 			continue;
 		(void) printf("\n");
 		(void) printf("\tadditional, non-pointer bps of type %u: "
 		    "%10llu\n",
 		    i, (u_longlong_t)zcb->zcb_embedded_blocks[i]);
 
 		if (dump_opt['b'] >= 3) {
 			(void) printf("\t number of (compressed) bytes:  "
 			    "number of bps\n");
 			dump_histogram(zcb->zcb_embedded_histogram[i],
 			    sizeof (zcb->zcb_embedded_histogram[i]) /
 			    sizeof (zcb->zcb_embedded_histogram[i][0]), 0);
 		}
 	}
 
 	if (tzb->zb_ditto_samevdev != 0) {
 		(void) printf("\tDittoed blocks on same vdev: %llu\n",
 		    (u_longlong_t)tzb->zb_ditto_samevdev);
 	}
 	if (tzb->zb_ditto_same_ms != 0) {
 		(void) printf("\tDittoed blocks in same metaslab: %llu\n",
 		    (u_longlong_t)tzb->zb_ditto_same_ms);
 	}
 
 	for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) {
 		vdev_t *vd = spa->spa_root_vdev->vdev_child[v];
 		vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 
 		if (vim == NULL) {
 			continue;
 		}
 
 		/*
 		 * Format the size of the mapping; the last argument is
 		 * the capacity of the output buffer.
 		 */
 		char mem[32];
 		zdb_nicenum(vdev_indirect_mapping_size(vim),
 		    mem, sizeof (mem));
 
 		(void) printf("\tindirect vdev id %llu has %llu segments "
 		    "(%s in memory)\n",
 		    (u_longlong_t)vd->vdev_id,
 		    (u_longlong_t)vdev_indirect_mapping_num_entries(vim), mem);
 	}
 
 	if (dump_opt['b'] >= 2) {
 		int l, t, level;
 		char csize[32], lsize[32], psize[32], asize[32];
 		char avg[32], gang[32];
 		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
 		    "\t  avg\t comp\t%%Total\tType\n");
 
 		/* running totals over everything counted as metadata */
 		zfs_blkstat_t *mdstats = umem_zalloc(sizeof (zfs_blkstat_t),
 		    UMEM_NOFAIL);
 
 		for (t = 0; t <= ZDB_OT_TOTAL; t++) {
 			const char *typename;
 
 			/* make sure nicenum has enough space */
 			_Static_assert(sizeof (csize) >= NN_NUMBUF_SZ,
 			    "csize truncated");
 			_Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ,
 			    "lsize truncated");
 			_Static_assert(sizeof (psize) >= NN_NUMBUF_SZ,
 			    "psize truncated");
 			_Static_assert(sizeof (asize) >= NN_NUMBUF_SZ,
 			    "asize truncated");
 			_Static_assert(sizeof (avg) >= NN_NUMBUF_SZ,
 			    "avg truncated");
 			_Static_assert(sizeof (gang) >= NN_NUMBUF_SZ,
 			    "gang truncated");
 
 			if (t < DMU_OT_NUMTYPES)
 				typename = dmu_ot[t].ot_name;
 			else
 				typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
 
 			if (zcb->zcb_type[ZB_TOTAL][t].zb_asize == 0) {
 				(void) printf("%6s\t%5s\t%5s\t%5s"
 				    "\t%5s\t%5s\t%6s\t%s\n",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    typename);
 				continue;
 			}
 
 			/* highest level first; l == -1 stands for the total */
 			for (l = ZB_TOTAL - 1; l >= -1; l--) {
 				level = (l == -1 ? ZB_TOTAL : l);
 				zb = &zcb->zcb_type[level][t];
 
 				if (zb->zb_asize == 0)
 					continue;
 
 				if (level != ZB_TOTAL && t < DMU_OT_NUMTYPES &&
 				    (level > 0 || DMU_OT_IS_METADATA(t))) {
 					mdstats->zb_count += zb->zb_count;
 					mdstats->zb_lsize += zb->zb_lsize;
 					mdstats->zb_psize += zb->zb_psize;
 					mdstats->zb_asize += zb->zb_asize;
 					mdstats->zb_gangs += zb->zb_gangs;
 				}
 
 				if (dump_opt['b'] < 3 && level != ZB_TOTAL)
 					continue;
 
 				if (level == 0 && zb->zb_asize ==
 				    zcb->zcb_type[ZB_TOTAL][t].zb_asize)
 					continue;
 
 				zdb_nicenum(zb->zb_count, csize,
 				    sizeof (csize));
 				zdb_nicenum(zb->zb_lsize, lsize,
 				    sizeof (lsize));
 				zdb_nicenum(zb->zb_psize, psize,
 				    sizeof (psize));
 				zdb_nicenum(zb->zb_asize, asize,
 				    sizeof (asize));
 				zdb_nicenum(zb->zb_asize / zb->zb_count, avg,
 				    sizeof (avg));
 				zdb_nicenum(zb->zb_gangs, gang, sizeof (gang));
 
 				(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
 				    "\t%5.2f\t%6.2f\t",
 				    csize, lsize, psize, asize, avg,
 				    (double)zb->zb_lsize / zb->zb_psize,
 				    100.0 * zb->zb_asize / tzb->zb_asize);
 
 				if (level == ZB_TOTAL)
 					(void) printf("%s\n", typename);
 				else
 					(void) printf("    L%d %s\n",
 					    level, typename);
 
 				if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
 					(void) printf("\t number of ganged "
 					    "blocks: %s\n", gang);
 				}
 
 				if (dump_opt['b'] >= 4) {
 					(void) printf("psize "
 					    "(in 512-byte sectors): "
 					    "number of blocks\n");
 					dump_histogram(zb->zb_psize_histogram,
 					    PSIZE_HISTO_SIZE, 0);
 				}
 			}
 		}
 		zdb_nicenum(mdstats->zb_count, csize,
 		    sizeof (csize));
 		zdb_nicenum(mdstats->zb_lsize, lsize,
 		    sizeof (lsize));
 		zdb_nicenum(mdstats->zb_psize, psize,
 		    sizeof (psize));
 		zdb_nicenum(mdstats->zb_asize, asize,
 		    sizeof (asize));
 		/* no metadata counted: avoid an integer division by zero */
 		zdb_nicenum(mdstats->zb_count != 0 ?
 		    mdstats->zb_asize / mdstats->zb_count : 0, avg,
 		    sizeof (avg));
 		zdb_nicenum(mdstats->zb_gangs, gang, sizeof (gang));
 
 		(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
 		    "\t%5.2f\t%6.2f\t",
 		    csize, lsize, psize, asize, avg,
 		    (double)mdstats->zb_lsize / mdstats->zb_psize,
 		    100.0 * mdstats->zb_asize / tzb->zb_asize);
 		(void) printf("%s\n", "Metadata Total");
 
 		/* Output a table summarizing block sizes in the pool */
 		dump_size_histograms(zcb);
 
 		umem_free(mdstats, sizeof (zfs_blkstat_t));
 	}
 
 	(void) printf("\n");
 
 	/* leaks take precedence over traversal errors in the exit status */
 	if (leaks)
 		ret = 2;
 	else if (zcb->zcb_haderrors)
 		ret = 3;
 
 	umem_free(zcb, sizeof (zdb_cb_t));
 	return (ret);
 }
 
 /*
  * One node of the AVL tree built by zdb_ddt_add_cb() for the simulated
  * dedup table: a DDT key plus running totals over every block pointer
  * seen with that key.
  */
 typedef struct zdb_ddt_entry {
 	/* key must be first for ddt_key_compare */
 	ddt_key_t	zdde_key;
 	/* number of block pointers seen with this key */
 	uint64_t	zdde_ref_blocks;
 	/* sum of BP_GET_LSIZE() over those block pointers */
 	uint64_t	zdde_ref_lsize;
 	/* sum of BP_GET_PSIZE() over those block pointers */
 	uint64_t	zdde_ref_psize;
 	/* sum of bp_get_dsize_sync() over those block pointers */
 	uint64_t	zdde_ref_dsize;
 	/* linkage in the AVL tree */
 	avl_node_t	zdde_node;
 } zdb_ddt_entry_t;
 
 /*
  * traverse_pool() callback for the simulated DDT.  arg is the AVL tree of
  * zdb_ddt_entry_t nodes.  Holes, embedded block pointers, dnode-level
  * bookmarks, blocks without a checksum, indirect blocks and metadata are
  * skipped; every other block pointer is folded into the tree entry for
  * its DDT key, creating the entry on first sight.  Always returns 0.
  */
 static int
 zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	(void) zilog, (void) dnp;
 	avl_tree_t *tree = arg;
 
 	if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
 	    BP_IS_EMBEDDED(bp))
 		return (0);
 
 	/* with -SS, report progress each time an objset root is reached */
 	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
 		(void) printf("traversing objset %llu, %llu objects, "
 		    "%lu blocks so far\n",
 		    (u_longlong_t)zb->zb_objset,
 		    (u_longlong_t)BP_GET_FILL(bp),
 		    avl_numnodes(tree));
 	}
 
 	if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
 	    BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
 		return (0);
 
 	zdb_ddt_entry_t probe;
 	avl_index_t where;
 
 	ddt_key_fill(&probe.zdde_key, bp);
 
 	zdb_ddt_entry_t *ent = avl_find(tree, &probe, &where);
 	if (ent == NULL) {
 		ent = umem_zalloc(sizeof (*ent), UMEM_NOFAIL);
 		ent->zdde_key = probe.zdde_key;
 		avl_insert(tree, ent, where);
 	}
 
 	ent->zdde_ref_blocks++;
 	ent->zdde_ref_lsize += BP_GET_LSIZE(bp);
 	ent->zdde_ref_psize += BP_GET_PSIZE(bp);
 	ent->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
 
 	return (0);
 }
 
 /*
  * Traverse the pool, group the data blocks found by DDT key, and print
  * the resulting histogram and dedup ratio as a "Simulated DDT histogram".
  */
 static void
 dump_simulated_ddt(spa_t *spa)
 {
 	avl_tree_t tree;
 	void *cookie = NULL;
 	zdb_ddt_entry_t *ent;
 	ddt_histogram_t histo = {{{0}}};
 	ddt_stat_t totals = {0};
 
 	avl_create(&tree, ddt_key_compare,
 	    sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
 	    TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &tree);
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	/* drain the tree, adding each entry to its histogram bucket */
 	while ((ent = avl_destroy_nodes(&tree, &cookie)) != NULL) {
 		uint64_t refs = ent->zdde_ref_blocks;
 		ASSERT(refs != 0);
 
 		/* the bucket is selected by the highest set bit of refs */
 		ddt_stat_t *bucket = &histo.ddh_stat[highbit64(refs) - 1];
 
 		/* per-copy figures: the referenced totals divided by refs */
 		bucket->dds_blocks += ent->zdde_ref_blocks / refs;
 		bucket->dds_lsize += ent->zdde_ref_lsize / refs;
 		bucket->dds_psize += ent->zdde_ref_psize / refs;
 		bucket->dds_dsize += ent->zdde_ref_dsize / refs;
 
 		bucket->dds_ref_blocks += ent->zdde_ref_blocks;
 		bucket->dds_ref_lsize += ent->zdde_ref_lsize;
 		bucket->dds_ref_psize += ent->zdde_ref_psize;
 		bucket->dds_ref_dsize += ent->zdde_ref_dsize;
 
 		umem_free(ent, sizeof (*ent));
 	}
 
 	avl_destroy(&tree);
 
 	ddt_histogram_total(&totals, &histo);
 
 	(void) printf("Simulated DDT histogram:\n");
 
 	zpool_dump_ddt(&totals, &histo);
 
 	dump_dedup_ratio(&totals);
 }
 
 /*
  * Cross-check the refcounts of SPA_FEATURE_DEVICE_REMOVAL and
  * SPA_FEATURE_OBSOLETE_COUNTS against what is found in the pool: the
  * top-level vdevs with a mapping object, precise counts or an obsolete
  * space map, an in-progress indirect condense, the obsolete bpobj, and
  * the remap deadlist count.  The outcome of both checks is printed.
  * Returns 0 when both refcounts match and 1 otherwise.
  */
 static int
 verify_device_removal_feature_counts(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	spa_condensing_indirect_phys_t *scip =
 	    &spa->spa_condensing_indirect_phys;
 	uint64_t dr_feature_refcount = 0;
 	uint64_t oc_feature_refcount = 0;
 	uint64_t indirect_vdev_count = 0;
 	uint64_t precise_vdev_count = 0;
 	uint64_t obsolete_counts_object_count = 0;
 	uint64_t obsolete_sm_count = 0;
 	uint64_t obsolete_counts_count = 0;
 	uint64_t scip_count = 0;
 	uint64_t obsolete_bpobj_count = 0;
 	int ret = 0;
 
 	/* a condense in progress contributes two to the expected count */
 	if (scip->scip_next_mapping_object != 0) {
 		vdev_t *vd = root->vdev_child[scip->scip_vdev];
 		ASSERT(scip->scip_prev_obsolete_sm_object != 0);
 		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
 
 		(void) printf("Condensing indirect vdev %llu: new mapping "
 		    "object %llu, prev obsolete sm %llu\n",
 		    (u_longlong_t)scip->scip_vdev,
 		    (u_longlong_t)scip->scip_next_mapping_object,
 		    (u_longlong_t)scip->scip_prev_obsolete_sm_object);
 		if (scip->scip_prev_obsolete_sm_object != 0) {
 			space_map_t *prev_sm = NULL;
 			VERIFY0(space_map_open(&prev_sm,
 			    spa->spa_meta_objset,
 			    scip->scip_prev_obsolete_sm_object,
 			    0, vd->vdev_asize, 0));
 			dump_spacemap(spa->spa_meta_objset, prev_sm);
 			(void) printf("\n");
 			space_map_close(prev_sm);
 		}
 
 		scip_count += 2;
 	}
 
 	for (uint64_t c = 0; c < root->vdev_children; c++) {
 		vdev_t *vd = root->vdev_child[c];
 		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 
 		if (vic->vic_mapping_object != 0) {
 			ASSERT(vd->vdev_ops == &vdev_indirect_ops ||
 			    vd->vdev_removing);
 			indirect_vdev_count++;
 
 			if (vd->vdev_indirect_mapping->vim_havecounts)
 				obsolete_counts_count++;
 		}
 
 		boolean_t are_precise;
 		VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
 		if (are_precise) {
 			ASSERT(vic->vic_mapping_object != 0);
 			precise_vdev_count++;
 		}
 
 		uint64_t obsolete_sm_object;
 		VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
 		if (obsolete_sm_object != 0) {
 			ASSERT(vic->vic_mapping_object != 0);
 			obsolete_sm_count++;
 		}
 	}
 
 	(void) feature_get_refcount(spa,
 	    &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL],
 	    &dr_feature_refcount);
 	(void) feature_get_refcount(spa,
 	    &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS],
 	    &oc_feature_refcount);
 
 	if (dr_feature_refcount == indirect_vdev_count) {
 		(void) printf("Verified device_removal feature refcount "
 		    "of %llu is correct\n",
 		    (u_longlong_t)dr_feature_refcount);
 	} else {
 		ret = 1;
 		(void) printf("Number of indirect vdevs (%llu) "
 		    "does not match feature count (%llu)\n",
 		    (u_longlong_t)indirect_vdev_count,
 		    (u_longlong_t)dr_feature_refcount);
 	}
 
 	if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_OBSOLETE_BPOBJ) == 0) {
 		obsolete_bpobj_count++;
 	}
 
 	/* everything that is expected to hold a reference on the feature */
 	obsolete_counts_object_count = precise_vdev_count +
 	    obsolete_sm_count + obsolete_counts_count + scip_count +
 	    obsolete_bpobj_count + remap_deadlist_count;
 
 	if (oc_feature_refcount == obsolete_counts_object_count) {
 		(void) printf("Verified indirect_refcount feature refcount "
 		    "of %llu is correct\n",
 		    (u_longlong_t)oc_feature_refcount);
 	} else {
 		ret = 1;
 		(void) printf("Number of obsolete counts objects (%llu) "
 		    "does not match feature count (%llu)\n",
 		    (u_longlong_t)obsolete_counts_object_count,
 		    (u_longlong_t)oc_feature_refcount);
 		(void) printf("pv:%llu os:%llu oc:%llu sc:%llu "
 		    "ob:%llu rd:%llu\n",
 		    (u_longlong_t)precise_vdev_count,
 		    (u_longlong_t)obsolete_sm_count,
 		    (u_longlong_t)obsolete_counts_count,
 		    (u_longlong_t)scip_count,
 		    (u_longlong_t)obsolete_bpobj_count,
 		    (u_longlong_t)remap_deadlist_count);
 	}
 
 	return (ret);
 }
 
 /*
  * Mark the pool named by target so that the activity check is skipped,
  * which allows examination of active pools. The flag is set under
  * spa_namespace_lock; nothing is changed when spa_lookup() does not
  * find the pool.
  */
 static void
 zdb_set_skip_mmp(char *target)
 {
 	mutex_enter(&spa_namespace_lock);
 
 	spa_t *found = spa_lookup(target);
 	if (found != NULL)
 		found->spa_import_flags |= ZFS_IMPORT_SKIP_MMP;
 
 	mutex_exit(&spa_namespace_lock);
 }
 
 #define	BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"
 /*
  * Import the checkpointed state of the pool specified by the target
  * parameter as readonly. The function also accepts a pool config
  * as an optional parameter, else it attempts to infer the config by
  * the name of the target pool.
  *
  * Note that the checkpointed state's pool name will be the name of
  * the original pool with the above suffix appended to it. In addition,
  * if the target is not a pool name (e.g. a path to a dataset) then
  * the new_path parameter is populated with the updated path to
  * reflect the fact that we are looking into the checkpointed state.
  *
  * A config supplied by the caller is modified: its pool name entry is
  * overwritten with the suffixed name. A config that is looked up here
  * is freed before returning.
  *
  * The function returns a newly-allocated copy of the name of the
  * pool containing the checkpointed state. When this copy is no
  * longer needed it should be freed with free(3C). Same thing
  * applies to the new_path parameter if allocated.
  *
  * NULL is returned when one of the strings cannot be allocated (in
  * which case *new_path, if requested, is not left pointing at garbage).
  * Failing to read the config or to import the pool is fatal.
  */
 static char *
 import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
 {
 	int error = 0;
 	char *poolname, *bogus_name = NULL;
 	boolean_t freecfg = B_FALSE;
 
 	/* If the target is not a pool, then extract the pool name */
 	char *path_start = strchr(target, '/');
 	if (path_start != NULL) {
 		size_t poolname_len = path_start - target;
 		poolname = strndup(target, poolname_len);
 		if (poolname == NULL)
 			return (NULL);
 	} else {
 		poolname = target;
 	}
 
 	if (cfg == NULL) {
 		zdb_set_skip_mmp(poolname);
 		error = spa_get_stats(poolname, &cfg, NULL, 0);
 		if (error != 0) {
 			fatal("Tried to read config of pool \"%s\" but "
 			    "spa_get_stats() failed with error %d\n",
 			    poolname, error);
 		}
 		freecfg = B_TRUE;
 	}
 
 	if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) {
 		/* don't leak the config that was looked up above */
 		if (freecfg)
 			nvlist_free(cfg);
 		if (target != poolname)
 			free(poolname);
 		return (NULL);
 	}
 	fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);
 
 	error = spa_import(bogus_name, cfg, NULL,
 	    ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT |
 	    ZFS_IMPORT_SKIP_MMP);
 	if (freecfg)
 		nvlist_free(cfg);
 	if (error != 0) {
 		fatal("Tried to import pool \"%s\" but spa_import() failed "
 		    "with error %d\n", bogus_name, error);
 	}
 
 	if (new_path != NULL && path_start != NULL) {
 		if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) {
 			/* the buffer contents are undefined after a failure */
 			*new_path = NULL;
 			free(bogus_name);
 			/* path_start != NULL, so poolname is our own copy */
 			free(poolname);
 			return (NULL);
 		}
 	}
 
 	if (target != poolname)
 		free(poolname);
 
 	return (bogus_name);
 }
 
 /*
  * State handed to verify_checkpoint_sm_entry_cb() while a checkpoint
  * space map is being iterated.
  */
 typedef struct verify_checkpoint_sm_entry_cb_arg {
 	/* vdev whose metaslabs the space map entries are checked against */
 	vdev_t *vcsec_vd;
 
 	/* the following fields are only used for printing progress */
 	uint64_t vcsec_entryid;
 	uint64_t vcsec_num_entries;
 } verify_checkpoint_sm_entry_cb_arg_t;
 
 /* a progress line is printed once per this many space map entries */
 #define	ENTRIES_PER_PROGRESS_UPDATE 10000
 
 /*
  * space_map_iterate() callback used by verify_checkpoint_vdev_spacemaps().
  *
  * sme is the checkpoint space map entry being visited and arg points to a
  * verify_checkpoint_sm_entry_cb_arg_t. The entry must lie within a single
  * metaslab of vcsec_vd and must not be present in that metaslab's
  * ms_allocatable tree. Always returns 0; violations trip a VERIFY.
  */
 static int
 verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
 {
 	verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
 	vdev_t *vd = vcsec->vcsec_vd;
 	uint64_t ms_index = sme->sme_offset >> vd->vdev_ms_shift;
 	uint64_t end = sme->sme_offset + sme->sme_run;
 
 	/*
 	 * The offset comes from disk; make sure it maps to an existing
 	 * metaslab before vdev_ms[] is indexed with it.
 	 */
 	VERIFY3U(ms_index, <, vd->vdev_ms_count);
 	metaslab_t *ms = vd->vdev_ms[ms_index];
 
 	ASSERT(sme->sme_type == SM_FREE);
 
 	if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
 		(void) fprintf(stderr,
 		    "\rverifying vdev %llu, space map entry %llu of %llu ...",
 		    (u_longlong_t)vd->vdev_id,
 		    (u_longlong_t)vcsec->vcsec_entryid,
 		    (u_longlong_t)vcsec->vcsec_num_entries);
 	}
 	vcsec->vcsec_entryid++;
 
 	/*
 	 * See comment in checkpoint_sm_exclude_entry_cb()
 	 */
 	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
 	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
 
 	/*
 	 * The entries in the vdev_checkpoint_sm should be marked as
 	 * allocated in the checkpointed state of the pool, therefore
 	 * their respective ms_allocatable trees should not contain them.
 	 */
 	mutex_enter(&ms->ms_lock);
 	zfs_range_tree_verify_not_present(ms->ms_allocatable,
 	    sme->sme_offset, sme->sme_run);
 	mutex_exit(&ms->ms_lock);
 
 	return (0);
 }
 
 /*
  * Check that every segment recorded in a vdev_checkpoint_sm is allocated
  * as far as the checkpoint's ms_sm is concerned (i.e. is absent from the
  * checkpoint's ms_allocatable).
  *
  * This is done by walking the checkpoint space map (vdev_checkpoint_sm)
  * of each vdev in the current state of the pool and checking its entries
  * against the metaslabs of the matching vdev in the checkpointed state.
  *
  * Side effect: load_concrete_ms_allocatable_trees() is invoked with
  * SM_FREE on the checkpoint spa, which changes the contents of that
  * spa's ms_allocatable trees.
  */
 static void
 verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
 {
 	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
 	vdev_t *current_rvd = current->spa_root_vdev;
 
 	load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);
 
 	for (uint64_t idx = 0; idx < ckpoint_rvd->vdev_children; idx++) {
 		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[idx];
 		vdev_t *current_vd = current_rvd->vdev_child[idx];
 
 		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
 			/*
 			 * Device removal is not allowed while a checkpoint
 			 * exists, so any removed vdev must have been removed
 			 * before the checkpoint was taken.
 			 */
 			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
 			continue;
 		}
 
 		/*
 		 * Without a checkpoint space map nothing on this vdev is
 		 * checkpointed, so there is nothing to verify.
 		 */
 		if (current_vd->vdev_top_zap == 0)
 			continue;
 		if (zap_contains(spa_meta_objset(current),
 		    current_vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
 			continue;
 
 		uint64_t sm_obj;
 		VERIFY0(zap_lookup(spa_meta_objset(current),
 		    current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
 		    sizeof (uint64_t), 1, &sm_obj));
 
 		space_map_t *sm = NULL;
 		VERIFY0(space_map_open(&sm, spa_meta_objset(current),
 		    sm_obj, 0, current_vd->vdev_asize,
 		    current_vd->vdev_ashift));
 
 		verify_checkpoint_sm_entry_cb_arg_t cb_arg;
 		cb_arg.vcsec_vd = ckpoint_vd;
 		cb_arg.vcsec_entryid = 0;
 		cb_arg.vcsec_num_entries =
 		    space_map_length(sm) / sizeof (uint64_t);
 
 		VERIFY0(space_map_iterate(sm, space_map_length(sm),
 		    verify_checkpoint_sm_entry_cb, &cb_arg));
 
 		if (dump_opt['m'] > 3)
 			dump_spacemap(current->spa_meta_objset, sm);
 		space_map_close(sm);
 	}
 
 	/*
 	 * Vdevs added after the checkpoint was taken must not have a
 	 * checkpoint space map at all. The loop body does not run when no
 	 * vdevs were added.
 	 */
 	for (uint64_t idx = ckpoint_rvd->vdev_children;
 	    idx < current_rvd->vdev_children; idx++) {
 		vdev_t *added_vd = current_rvd->vdev_child[idx];
 		VERIFY3P(added_vd->vdev_checkpoint_sm, ==, NULL);
 	}
 
 	/* for cleaner progress output */
 	(void) fprintf(stderr, "\n");
 }
 
 /*
  * Verifies that all space that's allocated in the checkpoint is
  * still allocated in the current version, by checking that everything
  * in checkpoint's ms_allocatable (which is actually allocated, not
  * allocatable/free) is not present in current's ms_allocatable.
  *
  * Note that the function changes the state of the ms_allocatable
  * trees of both spas when called. The entries of all ms_allocatable
  * trees are cleared out and then repopulated from their respective
  * ms_sm space maps. In the checkpointed state we load the allocated
  * entries, and in the current state we load the free entries.
  *
  * Vdevs and metaslabs are paired up by index between the two spas;
  * indirect vdevs in the checkpoint are skipped.
  */
 static void
 verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
 {
 	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
 	vdev_t *current_rvd = current->spa_root_vdev;
 
 	load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);
 	load_concrete_ms_allocatable_trees(current, SM_FREE);
 
 	for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {
 		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];
 		vdev_t *current_vd = current_rvd->vdev_child[i];
 
 		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
 			/*
 			 * See comment in verify_checkpoint_vdev_spacemaps()
 			 */
 			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
 			continue;
 		}
 
 		for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {
 			metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];
 			metaslab_t *current_msp = current_vd->vdev_ms[m];
 
 			/* the values are unsigned, so cast to match %llu */
 			(void) fprintf(stderr,
 			    "\rverifying vdev %llu of %llu, "
 			    "metaslab %llu of %llu ...",
 			    (u_longlong_t)current_vd->vdev_id,
 			    (u_longlong_t)current_rvd->vdev_children,
 			    (u_longlong_t)current_msp->ms_id,
 			    (u_longlong_t)current_vd->vdev_ms_count);
 
 			/*
 			 * We walk through the ms_allocatable trees that
 			 * are loaded with the allocated blocks from the
 			 * ms_sm spacemaps of the checkpoint. For each
 			 * one of these ranges we ensure that none of them
 			 * exists in the ms_allocatable trees of the
 			 * current state which are loaded with the ranges
 			 * that are currently free.
 			 *
 			 * This way we ensure that none of the blocks that
 			 * are part of the checkpoint were freed by mistake.
 			 */
 			zfs_range_tree_walk(ckpoint_msp->ms_allocatable,
 			    (zfs_range_tree_func_t *)
 			    zfs_range_tree_verify_not_present,
 			    current_msp->ms_allocatable);
 		}
 	}
 
 	/* for cleaner progress output */
 	(void) fprintf(stderr, "\n");
 }
 
 /*
  * Import the checkpointed state of spa under a different pool name and
  * cross-check its space maps against those of the current state.
  * Must not be called when the -L option (dump_opt['L']) is set.
  * Failing to import or open the checkpointed state is fatal.
  */
 static void
 verify_checkpoint_blocks(spa_t *spa)
 {
 	ASSERT(!dump_opt['L']);
 
 	spa_t *checkpoint_spa;
 	char *checkpoint_pool;
 	int error = 0;
 
 	/*
 	 * We import the checkpointed state of the pool (under a different
 	 * name) so we can do verification on it against the current state
 	 * of the pool.
 	 */
 	checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL,
 	    NULL);
 	if (checkpoint_pool == NULL) {
 		/* NULL means the name could not be allocated */
 		fatal("Tried to import the checkpointed state of pool "
 		    "\"%s\" but ran out of memory\n", spa->spa_name);
 	}
 	ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
 
 	error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
 	if (error != 0) {
 		fatal("Tried to open pool \"%s\" but spa_open() failed with "
 		    "error %d\n", checkpoint_pool, error);
 	}
 
 	/*
 	 * Ensure that ranges in the checkpoint space maps of each vdev
 	 * are allocated according to the checkpointed state's metaslab
 	 * space maps.
 	 */
 	verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa);
 
 	/*
 	 * Ensure that allocated ranges in the checkpoint's metaslab
 	 * space maps remain allocated in the metaslab space maps of
 	 * the current state.
 	 */
 	verify_checkpoint_ms_spacemaps(checkpoint_spa, spa);
 
 	/*
 	 * Once we are done, drop our hold on the checkpointed state and
 	 * free the name returned by import_checkpointed_state().
 	 */
 	spa_close(checkpoint_spa, FTAG);
 	free(checkpoint_pool);
 }
 
 /*
  * For each child of the root vdev whose top-level ZAP still holds a
  * checkpoint space map entry, open that space map and dump it.
  */
 static void
 dump_leftover_checkpoint_blocks(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 
 	for (uint64_t c = 0; c < root->vdev_children; c++) {
 		vdev_t *child = root->vdev_child[c];
 		uint64_t sm_obj;
 		space_map_t *sm = NULL;
 
 		/* no top-level ZAP, or no checkpoint space map in it */
 		if (child->vdev_top_zap == 0 ||
 		    zap_contains(spa_meta_objset(spa), child->vdev_top_zap,
 		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
 			continue;
 
 		VERIFY0(zap_lookup(spa_meta_objset(spa), child->vdev_top_zap,
 		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
 		    sizeof (uint64_t), 1, &sm_obj));
 
 		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
 		    sm_obj, 0, child->vdev_asize, child->vdev_ashift));
 		dump_spacemap(spa->spa_meta_objset, sm);
 		space_map_close(sm);
 	}
 }
 
 /*
  * Print and sanity-check the checkpointed uberblock stored in the MOS
  * and, unless -L was given, verify the checkpointed blocks.
  *
  * Returns 0 when the checkpoint feature is not active, when a partially
  * discarded checkpoint is found, or when verification ran; the
  * zap_lookup() error when the uberblock lookup fails for another reason;
  * and 3 when the uberblock has no ub_checkpoint_txg set.
  */
 static int
 verify_checkpoint(spa_t *spa)
 {
 	uberblock_t ckpt_ub;
 
 	if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
 		return (0);
 
 	int error = zap_lookup(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT,
 	    sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t),
 	    &ckpt_ub);
 
 	if (error == ENOENT && !dump_opt['L']) {
 		/*
 		 * The feature is active yet the uberblock is gone, so the
 		 * checkpoint must be in the middle of being discarded.
 		 */
 		(void) printf("\nPartially discarded checkpoint "
 		    "state found:\n");
 		if (dump_opt['m'] > 3)
 			dump_leftover_checkpoint_blocks(spa);
 		return (0);
 	}
 
 	if (error != 0) {
 		(void) printf("lookup error %d when looking for "
 		    "checkpointed uberblock in MOS\n", error);
 		return (error);
 	}
 
 	dump_uberblock(&ckpt_ub, "\nCheckpointed uberblock found:\n", "\n");
 
 	if (ckpt_ub.ub_checkpoint_txg == 0) {
 		(void) printf("\nub_checkpoint_txg not set in checkpointed "
 		    "uberblock\n");
 		return (3);
 	}
 
 	if (!dump_opt['L'])
 		verify_checkpoint_blocks(spa);
 
 	return (0);
 }
 
 /*
  * Range tree walk callback: report every MOS object number in the
  * segment that begins at start and is size objects long as referenced
  * but not allocated. arg is unused.
  */
 static void
 mos_leaks_cb(void *arg, uint64_t start, uint64_t size)
 {
 	(void) arg;
 	/*
 	 * size is the length of the segment, not its end, so count from
 	 * zero; this also avoids overflowing start + size.
 	 */
 	for (uint64_t i = 0; i < size; i++) {
 		(void) printf("MOS object %llu referenced but not allocated\n",
 		    (u_longlong_t)(start + i));
 	}
 }
 
 /*
  * Add MOS object obj to the mos_refd_objs range tree. Object 0 is
  * ignored, and nothing is recorded while mos_refd_objs is NULL.
  */
 static void
 mos_obj_refd(uint64_t obj)
 {
 	if (obj == 0 || mos_refd_objs == NULL)
 		return;
 
 	zfs_range_tree_add(mos_refd_objs, obj, 1);
 }
 
 /*
  * Variant of mos_obj_refd() for a MOS object that may already have been
  * referenced: the object is only added when the tree does not contain
  * it yet.
  */
 static void
 mos_obj_refd_multiple(uint64_t obj)
 {
 	if (obj == 0 || mos_refd_objs == NULL)
 		return;
 
 	if (zfs_range_tree_contains(mos_refd_objs, obj, 1))
 		return;
 
 	zfs_range_tree_add(mos_refd_objs, obj, 1);
 }
 
 /*
  * Mark as referenced the object stored under
  * VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS in the vdev's top-level ZAP,
  * if that entry exists.
  */
 static void
 mos_leak_vdev_top_zap(vdev_t *vd)
 {
 	uint64_t unflushed_obj;
 
 	int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
 	    vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
 	    sizeof (unflushed_obj), 1, &unflushed_obj);
 
 	/* a missing entry simply means there is nothing to mark */
 	if (error != ENOENT) {
 		ASSERT0(error);
 		mos_obj_refd(unflushed_obj);
 	}
 }
 
 /*
  * Mark every MOS object referenced by vd as referenced, then recurse
  * into its children.
  */
 static void
 mos_leak_vdev(vdev_t *vd)
 {
 	/* objects referenced directly from the vdev */
 	mos_obj_refd(vd->vdev_dtl_object);
 	mos_obj_refd(vd->vdev_ms_array);
 	mos_obj_refd(vd->vdev_indirect_config.vic_births_object);
 	mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object);
 	mos_obj_refd(vd->vdev_leaf_zap);
 
 	/* objects behind optional in-core structures */
 	if (vd->vdev_checkpoint_sm != NULL) {
 		mos_obj_refd(vd->vdev_checkpoint_sm->sm_object);
 	}
 	if (vd->vdev_indirect_mapping != NULL) {
 		mos_obj_refd(
 		    vd->vdev_indirect_mapping->vim_phys->vimp_counts_object);
 	}
 	if (vd->vdev_obsolete_sm != NULL) {
 		mos_obj_refd(vd->vdev_obsolete_sm->sm_object);
 	}
 
 	/* the space map object of every metaslab */
 	for (uint64_t idx = 0; idx < vd->vdev_ms_count; idx++) {
 		metaslab_t *msp = vd->vdev_ms[idx];
 		mos_obj_refd(space_map_object(msp->ms_sm));
 	}
 
 	/* mos_obj_refd() ignores object 0, so no check is needed here */
 	mos_obj_refd(vd->vdev_root_zap);
 
 	if (vd->vdev_top_zap != 0) {
 		mos_obj_refd(vd->vdev_top_zap);
 		mos_leak_vdev_top_zap(vd);
 	}
 
 	for (uint64_t idx = 0; idx < vd->vdev_children; idx++)
 		mos_leak_vdev(vd->vdev_child[idx]);
 }
 
 /*
  * Mark the log space map ZAP, and the object of every log space map in
  * spa_sm_logs_by_txg, as referenced. Does nothing when the pool
  * directory has no DMU_POOL_LOG_SPACEMAP_ZAP entry.
  */
 static void
 mos_leak_log_spacemaps(spa_t *spa)
 {
 	uint64_t log_zap;
 
 	int error = zap_lookup(spa_meta_objset(spa),
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP,
 	    sizeof (log_zap), 1, &log_zap);
 	if (error == ENOENT)
 		return;
 	ASSERT0(error);
 
 	mos_obj_refd(log_zap);
 
 	spa_log_sm_t *entry = avl_first(&spa->spa_sm_logs_by_txg);
 	while (entry != NULL) {
 		mos_obj_refd(entry->sls_sm_obj);
 		entry = AVL_NEXT(&spa->spa_sm_logs_by_txg, entry);
 	}
 }
 
 /*
  * Walk the ZAP object errlog in mos and mark the first integer of each
  * of its entries as a referenced MOS object.
  */
 static void
 errorlog_count_refd(objset_t *mos, uint64_t errlog)
 {
 	zap_cursor_t cursor;
 	zap_attribute_t *attr = zap_attribute_alloc();
 
 	zap_cursor_init(&cursor, mos, errlog);
 	while (zap_cursor_retrieve(&cursor, attr) == 0) {
 		mos_obj_refd(attr->za_first_integer);
 		zap_cursor_advance(&cursor);
 	}
 	zap_cursor_fini(&cursor);
 
 	zap_attribute_free(attr);
 }
 
 /*
  * Cross-check the set of referenced MOS objects against the set of
  * allocated MOS objects.
  *
  * First every MOS object known to be referenced is added to
  * mos_refd_objs. Then all allocated objects are visited: an allocated
  * object that was not referenced is reported as leaked, and whatever
  * remains in mos_refd_objs afterwards is reported as referenced but not
  * allocated.
  *
  * mos_refd_objs must have been created by the caller; it is destroyed
  * and reset to NULL before returning, so later mos_obj_refd*() calls
  * become no-ops instead of touching a freed tree.
  *
  * Returns 0 if no discrepancy was found and 2 otherwise.
  */
 static int
 dump_mos_leaks(spa_t *spa)
 {
 	int rv = 0;
 	objset_t *mos = spa->spa_meta_objset;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 
 	/* Visit and mark all referenced objects in the MOS */
 
 	mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT);
 	mos_obj_refd(spa->spa_pool_props_object);
 	mos_obj_refd(spa->spa_config_object);
 	mos_obj_refd(spa->spa_ddt_stat_object);
 	mos_obj_refd(spa->spa_feat_desc_obj);
 	mos_obj_refd(spa->spa_feat_enabled_txg_obj);
 	mos_obj_refd(spa->spa_feat_for_read_obj);
 	mos_obj_refd(spa->spa_feat_for_write_obj);
 	mos_obj_refd(spa->spa_history);
 	mos_obj_refd(spa->spa_errlog_last);
 	mos_obj_refd(spa->spa_errlog_scrub);
 
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 		errorlog_count_refd(mos, spa->spa_errlog_last);
 		errorlog_count_refd(mos, spa->spa_errlog_scrub);
 	}
 
 	mos_obj_refd(spa->spa_all_vdev_zaps);
 	mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj);
 	mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj);
 	mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj);
 	bpobj_count_refd(&spa->spa_deferred_bpobj);
 	mos_obj_refd(dp->dp_empty_bpobj);
 	bpobj_count_refd(&dp->dp_obsolete_bpobj);
 	bpobj_count_refd(&dp->dp_free_bpobj);
 	mos_obj_refd(spa->spa_l2cache.sav_object);
 	mos_obj_refd(spa->spa_spares.sav_object);
 
 	if (spa->spa_syncing_log_sm != NULL)
 		mos_obj_refd(spa->spa_syncing_log_sm->sm_object);
 	mos_leak_log_spacemaps(spa);
 
 	mos_obj_refd(spa->spa_condensing_indirect_phys.
 	    scip_next_mapping_object);
 	mos_obj_refd(spa->spa_condensing_indirect_phys.
 	    scip_prev_obsolete_sm_object);
 	if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) {
 		/* the next mapping carries a counts object of its own */
 		vdev_indirect_mapping_t *vim =
 		    vdev_indirect_mapping_open(mos,
 		    spa->spa_condensing_indirect_phys.scip_next_mapping_object);
 		mos_obj_refd(vim->vim_phys->vimp_counts_object);
 		vdev_indirect_mapping_close(vim);
 	}
 	deleted_livelists_dump_mos(spa);
 
 	if (dp->dp_origin_snap != NULL) {
 		dsl_dataset_t *ds;
 
 		/* the dataset that follows the origin snapshot ... */
 		dsl_pool_config_enter(dp, FTAG);
 		VERIFY0(dsl_dataset_hold_obj(dp,
 		    dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj,
 		    FTAG, &ds));
 		count_ds_mos_objects(ds);
 		dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
 		dsl_dataset_rele(ds, FTAG);
 		dsl_pool_config_exit(dp, FTAG);
 
 		/* ... and the origin snapshot itself */
 		count_ds_mos_objects(dp->dp_origin_snap);
 		dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist");
 	}
 	count_dir_mos_objects(dp->dp_mos_dir);
 	if (dp->dp_free_dir != NULL)
 		count_dir_mos_objects(dp->dp_free_dir);
 	if (dp->dp_leak_dir != NULL)
 		count_dir_mos_objects(dp->dp_leak_dir);
 
 	mos_leak_vdev(spa->spa_root_vdev);
 
 	for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 		ddt_t *ddt = spa->spa_ddt[c];
 		if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED)
 			continue;
 
 		/* DDT store objects */
 		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
 			for (ddt_class_t class = 0; class < DDT_CLASSES;
 			    class++) {
 				mos_obj_refd(ddt->ddt_object[type][class]);
 			}
 		}
 
 		/* FDT container */
 		if (ddt->ddt_version == DDT_VERSION_FDT)
 			mos_obj_refd(ddt->ddt_dir_object);
 
 		/* FDT log objects */
 		if (ddt->ddt_flags & DDT_FLAG_LOG) {
 			mos_obj_refd(ddt->ddt_log[0].ddl_object);
 			mos_obj_refd(ddt->ddt_log[1].ddl_object);
 		}
 	}
 
 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
 		if (brtvd->bv_initiated) {
 			mos_obj_refd(brtvd->bv_mos_brtvdev);
 			mos_obj_refd(brtvd->bv_mos_entries);
 		}
 	}
 
 	/*
 	 * Visit all allocated objects and make sure they are referenced.
 	 * Each object found in the tree is removed from it, so that only
 	 * referenced-but-unallocated objects are left at the end.
 	 */
 	uint64_t object = 0;
 	while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) {
 		if (zfs_range_tree_contains(mos_refd_objs, object, 1)) {
 			zfs_range_tree_remove(mos_refd_objs, object, 1);
 		} else {
 			dmu_object_info_t doi;
 			const char *name;
 			VERIFY0(dmu_object_info(mos, object, &doi));
 			if (doi.doi_type & DMU_OT_NEWTYPE) {
 				dmu_object_byteswap_t bswap =
 				    DMU_OT_BYTESWAP(doi.doi_type);
 				name = dmu_ot_byteswap[bswap].ob_name;
 			} else {
 				name = dmu_ot[doi.doi_type].ot_name;
 			}
 
 			(void) printf("MOS object %llu (%s) leaked\n",
 			    (u_longlong_t)object, name);
 			rv = 2;
 		}
 	}
 	(void) zfs_range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL);
 	if (!zfs_range_tree_is_empty(mos_refd_objs))
 		rv = 2;
 	zfs_range_tree_vacate(mos_refd_objs, NULL, NULL);
 	zfs_range_tree_destroy(mos_refd_objs);
 	/* don't leave the global pointing at the destroyed tree */
 	mos_refd_objs = NULL;
 	return (rv);
 }
 
 /*
  * Counters maintained by log_spacemap_obsolete_stats_cb() while the log
  * space maps are iterated.
  */
 typedef struct log_sm_obsolete_stats_arg {
 	/* txg of the log currently being counted; 0 before the first entry */
 	uint64_t lsos_current_txg;
 
 	/* totals across all logs visited */
 	uint64_t lsos_total_entries;
 	uint64_t lsos_valid_entries;
 
 	/* counts for the current log only; reset when the txg changes */
 	uint64_t lsos_sm_entries;
 	uint64_t lsos_valid_sm_entries;
 } log_sm_obsolete_stats_arg_t;
 
 /*
  * Per-entry callback for iterate_through_spacemap_logs(). arg points to
  * a log_sm_obsolete_stats_arg_t.
  *
  * Every entry is counted. It is additionally counted as valid when its
  * top-level vdev is concrete and txg is not older than the unflushed
  * txg of the metaslab that the entry falls in. When an entry of a newer
  * txg shows up, the counts of the previous log are printed and reset.
  * Always returns 0.
  */
 static int
 log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme,
     uint64_t txg, void *arg)
 {
 	log_sm_obsolete_stats_arg_t *stats = arg;
 
 	if (stats->lsos_current_txg == 0) {
 		/* this is the first log */
 		stats->lsos_current_txg = txg;
 	} else if (stats->lsos_current_txg < txg) {
 		/* we just changed log - print stats and reset */
 		(void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
 		    (u_longlong_t)stats->lsos_valid_sm_entries,
 		    (u_longlong_t)stats->lsos_sm_entries,
 		    (u_longlong_t)stats->lsos_current_txg);
 		stats->lsos_valid_sm_entries = 0;
 		stats->lsos_sm_entries = 0;
 		stats->lsos_current_txg = txg;
 	}
 	ASSERT3U(stats->lsos_current_txg, ==, txg);
 
 	stats->lsos_sm_entries++;
 	stats->lsos_total_entries++;
 
 	vdev_t *top = vdev_lookup_top(spa, sme->sme_vdev);
 	if (!vdev_is_concrete(top))
 		return (0);
 
 	metaslab_t *msp = top->vdev_ms[sme->sme_offset >> top->vdev_ms_shift];
 	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
 
 	if (txg >= metaslab_unflushed_txg(msp)) {
 		stats->lsos_valid_sm_entries++;
 		stats->lsos_valid_entries++;
 	}
 	return (0);
 }
 
 /*
  * Print, for each log space map and in total, how many of its entries
  * log_spacemap_obsolete_stats_cb() counted as valid. Prints nothing
  * when the log_spacemap feature is not active.
  */
 static void
 dump_log_spacemap_obsolete_stats(spa_t *spa)
 {
 	log_sm_obsolete_stats_arg_t stats = {0};
 
 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
 		return;
 
 	(void) printf("Log Space Map Obsolete Entry Statistics:\n");
 
 	iterate_through_spacemap_logs(spa,
 	    log_spacemap_obsolete_stats_cb, &stats);
 
 	/* the callback only prints a log once the next one starts */
 	(void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
 	    (u_longlong_t)stats.lsos_valid_sm_entries,
 	    (u_longlong_t)stats.lsos_sm_entries,
 	    (u_longlong_t)stats.lsos_current_txg);
 
 	/* grand total over all logs */
 	(void) printf("%-8llu valid entries out of %-8llu - total\n\n",
 	    (u_longlong_t)stats.lsos_valid_entries,
 	    (u_longlong_t)stats.lsos_total_entries);
 }
 
 /*
  * Pool-level driver: run the dumps and verifications selected through
  * dump_opt[] against spa.
  *
  * rc carries the first verification failure. Most later verification
  * steps are skipped once it is non-zero, and if it is still non-zero at
  * the end the debug buffer is dumped and zdb exits with rc as status.
  * With -S only the simulated DDT is dumped.
  */
 static void
 dump_zpool(spa_t *spa)
 {
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	int rc = 0;
 
 	if (dump_opt['y']) {
 		livelist_metaslab_validate(spa);
 	}
 
 	if (dump_opt['S']) {
 		dump_simulated_ddt(spa);
 		return;
 	}
 
 	if (!dump_opt['e'] && dump_opt['C'] > 1) {
 		(void) printf("\nCached configuration:\n");
 		dump_nvlist(spa->spa_config, 8);
 	}
 
 	if (dump_opt['C'])
 		dump_config(spa);
 
 	if (dump_opt['u'])
 		dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");
 
 	if (dump_opt['D'])
 		dump_all_ddts(spa);
 
 	if (dump_opt['T'])
 		dump_brt(spa);
 
 	if (dump_opt['d'] > 2 || dump_opt['m'])
 		dump_metaslabs(spa);
 	if (dump_opt['M'])
 		dump_metaslab_groups(spa, dump_opt['M'] > 1);
 	if (dump_opt['d'] > 2 || dump_opt['m']) {
 		dump_log_spacemaps(spa);
 		dump_log_spacemap_obsolete_stats(spa);
 	}
 
 	if (dump_opt['d'] || dump_opt['i']) {
 		/* tracks the MOS objects seen by the dump code below */
 		mos_refd_objs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
 		    NULL, 0, 0);
 		dump_objset(dp->dp_meta_objset);
 
 		if (dump_opt['d'] >= 3) {
 			dump_full_bpobj(&spa->spa_deferred_bpobj,
 			    "Deferred frees", 0);
 			if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 				dump_full_bpobj(&dp->dp_free_bpobj,
 				    "Pool snapshot frees", 0);
 			}
 			if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
 				ASSERT(spa_feature_is_enabled(spa,
 				    SPA_FEATURE_DEVICE_REMOVAL));
 				dump_full_bpobj(&dp->dp_obsolete_bpobj,
 				    "Pool obsolete blocks", 0);
 			}
 
 			if (spa_feature_is_active(spa,
 			    SPA_FEATURE_ASYNC_DESTROY)) {
 				dump_bptree(spa->spa_meta_objset,
 				    dp->dp_bptree_obj,
 				    "Pool dataset frees");
 			}
 			dump_dtl(spa->spa_root_vdev, 0);
 		}
 
 		/*
 		 * UINT64_MAX marks a global feature whose consumers are
 		 * not counted; the ones reset to 0 below are compared
 		 * against their refcount further down.
 		 */
 		for (spa_feature_t f = 0; f < SPA_FEATURES; f++)
 			global_feature_count[f] = UINT64_MAX;
 		global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0;
 		global_feature_count[SPA_FEATURE_REDACTION_LIST_SPILL] = 0;
 		global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0;
 		global_feature_count[SPA_FEATURE_LIVELIST] = 0;
 
 		(void) dmu_objset_find(spa_name(spa), dump_one_objset,
 		    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
 
 		if (rc == 0 && !dump_opt['L'])
 			rc = dump_mos_leaks(spa);
 
 		for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
 			uint64_t refcount;
 
 			uint64_t *arr;
 			if (!(spa_feature_table[f].fi_flags &
 			    ZFEATURE_FLAG_PER_DATASET)) {
 				if (global_feature_count[f] == UINT64_MAX)
 					continue;
 				if (!spa_feature_is_enabled(spa, f)) {
 					ASSERT0(global_feature_count[f]);
 					continue;
 				}
 				arr = global_feature_count;
 			} else {
 				if (!spa_feature_is_enabled(spa, f)) {
 					ASSERT0(dataset_feature_count[f]);
 					continue;
 				}
 				arr = dataset_feature_count;
 			}
 			if (feature_get_refcount(spa, &spa_feature_table[f],
 			    &refcount) == ENOTSUP)
 				continue;
 			if (arr[f] != refcount) {
 				(void) printf("%s feature refcount mismatch: "
 				    "%lld consumers != %lld refcount\n",
 				    spa_feature_table[f].fi_uname,
 				    (longlong_t)arr[f], (longlong_t)refcount);
 				rc = 2;
 			} else {
 				(void) printf("Verified %s feature refcount "
 				    "of %llu is correct\n",
 				    spa_feature_table[f].fi_uname,
 				    (u_longlong_t)refcount);
 			}
 		}
 
 		if (rc == 0)
 			rc = verify_device_removal_feature_counts(spa);
 	}
 
 	if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
 		rc = dump_block_stats(spa);
 
 	if (rc == 0)
 		rc = verify_spacemap_refcounts(spa);
 
 	if (dump_opt['s'])
 		show_pool_stats(spa);
 
 	if (dump_opt['h'])
 		dump_history(spa);
 
 	if (rc == 0)
 		rc = verify_checkpoint(spa);
 
 	if (rc != 0) {
 		dump_debug_buffer();
 		zdb_exit(rc);
 	}
 }
 
 /*
  * Bits of the "flags" argument taken by the block dumping helpers below;
  * for example zdb_print_blkptr() byteswaps the block pointer when
  * ZDB_FLAG_BSWAP is set.
  */
 #define	ZDB_FLAG_CHECKSUM	0x0001
 #define	ZDB_FLAG_DECOMPRESS	0x0002
 #define	ZDB_FLAG_BSWAP		0x0004
 #define	ZDB_FLAG_GBH		0x0008
 #define	ZDB_FLAG_INDIRECT	0x0010
 #define	ZDB_FLAG_RAW		0x0020
 #define	ZDB_FLAG_PRINT_BLKPTR	0x0040
 #define	ZDB_FLAG_VERBOSE	0x0080
 
 /*
  * NOTE(review): presumably flagbits maps a flag character to one of the
  * ZDB_FLAG_* bits and flagbitstr collects the accepted flag characters;
  * neither is populated in this part of the file -- confirm at their
  * point of use.
  */
 static int flagbits[256];
 static char flagbitstr[16];
 
 /*
  * Print one block pointer on a line of its own.
  *
  * With ZDB_FLAG_BSWAP set in flags the block pointer is byteswapped
  * first. Note that the swap is done in place, i.e. the caller's copy is
  * modified despite the const qualifier.
  */
 static void
 zdb_print_blkptr(const blkptr_t *bp, int flags)
 {
 	char text[BP_SPRINTF_LEN];
 
 	if ((flags & ZDB_FLAG_BSWAP) != 0) {
 		byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
 	}
 
 	snprintf_blkptr(text, sizeof (text), bp);
 	(void) printf("%s\n", text);
 }
 
 /*
  * Print the first nbps block pointers of the array bp, one per line.
  */
 static void
 zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
 {
 	for (int idx = 0; idx < nbps; idx++) {
 		zdb_print_blkptr(&bp[idx], flags);
 	}
 }
 
 static void
 zdb_dump_gbh(void *buf, int flags)
 {
 	zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
 }
 
 /*
  * Write size bytes of buf unformatted to the file descriptor of stdout,
  * byteswapping the buffer in place first when ZDB_FLAG_BSWAP is set.
  *
  * write(2) may transfer fewer bytes than requested (e.g. when stdout is
  * a pipe), so keep writing until everything is out. A write that fails
  * or makes no progress trips the VERIFY.
  */
 static void
 zdb_dump_block_raw(void *buf, uint64_t size, int flags)
 {
 	if (flags & ZDB_FLAG_BSWAP)
 		byteswap_uint64_array(buf, size);
 
 	const char *next = buf;
 	uint64_t left = size;
 	int fd = fileno(stdout);
 
 	while (left > 0) {
 		ssize_t done = write(fd, next, left);
 		VERIFY(done > 0);
 		next += done;
 		left -= (uint64_t)done;
 	}
 }
 
 /*
  * Print a hex dump of buf: a header made of label and a column legend,
  * then one line per 16 bytes with the offset, two 64-bit words and the
  * same 16 bytes as characters (non-printable bytes are shown as '.').
  *
  * ZDB_FLAG_BSWAP in flags selects the byteswapped word layout. size is
  * in bytes; the dump proceeds two 64-bit words at a time.
  */
 static void
 zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
 {
 	uint64_t *d = (uint64_t *)buf;
 	unsigned nwords = size / sizeof (uint64_t);
 	int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
 	unsigned i, j;
 	const char *hdr;
 	const unsigned char *c;
 
 
 	if (do_bswap)
 		hdr = " 7 6 5 4 3 2 1 0   f e d c b a 9 8";
 	else
 		hdr = " 0 1 2 3 4 5 6 7   8 9 a b c d e f";
 
 	(void) printf("\n%s\n%6s   %s  0123456789abcdef\n", label, "", hdr);
 
 #ifdef _ZFS_LITTLE_ENDIAN
 	/* correct the endianness */
 	do_bswap = !do_bswap;
 #endif
 	for (i = 0; i < nwords; i += 2) {
 		(void) printf("%06llx:  %016llx  %016llx  ",
 		    (u_longlong_t)(i * sizeof (uint64_t)),
 		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
 		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
 
 		/*
 		 * Look at the bytes as unsigned char: passing a negative
 		 * plain char to isprint() is undefined behavior.
 		 */
 		c = (const unsigned char *)&d[i];
 		for (j = 0; j < 2 * sizeof (uint64_t); j++)
 			(void) printf("%c", isprint(c[j]) ? c[j] : '.');
 		(void) printf("\n");
 	}
 }
 
 /*
  * There are two acceptable formats:
  *	leaf_name	  - For example: c1t0d0 or /tmp/ztest.0a
  *	child[.child]*    - For example: 0.1.1
  *
  * The second form can be used to specify arbitrary vdevs anywhere
  * in the hierarchy.  For example, in a pool with a mirror of
  * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
  *
  * In the first form a child matches when path equals its full
  * vdev_path or the last component of it, or when that last component
  * ends in "s0" and the part before the "s0" is a prefix of path.
  * Children without a vdev_path are searched recursively.
  *
  * Returns the matching vdev, or NULL if vdev is NULL or nothing matches.
  */
 static vdev_t *
 zdb_vdev_lookup(vdev_t *vdev, const char *path)
 {
 	char *s, *p;
 	unsigned long child;
 
 	if (vdev == NULL)
 		return (NULL);
 
 	/*
 	 * First, assume the x.x.x.x format. The index is kept at full
 	 * width so that an oversized number is rejected below instead of
 	 * being truncated into a valid index.
 	 */
 	child = strtoul(path, &s, 10);
 	if (s == path || (s && *s != '.' && *s != '\0'))
 		goto name;
 	if (child >= vdev->vdev_children)
 		return (NULL);
 
 	vdev = vdev->vdev_child[child];
 	if (s && *s == '\0')
 		return (vdev);
 	return (zdb_vdev_lookup(vdev, s+1));
 
 name:
 	for (uint64_t i = 0; i < vdev->vdev_children; i++) {
 		vdev_t *vc = vdev->vdev_child[i];
 
 		if (vc->vdev_path == NULL) {
 			vc = zdb_vdev_lookup(vc, path);
 			if (vc == NULL)
 				continue;
 			else
 				return (vc);
 		}
 
 		/* p is the last component of the child's path */
 		p = strrchr(vc->vdev_path, '/');
 		p = p ? p + 1 : vc->vdev_path;
 
 		if (strcmp(vc->vdev_path, path) == 0)
 			return (vc);
 		if (strcmp(p, path) == 0)
 			return (vc);
 
 		/*
 		 * Allow the "s0" slice suffix to be left out. The last
 		 * component has to be longer than the suffix itself:
 		 * otherwise there is nothing left to compare (and every
 		 * name would match), or not even two characters to look at.
 		 */
 		size_t plen = strlen(p);
 		if (plen > 2 && strcmp(p + plen - 2, "s0") == 0 &&
 		    strncmp(p, path, plen - 2) == 0)
 			return (vc);
 	}
 
 	return (NULL);
 }
 
 /*
  * Write the name of the dataset with object number objset_id into
  * outstr.
  *
  * Returns 0 on success. If the dataset cannot be held, a message is
  * printed to stderr, outstr is left untouched and the error from
  * dsl_dataset_hold_obj() is returned.
  */
 static int
 name_from_objset_id(spa_t *spa, uint64_t objset_id, char *outstr)
 {
 	dsl_dataset_t *ds;
 	int error;
 
 	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
 
 	error = dsl_dataset_hold_obj(spa->spa_dsl_pool, objset_id,
 	    NULL, &ds);
 	if (error == 0) {
 		dsl_dataset_name(ds, outstr);
 		dsl_dataset_rele(ds, NULL);
 	} else {
 		(void) fprintf(stderr, "failed to hold objset %llu: %s\n",
 		    (u_longlong_t)objset_id, strerror(error));
 	}
 
 	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
 	return (error);
 }
 
 /*
  * Parse a "lsize[/psize]" string of hexadecimal numbers. sizes is
  * modified by the tokenizing. When the "/psize" part is missing, psize
  * is set to lsize.
  *
  * Returns B_FALSE without touching the outputs if sizes is NULL or has
  * no token. Otherwise both outputs are stored and the result tells
  * whether lsize >= psize > 0 holds.
  */
 static boolean_t
 zdb_parse_block_sizes(char *sizes, uint64_t *lsize, uint64_t *psize)
 {
 	char *saveptr = NULL;
 
 	if (sizes == NULL)
 		return (B_FALSE);
 
 	char *ltok = strtok_r(sizes, "/", &saveptr);
 	if (ltok == NULL)
 		return (B_FALSE);
 	char *ptok = strtok_r(NULL, "/", &saveptr);
 
 	*lsize = strtoull(ltok, NULL, 16);
 	if (ptok != NULL)
 		*psize = strtoull(ptok, NULL, 16);
 	else
 		*psize = *lsize;
 
 	return (*lsize >= *psize && *psize > 0);
 }
 
 /* 64-bit mask with only bit number ZIO_COMPRESS_<alg> set */
 #define	ZIO_COMPRESS_MASK(alg)	(1ULL << (ZIO_COMPRESS_##alg))
 
 /*
  * Try to decompress psize bytes of pabd into lsize bytes using
  * compression function cfunc.
  *
  * lbuf and lbuf2 are scratch buffers of at least lsize bytes each. With
  * ZDB_FLAG_VERBOSE set in flags the attempt is announced on stderr.
  *
  * Returns B_TRUE only if both decompressions succeed and leave the two
  * buffers with identical contents.
  */
 static boolean_t
 try_decompress_block(abd_t *pabd, uint64_t lsize, uint64_t psize,
     int flags, int cfunc, void *lbuf, void *lbuf2)
 {
 	boolean_t matched = B_FALSE;
 	abd_t first, second;
 
 	if (flags & ZDB_FLAG_VERBOSE) {
 		(void) fprintf(stderr,
 		    "Trying %05llx -> %05llx (%s)\n",
 		    (u_longlong_t)psize,
 		    (u_longlong_t)lsize,
 		    zio_compress_table[cfunc].ci_name);
 	}
 
 	/*
 	 * Pre-fill one buffer with zeros and the other with ones and
 	 * decompress into both. Bytes that decompression leaves
 	 * unwritten then differ between the buffers, which tells us
 	 * whether exactly lsize bytes were produced.
 	 */
 	memset(lbuf, 0x00, lsize);
 	memset(lbuf2, 0xff, lsize);
 
 	abd_get_from_buf_struct(&first, lbuf, lsize);
 	abd_get_from_buf_struct(&second, lbuf2, lsize);
 
 	if (zio_decompress_data(cfunc, pabd, &first, psize, lsize,
 	    NULL) == 0 &&
 	    zio_decompress_data(cfunc, pabd, &second, psize, lsize,
 	    NULL) == 0) {
 		if (memcmp(lbuf, lbuf2, lsize) == 0)
 			matched = B_TRUE;
 	}
 
 	abd_free(&second);
 	abd_free(&first);
 
 	return (matched);
 }
 
 /*
  * Brute-force decompression of a block whose compression function is not
  * known.  Each candidate function is tried at each candidate logical size
  * until try_decompress_block() accepts one; the decompressed data is then
  * in lbuf.
  *
  * When lsize == psize the logical size is unknown too, so every size from
  * lsize + SPA_MINBLOCKSIZE up to SPA_MAXBLOCKSIZE is tried in
  * SPA_MINBLOCKSIZE steps; otherwise only the given lsize is tried.
  * ZLE is tried last, and only if the ZDB_NO_ZLE environment variable is
  * not set.  The buf argument is unused.
  *
  * Returns the logical size that decompressed successfully, or
  * (uint64_t)-1 if nothing worked.
  */
 static uint64_t
 zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize,
     uint64_t psize, int flags)
 {
 	(void) buf;
 	const uint64_t start_lsize = lsize;
 	boolean_t zle_allowed = (getenv("ZDB_NO_ZLE") == NULL);
 	boolean_t matched = B_FALSE;
 	boolean_t zle_matched = B_FALSE;
 	void *scratch = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 	/* Zero-terminated list of compression functions to try, in order. */
 	int candidates[ZIO_COMPRESS_FUNCTIONS] = { 0 };
 	int *cand = candidates;
 	uint64_t limit = SPA_MAXBLOCKSIZE;
 	uint64_t skip;
 
 	/* Functions that are not added by the generic scan below. */
 	skip = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) |
 	    ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) |
 	    ZIO_COMPRESS_MASK(ZLE);
 	/* LZ4 and LZJB are placed first explicitly, so skip them later. */
 	skip |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB);
 	/*
 	 * All gzip levels share one decompressor; running it once per
 	 * brute-force attempt is enough, so levels 2-9 are skipped.
 	 */
 	skip |= ZIO_COMPRESS_MASK(GZIP_2) | ZIO_COMPRESS_MASK(GZIP_3) |
 	    ZIO_COMPRESS_MASK(GZIP_4) | ZIO_COMPRESS_MASK(GZIP_5) |
 	    ZIO_COMPRESS_MASK(GZIP_6) | ZIO_COMPRESS_MASK(GZIP_7) |
 	    ZIO_COMPRESS_MASK(GZIP_8) | ZIO_COMPRESS_MASK(GZIP_9);
 
 	*cand++ = ZIO_COMPRESS_LZ4;
 	*cand++ = ZIO_COMPRESS_LZJB;
 	for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
 		if ((skip & (1ULL << c)) != 0)
 			continue;
 		*cand++ = c;
 	}
 
 	/*
 	 * With SPA_MAXBLOCKSIZE at 16MB the search below can take a
 	 * while.  Progress is only reported when the user asked for it
 	 * with the 'v' flag, since constant progress output gets old.
 	 */
 	if (lsize != psize)
 		limit = lsize;
 	else
 		lsize += SPA_MINBLOCKSIZE;
 
 	while (lsize <= limit) {
 		cand = candidates;
 		while (*cand != 0 && !try_decompress_block(pabd, lsize,
 		    psize, flags, *cand, lbuf, scratch))
 			cand++;
 		/* A non-zero entry here is the function that worked. */
 		if (*cand != 0) {
 			matched = B_TRUE;
 			break;
 		}
 		lsize += SPA_MINBLOCKSIZE;
 	}
 
 	if (!matched && zle_allowed) {
 		for (lsize = start_lsize; lsize <= limit;
 		    lsize += SPA_MINBLOCKSIZE) {
 			if (!try_decompress_block(pabd, lsize, psize, flags,
 			    ZIO_COMPRESS_ZLE, lbuf, scratch))
 				continue;
 			zle_matched = B_TRUE;
 			break;
 		}
 	}
 	umem_free(scratch, SPA_MAXBLOCKSIZE);
 
 	if (zle_matched) {
 		printf("\nZLE decompression was selected. If you "
 		    "suspect the results are wrong,\ntry avoiding ZLE "
 		    "by setting and exporting ZDB_NO_ZLE=\"true\"\n");
 	}
 
 	/* Running past the limit means every attempt failed. */
 	if (lsize > limit)
 		return ((uint64_t)-1);
 	return (lsize);
 }
 
 /*
  * Read a block from a pool and print it out.  The pool is the one given
  * by 'spa'; the block descriptor passed in 'thing' has the syntax:
  *
  *	vdev_specifier:offset:[lsize/]psize[:flags]
  *
  *	vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
  *	offset         - offset, in hex, in bytes
  *	lsize/psize    - Logical and physical size, in hex, in bytes.  A
  *	                 single value is used for both.  Each must be a
  *	                 multiple of DEV_BSIZE and no larger than
  *	                 SPA_MAXBLOCKSIZE (the size of the I/O buffers).
  *	flags          - A string of characters specifying options
  *		 b: Decode a blkptr at given offset within block; the
  *		    offset is given in hex directly after the 'b', must be
  *		    a multiple of sizeof (blkptr_t), and the blkptr must
  *		    lie within the block
  *		 c: Calculate and display checksums
  *		 d: Decompress data before dumping
  *		 e: Byteswap data before dumping
  *		 g: Display data as a gang block header
  *		 i: Display as an indirect block
  *		 r: Dump raw data to stdout
  *		 v: Verbose
  *
  * Errors are reported on stdout and the function simply returns; nothing
  * is returned to the caller.  'thing' itself is not modified (a copy is
  * tokenized).
  */
 static void
 zdb_read_block(char *thing, spa_t *spa)
 {
 	blkptr_t blk, *bp = &blk;
 	dva_t *dva = bp->blk_dva;
 	int flags = 0;
 	uint64_t offset = 0, psize = 0, lsize = 0, blkptr_offset = 0;
 	zio_t *zio;
 	vdev_t *vd;
 	abd_t *pabd;
 	void *lbuf, *buf;
 	char *s, *p, *dup, *flagstr, *sizes, *tmp = NULL;
 	const char *vdev, *errmsg = NULL;
 	int i, len, error;
 	boolean_t borrowed = B_FALSE, found = B_FALSE;
 
 	/* Split the descriptor into vdev, offset, sizes and flags. */
 	dup = strdup(thing);
 	s = strtok_r(dup, ":", &tmp);
 	vdev = s ?: "";
 	s = strtok_r(NULL, ":", &tmp);
 	offset = strtoull(s ? s : "", NULL, 16);
 	sizes = strtok_r(NULL, ":", &tmp);
 	s = strtok_r(NULL, ":", &tmp);
 	flagstr = strdup(s ?: "");
 
 	/* Later checks override earlier messages; only the last is shown. */
 	if (!zdb_parse_block_sizes(sizes, &lsize, &psize))
 		errmsg = "invalid size(s)";
 	/*
 	 * The I/O and decompression buffers below are SPA_MAXBLOCKSIZE
 	 * bytes, so larger sizes would overrun them.
 	 */
 	if (lsize > SPA_MAXBLOCKSIZE || psize > SPA_MAXBLOCKSIZE)
 		errmsg = "size exceeds the maximum block size";
 	if (!IS_P2ALIGNED(psize, DEV_BSIZE) || !IS_P2ALIGNED(lsize, DEV_BSIZE))
 		errmsg = "size must be a multiple of sector size";
 	if (!IS_P2ALIGNED(offset, DEV_BSIZE))
 		errmsg = "offset must be a multiple of sector size";
 	if (errmsg) {
 		(void) printf("Invalid block specifier: %s  - %s\n",
 		    thing, errmsg);
 		goto done;
 	}
 
 	tmp = NULL;
 	for (s = strtok_r(flagstr, ":", &tmp);
 	    s != NULL;
 	    s = strtok_r(NULL, ":", &tmp)) {
 		len = strlen(flagstr);
 		for (i = 0; i < len; i++) {
 			int bit = flagbits[(uchar_t)flagstr[i]];
 
 			if (bit == 0) {
 				(void) printf("***Ignoring flag: %c\n",
 				    (uchar_t)flagstr[i]);
 				continue;
 			}
 			found = B_TRUE;
 			flags |= bit;
 
 			p = &flagstr[i + 1];
 			if (*p != ':' && *p != '\0') {
 				int j = 0, nextbit = flagbits[(uchar_t)*p];
 				char *end, offstr[8] = { 0 };
 				if ((bit == ZDB_FLAG_PRINT_BLKPTR) &&
 				    (nextbit == 0)) {
 					/* look ahead to isolate the offset */
 					while (nextbit == 0 &&
 					    strchr(flagbitstr, *p) == NULL) {
 						/*
 						 * Keep the terminating NUL;
 						 * a longer offset would
 						 * overflow offstr.
 						 */
 						if (j >=
 						    (int)sizeof (offstr) - 1) {
 							(void) printf("Block "
 							    "pointer offset "
 							    "too long in "
 							    "flags: '%s'\n",
 							    flagstr);
 							goto done;
 						}
 						offstr[j] = *p;
 						j++;
 						if (i + j > strlen(flagstr))
 							break;
 						p++;
 						nextbit = flagbits[(uchar_t)*p];
 					}
 					blkptr_offset = strtoull(offstr, &end,
 					    16);
 					/* Skip the characters just consumed. */
 					i += j;
 				} else if (nextbit == 0) {
 					(void) printf("***Ignoring flag arg:"
 					    " '%c'\n", (uchar_t)*p);
 				}
 			}
 		}
 	}
 	if (blkptr_offset % sizeof (blkptr_t)) {
 		printf("Block pointer offset 0x%llx "
 		    "must be divisible by 0x%x\n",
 		    (longlong_t)blkptr_offset, (int)sizeof (blkptr_t));
 		goto done;
 	}
 	if (found == B_FALSE && strlen(flagstr) > 0) {
 		printf("Invalid flag arg: '%s'\n", flagstr);
 		goto done;
 	}
 
 	vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
 	if (vd == NULL) {
 		(void) printf("***Invalid vdev: %s\n", vdev);
 		goto done;
 	} else {
 		if (vd->vdev_path)
 			(void) fprintf(stderr, "Found vdev: %s\n",
 			    vd->vdev_path);
 		else
 			(void) fprintf(stderr, "Found vdev type: %s\n",
 			    vd->vdev_ops->vdev_op_type);
 	}
 
 	pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
 	lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 
 	/*
 	 * Fabricate a block pointer describing the requested region:
 	 * uncompressed, unchecksummed, one DVA on the chosen vdev.
 	 */
 	BP_ZERO(bp);
 
 	DVA_SET_VDEV(&dva[0], vd->vdev_id);
 	DVA_SET_OFFSET(&dva[0], offset);
 	DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
 	DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
 
 	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
 
 	BP_SET_LSIZE(bp, lsize);
 	BP_SET_PSIZE(bp, psize);
 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
 	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
 	BP_SET_TYPE(bp, DMU_OT_NONE);
 	BP_SET_LEVEL(bp, 0);
 	BP_SET_DEDUP(bp, 0);
 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	zio = zio_root(spa, NULL, NULL, 0);
 
 	if (vd == vd->vdev_top) {
 		/*
 		 * Treat this as a normal block read.
 		 */
 		zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
 		    ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
 	} else {
 		/*
 		 * Treat this as a vdev child I/O.
 		 */
 		zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
 		    psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL,
 		    NULL, NULL));
 	}
 
 	error = zio_wait(zio);
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	if (error) {
 		(void) printf("Read of %s failed, error: %d\n", thing, error);
 		goto out;
 	}
 
 	/* lsize may be replaced by the decompressed size below. */
 	uint64_t orig_lsize = lsize;
 	buf = lbuf;
 	if (flags & ZDB_FLAG_DECOMPRESS) {
 		lsize = zdb_decompress_block(pabd, buf, lbuf,
 		    lsize, psize, flags);
 		if (lsize == -1) {
 			(void) printf("Decompress of %s failed\n", thing);
 			goto out;
 		}
 	} else {
 		buf = abd_borrow_buf_copy(pabd, lsize);
 		borrowed = B_TRUE;
 	}
 
 	/*
 	 * A block pointer can only be decoded if it lies entirely inside
 	 * the data we hold; otherwise buf + blkptr_offset would point past
 	 * the end of the buffer.
 	 */
 	if ((flags & ZDB_FLAG_PRINT_BLKPTR) &&
 	    (blkptr_offset > lsize ||
 	    lsize - blkptr_offset < sizeof (blkptr_t))) {
 		(void) printf("Block pointer offset 0x%llx is beyond the "
 		    "end of the block (size 0x%llx)\n",
 		    (u_longlong_t)blkptr_offset, (u_longlong_t)lsize);
 		if (borrowed)
 			abd_return_buf_copy(pabd, buf, lsize);
 		goto out;
 	}
 
 	/*
 	 * Try to detect invalid block pointer.  If invalid, try
 	 * decompressing.
 	 */
 	if ((flags & ZDB_FLAG_PRINT_BLKPTR || flags & ZDB_FLAG_INDIRECT) &&
 	    !(flags & ZDB_FLAG_DECOMPRESS)) {
 		const blkptr_t *b = (const blkptr_t *)(void *)
 		    ((uintptr_t)buf + (uintptr_t)blkptr_offset);
 		if (zfs_blkptr_verify(spa, b,
 		    BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY) == B_FALSE) {
 			/* Give the borrowed copy back; switch to lbuf. */
 			abd_return_buf_copy(pabd, buf, lsize);
 			borrowed = B_FALSE;
 			buf = lbuf;
 			lsize = zdb_decompress_block(pabd, buf,
 			    lbuf, lsize, psize, flags);
 			b = (const blkptr_t *)(void *)
 			    ((uintptr_t)buf + (uintptr_t)blkptr_offset);
 			if (lsize == -1 || zfs_blkptr_verify(spa, b,
 			    BLK_CONFIG_NEEDED, BLK_VERIFY_LOG) == B_FALSE) {
 				printf("invalid block pointer at this DVA\n");
 				goto out;
 			}
 		}
 	}
 
 	/* Exactly one output format is used, in this order of precedence. */
 	if (flags & ZDB_FLAG_PRINT_BLKPTR)
 		zdb_print_blkptr((blkptr_t *)(void *)
 		    ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
 	else if (flags & ZDB_FLAG_RAW)
 		zdb_dump_block_raw(buf, lsize, flags);
 	else if (flags & ZDB_FLAG_INDIRECT)
 		zdb_dump_indirect((blkptr_t *)buf,
 		    orig_lsize / sizeof (blkptr_t), flags);
 	else if (flags & ZDB_FLAG_GBH)
 		zdb_dump_gbh(buf, flags);
 	else
 		zdb_dump_block(thing, buf, lsize, flags);
 
 	/*
 	 * If :c was specified, iterate through the checksum table to
 	 * calculate and display each checksum for our specified
 	 * DVA and length.
 	 */
 	if ((flags & ZDB_FLAG_CHECKSUM) && !(flags & ZDB_FLAG_RAW) &&
 	    !(flags & ZDB_FLAG_GBH)) {
 		zio_t *czio;
 		(void) printf("\n");
 		for (enum zio_checksum ck = ZIO_CHECKSUM_LABEL;
 		    ck < ZIO_CHECKSUM_FUNCTIONS; ck++) {
 
 			if ((zio_checksum_table[ck].ci_flags &
 			    ZCHECKSUM_FLAG_EMBEDDED) ||
 			    ck == ZIO_CHECKSUM_NOPARITY) {
 				continue;
 			}
 			BP_SET_CHECKSUM(bp, ck);
 			spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 			czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 			if (vd == vd->vdev_top) {
 				zio_nowait(zio_read(czio, spa, bp, pabd, psize,
 				    NULL, NULL,
 				    ZIO_PRIORITY_SYNC_READ,
 				    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
 				    ZIO_FLAG_DONT_RETRY, NULL));
 			} else {
 				zio_nowait(zio_vdev_child_io(czio, bp, vd,
 				    offset, pabd, psize, ZIO_TYPE_READ,
 				    ZIO_PRIORITY_SYNC_READ,
 				    ZIO_FLAG_DONT_PROPAGATE |
 				    ZIO_FLAG_DONT_RETRY |
 				    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
 				    ZIO_FLAG_SPECULATIVE |
 				    ZIO_FLAG_OPTIONAL, NULL, NULL));
 			}
 			error = zio_wait(czio);
 			/* A checksum mismatch still leaves data to hash. */
 			if (error == 0 || error == ECKSUM) {
 				zio_t *ck_zio = zio_null(NULL, spa, NULL,
 				    NULL, NULL, 0);
 				ck_zio->io_offset =
 				    DVA_GET_OFFSET(&bp->blk_dva[0]);
 				ck_zio->io_bp = bp;
 				zio_checksum_compute(ck_zio, ck, pabd, lsize);
 				printf(
 				    "%12s\t"
 				    "cksum=%016llx:%016llx:%016llx:%016llx\n",
 				    zio_checksum_table[ck].ci_name,
 				    (u_longlong_t)bp->blk_cksum.zc_word[0],
 				    (u_longlong_t)bp->blk_cksum.zc_word[1],
 				    (u_longlong_t)bp->blk_cksum.zc_word[2],
 				    (u_longlong_t)bp->blk_cksum.zc_word[3]);
 				zio_wait(ck_zio);
 			} else {
 				printf("error %d reading block\n", error);
 			}
 			spa_config_exit(spa, SCL_STATE, FTAG);
 		}
 	}
 
 	if (borrowed)
 		abd_return_buf_copy(pabd, buf, lsize);
 
 out:
 	abd_free(pabd);
 	umem_free(lbuf, SPA_MAXBLOCKSIZE);
 done:
 	free(flagstr);
 	free(dup);
 }
 
 /*
  * Decode an embedded block pointer and dump its payload raw.
  *
  * 'thing' must hold the block pointer as 16 colon-separated hexadecimal
  * words.  On a malformed string, allocation failure or decode failure a
  * message is written to stderr and zdb_exit(1) is called.
  */
 static void
 zdb_embedded_block(char *thing)
 {
 	blkptr_t bp = {{{{0}}}};
 	/* View the block pointer as the 16 words that sscanf fills in. */
 	unsigned long long *w = (void *)&bp;
 	char *payload;
 	int rc;
 
 	rc = sscanf(thing,
 	    "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"
 	    "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",
 	    &w[0], &w[1], &w[2], &w[3], &w[4], &w[5], &w[6], &w[7],
 	    &w[8], &w[9], &w[10], &w[11], &w[12], &w[13], &w[14], &w[15]);
 	if (rc != 16) {
 		(void) fprintf(stderr, "invalid input format\n");
 		zdb_exit(1);
 	}
 
 	/* The output buffer below is SPA_MAXBLOCKSIZE bytes. */
 	ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
 	payload = malloc(SPA_MAXBLOCKSIZE);
 	if (payload == NULL) {
 		(void) fprintf(stderr, "out of memory\n");
 		zdb_exit(1);
 	}
 
 	rc = decode_embedded_bp(&bp, payload, BPE_GET_LSIZE(&bp));
 	if (rc != 0) {
 		(void) fprintf(stderr, "decode failed: %u\n", rc);
 		zdb_exit(1);
 	}
 
 	zdb_dump_block_raw(payload, BPE_GET_LSIZE(&bp), 0);
 	free(payload);
 }
 
 /*
  * Check for a valid hex or decimal numeric string.
  *
  * Returns B_TRUE when str is non-empty and consists solely of hexadecimal
  * digits, optionally preceded by "0x" or "0X".  A bare prefix with no
  * digits after it is not a number and yields B_FALSE, as does any other
  * character anywhere in the string.
  */
 static boolean_t
 zdb_numeric(char *str)
 {
 	size_t i = 0, len;
 
 	len = strlen(str);
 	if (len == 0)
 		return (B_FALSE);
 	if (strncmp(str, "0x", 2) == 0 || strncmp(str, "0X", 2) == 0) {
 		/* "0x" alone carries no digits. */
 		if (len == 2)
 			return (B_FALSE);
 		i = 2;
 	}
 	for (; i < len; i++) {
 		/*
 		 * isxdigit() requires a value representable as unsigned
 		 * char; a plain char may be negative.
 		 */
 		if (!isxdigit((unsigned char)str[i]))
 			return (B_FALSE);
 	}
 	return (B_TRUE);
 }
 
 /*
  * Placeholder file-info callback.  Returns ENOENT for every bonus type
  * other than DMU_OT_ZNODE and DMU_OT_SA; for those two it reports that
  * the operation is not implemented and aborts.  'data' and 'zoi' are
  * unused.
  */
 static int
 dummy_get_file_info(dmu_object_type_t bonustype, const void *data,
     zfs_file_info_t *zoi)
 {
 	(void) data;
 	(void) zoi;
 
 	if (bonustype == DMU_OT_ZNODE || bonustype == DMU_OT_SA) {
 		(void) fprintf(stderr,
 		    "dummy_get_file_info: not implemented");
 		abort();
 	}
 
 	return (ENOENT);
 }
 
 /*
  * zdb entry point.
  *
  * In order, this function: installs crash signal handlers, applies the
  * SPA_CONFIG_PATH environment override, parses command-line options into
  * dump_opt[] and related settings, adjusts a number of tunables, tries to
  * determine the pool's cachefile automatically, initializes the userland
  * kernel emulation, handles the options that need no open pool (-E, -C
  * without a target, -l), optionally imports the pool (-e or a forced
  * import), opens the target pool or dataset, and finally dispatches to
  * the requested dump routines or to zdb_read_block() for -R.
  *
  * Returns the value left in 'error' by the last operation that set it.
  */
 int
 main(int argc, char **argv)
 {
 	int c;
 	int dump_all = 1;
 	int verbose = 0;
 	int error = 0;
 	char **searchdirs = NULL;
 	int nsearch = 0;
 	char *target, *target_pool, dsname[ZFS_MAX_DATASET_NAME_LEN];
 	nvlist_t *policy = NULL;
 	uint64_t max_txg = UINT64_MAX;
 	int64_t objset_id = -1;
 	uint64_t object;
 	int flags = ZFS_IMPORT_MISSING_LOG;
 	int rewind = ZPOOL_NEVER_REWIND;
 	char *spa_config_path_env, *objset_str;
 	boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE;
 	nvlist_t *cfg = NULL;
 	struct sigaction action;
 	boolean_t force_import = B_FALSE;
 	boolean_t config_path_console = B_FALSE;
 	char pbuf[MAXPATHLEN];
 
 	dprintf_setup(&argc, argv);
 
 	/*
 	 * Set up signal handlers, so if we crash due to bad on-disk data we
 	 * can get more info. Unlike ztest, we don't bail out if we can't set
 	 * up signal handlers, because zdb is very useful without them.
 	 */
 	action.sa_handler = sig_handler;
 	sigemptyset(&action.sa_mask);
 	action.sa_flags = 0;
 	if (sigaction(SIGSEGV, &action, NULL) < 0) {
 		(void) fprintf(stderr, "zdb: cannot catch SIGSEGV: %s\n",
 		    strerror(errno));
 	}
 	if (sigaction(SIGABRT, &action, NULL) < 0) {
 		(void) fprintf(stderr, "zdb: cannot catch SIGABRT: %s\n",
 		    strerror(errno));
 	}
 
 	/*
 	 * If there is an environment variable SPA_CONFIG_PATH it overrides
 	 * default spa_config_path setting. If -U flag is specified it will
 	 * override this environment variable settings once again.
 	 */
 	spa_config_path_env = getenv("SPA_CONFIG_PATH");
 	if (spa_config_path_env != NULL)
 		spa_config_path = spa_config_path_env;
 
 	/*
 	 * For performance reasons, we set this tunable down. We do so before
 	 * the arg parsing section so that the user can override this value if
 	 * they choose.
 	 */
 	zfs_btree_verify_intensity = 3;
 
 	/* Long option names; each maps onto the short option in column 4. */
 	struct option long_options[] = {
 		{"ignore-assertions",	no_argument,		NULL, 'A'},
 		{"block-stats",		no_argument,		NULL, 'b'},
 		{"backup",		no_argument,		NULL, 'B'},
 		{"checksum",		no_argument,		NULL, 'c'},
 		{"config",		no_argument,		NULL, 'C'},
 		{"datasets",		no_argument,		NULL, 'd'},
 		{"dedup-stats",		no_argument,		NULL, 'D'},
 		{"exported",		no_argument,		NULL, 'e'},
 		{"embedded-block-pointer",	no_argument,	NULL, 'E'},
 		{"automatic-rewind",	no_argument,		NULL, 'F'},
 		{"dump-debug-msg",	no_argument,		NULL, 'G'},
 		{"history",		no_argument,		NULL, 'h'},
 		{"intent-logs",		no_argument,		NULL, 'i'},
 		{"inflight",		required_argument,	NULL, 'I'},
 		{"checkpointed-state",	no_argument,		NULL, 'k'},
 		{"key",			required_argument,	NULL, 'K'},
 		{"label",		no_argument,		NULL, 'l'},
 		{"disable-leak-tracking",	no_argument,	NULL, 'L'},
 		{"metaslabs",		no_argument,		NULL, 'm'},
 		{"metaslab-groups",	no_argument,		NULL, 'M'},
 		{"numeric",		no_argument,		NULL, 'N'},
 		{"option",		required_argument,	NULL, 'o'},
 		{"object-lookups",	no_argument,		NULL, 'O'},
 		{"path",		required_argument,	NULL, 'p'},
 		{"parseable",		no_argument,		NULL, 'P'},
 		{"skip-label",		no_argument,		NULL, 'q'},
 		{"copy-object",		no_argument,		NULL, 'r'},
 		{"read-block",		no_argument,		NULL, 'R'},
 		{"io-stats",		no_argument,		NULL, 's'},
 		{"simulate-dedup",	no_argument,		NULL, 'S'},
 		{"txg",			required_argument,	NULL, 't'},
 		{"brt-stats",		no_argument,		NULL, 'T'},
 		{"uberblock",		no_argument,		NULL, 'u'},
 		{"cachefile",		required_argument,	NULL, 'U'},
 		{"verbose",		no_argument,		NULL, 'v'},
 		{"verbatim",		no_argument,		NULL, 'V'},
 		{"dump-blocks",		required_argument,	NULL, 'x'},
 		{"extreme-rewind",	no_argument,		NULL, 'X'},
 		{"all-reconstruction",	no_argument,		NULL, 'Y'},
 		{"livelist",		no_argument,		NULL, 'y'},
 		{"zstd-headers",	no_argument,		NULL, 'Z'},
 		{0, 0, 0, 0}
 	};
 
 	while ((c = getopt_long(argc, argv,
 	    "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:TuU:vVx:XYyZ",
 	    long_options, NULL)) != -1) {
 		switch (c) {
 		/*
 		 * Options in this first group select specific output, so
 		 * they also turn off the default "dump everything" mode.
 		 */
 		case 'b':
 		case 'B':
 		case 'c':
 		case 'C':
 		case 'd':
 		case 'D':
 		case 'E':
 		case 'G':
 		case 'h':
 		case 'i':
 		case 'l':
 		case 'm':
 		case 'M':
 		case 'N':
 		case 'O':
 		case 'r':
 		case 'R':
 		case 's':
 		case 'S':
 		case 'T':
 		case 'u':
 		case 'y':
 		case 'Z':
 			dump_opt[c]++;
 			dump_all = 0;
 			break;
 		/* These are counted but leave dump_all untouched. */
 		case 'A':
 		case 'e':
 		case 'F':
 		case 'k':
 		case 'L':
 		case 'P':
 		case 'q':
 		case 'X':
 			dump_opt[c]++;
 			break;
 		case 'Y':
 			zfs_reconstruct_indirect_combinations_max = INT_MAX;
 			zfs_deadman_enabled = 0;
 			break;
 		/* NB: Sort single match options below. */
 		case 'I':
 			max_inflight_bytes = strtoull(optarg, NULL, 0);
 			if (max_inflight_bytes == 0) {
 				(void) fprintf(stderr, "maximum number "
 				    "of inflight bytes must be greater "
 				    "than 0\n");
 				usage();
 			}
 			break;
 		case 'K':
 			dump_opt[c]++;
 			key_material = strdup(optarg);
 			/* redact key material in process table */
 			while (*optarg != '\0') { *optarg++ = '*'; }
 			break;
 		case 'o':
 			error = set_global_var(optarg);
 			if (error != 0)
 				usage();
 			break;
 		case 'p':
 			/* Grow the search-directory array by one entry. */
 			if (searchdirs == NULL) {
 				searchdirs = umem_alloc(sizeof (char *),
 				    UMEM_NOFAIL);
 			} else {
 				char **tmp = umem_alloc((nsearch + 1) *
 				    sizeof (char *), UMEM_NOFAIL);
 				memcpy(tmp, searchdirs, nsearch *
 				    sizeof (char *));
 				umem_free(searchdirs,
 				    nsearch * sizeof (char *));
 				searchdirs = tmp;
 			}
 			searchdirs[nsearch++] = optarg;
 			break;
 		case 't':
 			max_txg = strtoull(optarg, NULL, 0);
 			if (max_txg < TXG_INITIAL) {
 				(void) fprintf(stderr, "incorrect txg "
 				    "specified: %s\n", optarg);
 				usage();
 			}
 			break;
 		case 'U':
 			config_path_console = B_TRUE;
 			spa_config_path = optarg;
 			if (spa_config_path[0] != '/') {
 				(void) fprintf(stderr,
 				    "cachefile must be an absolute path "
 				    "(i.e. start with a slash)\n");
 				usage();
 			}
 			break;
 		case 'v':
 			verbose++;
 			break;
 		case 'V':
 			flags = ZFS_IMPORT_VERBATIM;
 			break;
 		case 'x':
 			vn_dumpdir = optarg;
 			break;
 		default:
 			usage();
 			break;
 		}
 	}
 
 	if (!dump_opt['e'] && searchdirs != NULL) {
 		(void) fprintf(stderr, "-p option requires use of -e\n");
 		usage();
 	}
 #if defined(_LP64)
 	/*
 	 * ZDB does not typically re-read blocks; therefore limit the ARC
 	 * to 256 MB, which can be used entirely for metadata.
 	 */
 	zfs_arc_min = 2ULL << SPA_MAXBLOCKSHIFT;
 	zfs_arc_max = 256 * 1024 * 1024;
 #endif
 
 	/*
 	 * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
 	 * "zdb -b" uses traversal prefetch which uses async reads.
 	 * For good performance, let several of them be active at once.
 	 */
 	zfs_vdev_async_read_max_active = 10;
 
 	/*
 	 * Disable reference tracking for better performance.
 	 */
 	reference_tracking_enable = B_FALSE;
 
 	/*
 	 * Do not fail spa_load when spa_load_verify fails. This is needed
 	 * to load non-idle pools.
 	 */
 	spa_load_verify_dryrun = B_TRUE;
 
 	/*
 	 * ZDB should have ability to read spacemaps.
 	 */
 	spa_mode_readable_spacemaps = B_TRUE;
 
 	if (dump_all)
 		verbose = MAX(verbose, 1);
 
 	/*
 	 * In dump-all mode enable every option except those listed in the
 	 * string below, then add the verbosity to each enabled option.
 	 */
 	for (c = 0; c < 256; c++) {
 		if (dump_all && strchr("ABeEFkKlLNOPrRSXy", c) == NULL)
 			dump_opt[c] = 1;
 		if (dump_opt[c])
 			dump_opt[c] += verbose;
 	}
 
 	libspl_set_assert_ok((dump_opt['A'] == 1) || (dump_opt['A'] > 2));
 	zfs_recover = (dump_opt['A'] > 1);
 
 	/* Skip past the options; argv[0] is now the first operand. */
 	argc -= optind;
 	argv += optind;
 	if (argc < 2 && dump_opt['R'])
 		usage();
 
 	target = argv[0];
 
 	/*
 	 * Automate cachefile
 	 */
 	if (!spa_config_path_env && !config_path_console && target &&
 	    libzfs_core_init() == 0) {
 		char *pname = strdup(target);
 		const char *value;
 		nvlist_t *pnvl = NULL;
 		nvlist_t *vnvl = NULL;
 
 		/* Reduce the target to its pool-name component. */
 		if (strpbrk(pname, "/@") != NULL)
 			*strpbrk(pname, "/@") = '\0';
 
 		if (pname && lzc_get_props(pname, &pnvl) == 0) {
 			if (nvlist_lookup_nvlist(pnvl, "cachefile",
 			    &vnvl) == 0) {
 				value = fnvlist_lookup_string(vnvl,
 				    ZPROP_VALUE);
 			} else {
 				value = "-";
 			}
 			strlcpy(pbuf, value, sizeof (pbuf));
 			if (pbuf[0] != '\0') {
 				if (pbuf[0] == '/') {
 					if (access(pbuf, F_OK) == 0)
 						spa_config_path = pbuf;
 					else
 						force_import = B_TRUE;
 				} else if ((strcmp(pbuf, "-") == 0 &&
 				    access(ZPOOL_CACHE, F_OK) != 0) ||
 				    strcmp(pbuf, "none") == 0) {
 					force_import = B_TRUE;
 				}
 			}
 			nvlist_free(vnvl);
 		}
 
 		free(pname);
 		nvlist_free(pnvl);
 		libzfs_core_fini();
 	}
 
 	dmu_objset_register_type(DMU_OST_ZFS, dummy_get_file_info);
 	kernel_init(SPA_MODE_READ);
 	kernel_init_done = B_TRUE;
 
 	/* -E decodes the operand directly and needs no pool. */
 	if (dump_opt['E']) {
 		if (argc != 1)
 			usage();
 		zdb_embedded_block(argv[0]);
 		error = 0;
 		goto fini;
 	}
 
 	/* With no operand, only a cachefile dump (-C without -e) works. */
 	if (argc < 1) {
 		if (!dump_opt['e'] && dump_opt['C']) {
 			dump_cachefile(spa_config_path);
 			error = 0;
 			goto fini;
 		}
 		usage();
 	}
 
 	if (dump_opt['l']) {
 		error = dump_label(argv[0]);
 		goto fini;
 	}
 
 	if (dump_opt['X'] || dump_opt['F'])
 		rewind = ZPOOL_DO_REWIND |
 		    (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
 
 	/* -N implies -d */
 	if (dump_opt['N'] && dump_opt['d'] == 0)
 		dump_opt['d'] = dump_opt['N'];
 
 	/* Load policy carrying the requested txg and rewind mode. */
 	if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
 	    nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 ||
 	    nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0)
 		fatal("internal error: %s", strerror(ENOMEM));
 
 	error = 0;
 
 	/* A '/' or '@' in the target means it names more than a pool. */
 	if (strpbrk(target, "/@") != NULL) {
 		size_t targetlen;
 
 		target_pool = strdup(target);
 		*strpbrk(target_pool, "/@") = '\0';
 
 		target_is_spa = B_FALSE;
 		targetlen = strlen(target);
 		/* Drop a single trailing slash from the target. */
 		if (targetlen && target[targetlen - 1] == '/')
 			target[targetlen - 1] = '\0';
 
 		/*
 		 * See if an objset ID was supplied (-d <pool>/<objset ID>).
 		 * To disambiguate tank/100, consider the 100 as objsetID
 		 * if -N was given, otherwise 100 is an objsetID iff
 		 * tank/100 as a named dataset fails on lookup.
 		 */
 		objset_str = strchr(target, '/');
 		if (objset_str && strlen(objset_str) > 1 &&
 		    zdb_numeric(objset_str + 1)) {
 			char *endptr;
 			errno = 0;
 			objset_str++;
 			objset_id = strtoull(objset_str, &endptr, 0);
 			/* dataset 0 is the same as opening the pool */
 			if (errno == 0 && endptr != objset_str &&
 			    objset_id != 0) {
 				if (dump_opt['N'])
 					dataset_lookup = B_TRUE;
 			}
 			/* normal dataset name not an objset ID */
 			if (endptr == objset_str) {
 				objset_id = -1;
 			}
 		} else if (objset_str && !zdb_numeric(objset_str + 1) &&
 		    dump_opt['N']) {
 			printf("Supply a numeric objset ID with -N\n");
 			error = 1;
 			goto fini;
 		}
 	} else {
 		target_pool = target;
 	}
 
 	if (dump_opt['e'] || force_import) {
 		importargs_t args = { 0 };
 
 		/*
 		 * If path is not provided, search in /dev
 		 */
 		if (searchdirs == NULL) {
 			searchdirs = umem_alloc(sizeof (char *), UMEM_NOFAIL);
 			searchdirs[nsearch++] = (char *)ZFS_DEVDIR;
 		}
 
 		args.paths = nsearch;
 		args.path = searchdirs;
 		args.can_be_active = B_TRUE;
 
 		libpc_handle_t lpch = {
 			.lpc_lib_handle = NULL,
 			.lpc_ops = &libzpool_config_ops,
 			.lpc_printerr = B_TRUE
 		};
 		error = zpool_find_config(&lpch, target_pool, &cfg, &args);
 
 		if (error == 0) {
 
 			if (nvlist_add_nvlist(cfg,
 			    ZPOOL_LOAD_POLICY, policy) != 0) {
 				fatal("can't open '%s': %s",
 				    target, strerror(ENOMEM));
 			}
 
 			if (dump_opt['C'] > 1) {
 				(void) printf("\nConfiguration for import:\n");
 				dump_nvlist(cfg, 8);
 			}
 
 			/*
 			 * Disable the activity check to allow examination of
 			 * active pools.
 			 */
 			error = spa_import(target_pool, cfg, NULL,
 			    flags | ZFS_IMPORT_SKIP_MMP);
 		}
 	}
 
 	if (searchdirs != NULL) {
 		umem_free(searchdirs, nsearch * sizeof (char *));
 		searchdirs = NULL;
 	}
 
 	/*
 	 * We need to make sure to process -O option or call
 	 * dump_path after the -e option has been processed,
 	 * which imports the pool to the namespace if it's
 	 * not in the cachefile.
 	 */
 	if (dump_opt['O']) {
 		if (argc != 2)
 			usage();
 		dump_opt['v'] = verbose + 3;
 		error = dump_path(argv[0], argv[1], NULL);
 		goto fini;
 	}
 
 	/* -r: resolve the path to an object number for the copy below. */
 	if (dump_opt['r']) {
 		target_is_spa = B_FALSE;
 		if (argc != 3)
 			usage();
 		dump_opt['v'] = verbose;
 		error = dump_path(argv[0], argv[1], &object);
 		if (error != 0)
 			fatal("internal error: %s", strerror(error));
 	}
 
 	/*
 	 * import_checkpointed_state makes the assumption that the
 	 * target pool that we pass it is already part of the spa
 	 * namespace. Because of that we need to make sure to call
 	 * it always after the -e option has been processed, which
 	 * imports the pool to the namespace if it's not in the
 	 * cachefile.
 	 */
 	char *checkpoint_pool = NULL;
 	char *checkpoint_target = NULL;
 	if (dump_opt['k']) {
 		checkpoint_pool = import_checkpointed_state(target, cfg,
 		    &checkpoint_target);
 
 		if (checkpoint_target != NULL)
 			target = checkpoint_target;
 	}
 
 	if (cfg != NULL) {
 		nvlist_free(cfg);
 		cfg = NULL;
 	}
 
 	/* target_pool was strdup'ed only when it differs from target. */
 	if (target_pool != target)
 		free(target_pool);
 
 	if (error == 0) {
 		if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) {
 			ASSERT(checkpoint_pool != NULL);
 			ASSERT(checkpoint_target == NULL);
 
 			error = spa_open(checkpoint_pool, &spa, FTAG);
 			if (error != 0) {
 				fatal("Tried to open pool \"%s\" but "
 				    "spa_open() failed with error %d\n",
 				    checkpoint_pool, error);
 			}
 
 		} else if (target_is_spa || dump_opt['R'] || dump_opt['B'] ||
 		    objset_id == 0) {
 			zdb_set_skip_mmp(target);
 			error = spa_open_rewind(target, &spa, FTAG, policy,
 			    NULL);
 			if (error) {
 				/*
 				 * If we're missing the log device then
 				 * try opening the pool after clearing the
 				 * log state.
 				 */
 				mutex_enter(&spa_namespace_lock);
 				if ((spa = spa_lookup(target)) != NULL &&
 				    spa->spa_log_state == SPA_LOG_MISSING) {
 					spa->spa_log_state = SPA_LOG_CLEAR;
 					error = 0;
 				}
 				mutex_exit(&spa_namespace_lock);
 
 				if (!error) {
 					error = spa_open_rewind(target, &spa,
 					    FTAG, policy, NULL);
 				}
 			}
 		} else if (strpbrk(target, "#") != NULL) {
 			/* A '#' in the target selects a bookmark dump. */
 			dsl_pool_t *dp;
 			error = dsl_pool_hold(target, FTAG, &dp);
 			if (error != 0) {
 				fatal("can't dump '%s': %s", target,
 				    strerror(error));
 			}
 			error = dump_bookmark(dp, target, B_TRUE, verbose > 1);
 			dsl_pool_rele(dp, FTAG);
 			if (error != 0) {
 				fatal("can't dump '%s': %s", target,
 				    strerror(error));
 			}
 			goto fini;
 		} else {
 			target_pool = strdup(target);
 			if (strpbrk(target, "/@") != NULL)
 				*strpbrk(target_pool, "/@") = '\0';
 
 			zdb_set_skip_mmp(target);
 			/*
 			 * If -N was supplied, the user has indicated that
 			 * zdb -d <pool>/<objsetID> is in effect.  Otherwise
 			 * we first assume that the dataset string is the
 			 * dataset name.  If dmu_objset_hold fails with the
 			 * dataset string, and we have an objset_id, retry the
 			 * lookup with the objsetID.
 			 */
 			boolean_t retry = B_TRUE;
 retry_lookup:
 			if (dataset_lookup == B_TRUE) {
 				/*
 				 * Use the supplied id to get the name
 				 * for open_objset.
 				 */
 				error = spa_open(target_pool, &spa, FTAG);
 				if (error == 0) {
 					error = name_from_objset_id(spa,
 					    objset_id, dsname);
 					spa_close(spa, FTAG);
 					if (error == 0)
 						target = dsname;
 				}
 			}
 			if (error == 0) {
 				/*
 				 * Probe the name once; on failure fall back
 				 * to the objset ID (retry is cleared so this
 				 * happens at most once).
 				 */
 				if (objset_id > 0 && retry) {
 					int err = dmu_objset_hold(target, FTAG,
 					    &os);
 					if (err) {
 						dataset_lookup = B_TRUE;
 						retry = B_FALSE;
 						goto retry_lookup;
 					} else {
 						dmu_objset_rele(os, FTAG);
 					}
 				}
 				error = open_objset(target, FTAG, &os);
 			}
 			if (error == 0)
 				spa = dmu_objset_spa(os);
 			free(target_pool);
 		}
 	}
 	nvlist_free(policy);
 
 	if (error)
 		fatal("can't open '%s': %s", target, strerror(error));
 
 	/*
 	 * Set the pool failure mode to panic in order to prevent the pool
 	 * from suspending.  A suspended I/O will have no way to resume and
 	 * can prevent the zdb(8) command from terminating as expected.
 	 */
 	if (spa != NULL)
 		spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
 
 	/* Step past the target; the remaining operands follow. */
 	argv++;
 	argc--;
 	if (dump_opt['r']) {
 		error = zdb_copy_object(os, object, argv[1]);
 	} else if (!dump_opt['R']) {
 		/* Flag letters here select object types for -d ranges. */
 		flagbits['d'] = ZOR_FLAG_DIRECTORY;
 		flagbits['f'] = ZOR_FLAG_PLAIN_FILE;
 		flagbits['m'] = ZOR_FLAG_SPACE_MAP;
 		flagbits['z'] = ZOR_FLAG_ZAP;
 		flagbits['A'] = ZOR_FLAG_ALL_TYPES;
 
 		if (argc > 0 && dump_opt['d']) {
 			zopt_object_args = argc;
 			zopt_object_ranges = calloc(zopt_object_args,
 			    sizeof (zopt_object_range_t));
 			for (unsigned i = 0; i < zopt_object_args; i++) {
 				int err;
 				const char *msg = NULL;
 
 				err = parse_object_range(argv[i],
 				    &zopt_object_ranges[i], &msg);
 				if (err != 0)
 					fatal("Bad object or range: '%s': %s\n",
 					    argv[i], msg ?: "");
 			}
 		} else if (argc > 0 && dump_opt['m']) {
 			zopt_metaslab_args = argc;
 			zopt_metaslab = calloc(zopt_metaslab_args,
 			    sizeof (uint64_t));
 			for (unsigned i = 0; i < zopt_metaslab_args; i++) {
 				errno = 0;
 				zopt_metaslab[i] = strtoull(argv[i], NULL, 0);
 				if (zopt_metaslab[i] == 0 && errno != 0)
 					fatal("bad number %s: %s", argv[i],
 					    strerror(errno));
 			}
 		}
 		if (dump_opt['B']) {
 			dump_backup(target, objset_id,
 			    argc > 0 ? argv[0] : NULL);
 		} else if (os != NULL) {
 			dump_objset(os);
 		} else if (zopt_object_args > 0 && !dump_opt['m']) {
 			dump_objset(spa->spa_meta_objset);
 		} else {
 			dump_zpool(spa);
 		}
 	} else {
 		/* For -R the same table holds the block-read flag letters. */
 		flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
 		flagbits['c'] = ZDB_FLAG_CHECKSUM;
 		flagbits['d'] = ZDB_FLAG_DECOMPRESS;
 		flagbits['e'] = ZDB_FLAG_BSWAP;
 		flagbits['g'] = ZDB_FLAG_GBH;
 		flagbits['i'] = ZDB_FLAG_INDIRECT;
 		flagbits['r'] = ZDB_FLAG_RAW;
 		flagbits['v'] = ZDB_FLAG_VERBOSE;
 
 		/* Each remaining operand is one block descriptor. */
 		for (int i = 0; i < argc; i++)
 			zdb_read_block(argv[i], spa);
 	}
 
 	if (dump_opt['k']) {
 		free(checkpoint_pool);
 		if (!target_is_spa)
 			free(checkpoint_target);
 	}
 
 fini:
 	/* Common teardown, also reached by the early-exit paths above. */
 	if (spa != NULL)
 		zdb_ddt_cleanup(spa);
 
 	if (os != NULL) {
 		close_objset(os, FTAG);
 	} else if (spa != NULL) {
 		spa_close(spa, FTAG);
 	}
 
 	fuid_table_destroy();
 
 	dump_debug_buffer();
 
 	if (kernel_init_done)
 		kernel_fini();
 
 	return (error);
 }
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index eae543731224..9c35f27ff0b4 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -1,574 +1,574 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
  * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_METASLAB_IMPL_H
 #define	_SYS_METASLAB_IMPL_H
 
 #include <sys/metaslab.h>
 #include <sys/space_map.h>
 #include <sys/range_tree.h>
 #include <sys/vdev.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
 #include <sys/multilist.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Metaslab allocation tracing record.
  */
 typedef struct metaslab_alloc_trace {
 	list_node_t			mat_list_node;
 	metaslab_group_t		*mat_mg;
 	metaslab_t			*mat_msp;
 	uint64_t			mat_size;
 	uint64_t			mat_weight;
 	uint32_t			mat_dva_id;
 	uint64_t			mat_offset;
 	int					mat_allocator;
 } metaslab_alloc_trace_t;
 
 /*
  * Used by the metaslab allocation tracing facility to indicate
  * error conditions. These errors are stored to the offset member
  * of the metaslab_alloc_trace_t record and displayed by mdb.
  */
 typedef enum trace_alloc_type {
 	TRACE_ALLOC_FAILURE	= -1ULL,
 	TRACE_TOO_SMALL		= -2ULL,
 	TRACE_FORCE_GANG	= -3ULL,
 	TRACE_NOT_ALLOCATABLE	= -4ULL,
 	TRACE_GROUP_FAILURE	= -5ULL,
 	TRACE_ENOSPC		= -6ULL,
 	TRACE_CONDENSING	= -7ULL,
 	TRACE_VDEV_ERROR	= -8ULL,
 	TRACE_DISABLED		= -9ULL,
 } trace_alloc_type_t;
 
 #define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
 #define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
 #define	METASLAB_WEIGHT_CLAIM		(1ULL << 61)
 #define	METASLAB_WEIGHT_TYPE		(1ULL << 60)
 #define	METASLAB_ACTIVE_MASK		\
 	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \
 	METASLAB_WEIGHT_CLAIM)
 
 /*
  * The metaslab weight is used to encode the amount of free space in a
  * metaslab, such that the "best" metaslab appears first when sorting the
  * metaslabs by weight. The weight (and therefore the "best" metaslab) can
  * be determined in two different ways: by computing a weighted sum of all
  * the free space in the metaslab (a space based weight) or by counting only
  * the free segments of the largest size (a segment based weight). We prefer
  * the segment based weight because it reflects how the free space is
  * comprised, but we cannot always use it -- legacy pools do not have the
  * space map histogram information necessary to determine the largest
  * contiguous regions. Pools that have the space map histogram determine
  * the segment weight by looking at each bucket in the histogram and
  * determining the free space whose size in bytes is in the range:
  *	[2^i, 2^(i+1))
  * We then encode the largest index, i, that contains regions into the
  * segment-weighted value.
  *
  * Space-based weight:
  *
  *      64      56      48      40      32      24      16      8       0
  *      +-------+-------+-------+-------+-------+-------+-------+-------+
  *      |PSC1|                  weighted-free space                     |
  *      +-------+-------+-------+-------+-------+-------+-------+-------+
  *
  *	PS - indicates primary and secondary activation
  *	C - indicates activation for claimed block zio
  *	space - the fragmentation-weighted space
  *
  * Segment-based weight:
  *
  *      64      56      48      40      32      24      16      8       0
  *      +-------+-------+-------+-------+-------+-------+-------+-------+
  *      |PSC0| idx|            count of segments in region              |
  *      +-------+-------+-------+-------+-------+-------+-------+-------+
  *
  *	PS - indicates primary and secondary activation
  *	C - indicates activation for claimed block zio
  *	idx - index for the highest bucket in the histogram
  *	count - number of segments in the specified bucket
  */
 #define	WEIGHT_GET_ACTIVE(weight)		BF64_GET((weight), 61, 3)
 #define	WEIGHT_SET_ACTIVE(weight, x)		BF64_SET((weight), 61, 3, x)
 
 #define	WEIGHT_IS_SPACEBASED(weight)		\
 	((weight) == 0 || BF64_GET((weight), 60, 1))
 #define	WEIGHT_SET_SPACEBASED(weight)		BF64_SET((weight), 60, 1, 1)
 
 /*
  * These macros are only applicable to segment-based weighting.
  */
 #define	WEIGHT_GET_INDEX(weight)		BF64_GET((weight), 54, 6)
 #define	WEIGHT_SET_INDEX(weight, x)		BF64_SET((weight), 54, 6, x)
 #define	WEIGHT_GET_COUNT(weight)		BF64_GET((weight), 0, 54)
 #define	WEIGHT_SET_COUNT(weight, x)		BF64_SET((weight), 0, 54, x)
 
 /*
  * Per-allocator data structure.
  */
 typedef struct metaslab_class_allocator {
 	metaslab_group_t	*mca_rotor;
 	uint64_t		mca_aliquot;
 
 	/*
 	 * The allocation throttle works on a reservation system. Whenever
 	 * an asynchronous zio wants to perform an allocation it must
 	 * first reserve the number of blocks that it wants to allocate.
 	 * If there aren't sufficient slots available for the pending zio
 	 * then that I/O is throttled until more slots free up. The current
 	 * number of reserved allocations is maintained by the mca_alloc_slots
 	 * refcount. The mca_alloc_max_slots value determines the maximum
 	 * number of allocations that the system allows. Gang blocks are
 	 * allowed to reserve slots even if we've reached the maximum
 	 * number of allocations allowed.
 	 */
 	uint64_t		mca_alloc_max_slots;
 	zfs_refcount_t		mca_alloc_slots;
 } ____cacheline_aligned metaslab_class_allocator_t;
 
 /*
  * A metaslab class encompasses a category of allocatable top-level vdevs.
  * Each top-level vdev is associated with a metaslab group which defines
  * the allocatable region for that vdev. Examples of these categories include
  * "normal" for data block allocations (i.e. main pool allocations) or "log"
  * for allocations designated for intent log devices (i.e. slog devices).
  * When a block allocation is requested from the SPA it is associated with a
  * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging
  * to the class can be used to satisfy that request. Allocations are done
  * by traversing the metaslab groups that are linked off of the mca_rotor field.
  * This rotor points to the next metaslab group where allocations will be
  * attempted. Allocating a block is a 3 step process -- select the metaslab
  * group, select the metaslab, and then allocate the block. The metaslab
  * class defines the low-level block allocator that will be used as the
  * final step in allocation. These allocators are pluggable allowing each class
  * to use a block allocator that best suits that class.
  */
 struct metaslab_class {
 	kmutex_t		mc_lock;
 	spa_t			*mc_spa;
 	const metaslab_ops_t		*mc_ops;
 
 	/*
 	 * Track the number of metaslab groups that have been initialized
 	 * and can accept allocations. An initialized metaslab group is
 	 * one that has been completely added to the config (i.e. we have
 	 * updated the MOS config and the space has been added to the pool).
 	 */
 	uint64_t		mc_groups;
 
 	/*
 	 * Toggle to enable/disable the allocation throttle.
 	 */
 	boolean_t		mc_alloc_throttle_enabled;
 
 	uint64_t		mc_alloc_groups; /* # of allocatable groups */
 
 	uint64_t		mc_alloc;	/* total allocated space */
 	uint64_t		mc_deferred;	/* total deferred frees */
 	uint64_t		mc_space;	/* total space (alloc + free) */
 	uint64_t		mc_dspace;	/* total deflated space */
-	uint64_t		mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+	uint64_t		mc_histogram[ZFS_RANGE_TREE_HISTOGRAM_SIZE];
 
 	/*
 	 * List of all loaded metaslabs in the class, sorted in order of most
 	 * recent use.
 	 */
 	multilist_t		mc_metaslab_txg_list;
 
 	metaslab_class_allocator_t	mc_allocator[];
 };
 
 /*
  * Per-allocator data structure.
  */
 typedef struct metaslab_group_allocator {
 	uint64_t	mga_cur_max_alloc_queue_depth;
 	zfs_refcount_t	mga_alloc_queue_depth;
 	metaslab_t	*mga_primary;
 	metaslab_t	*mga_secondary;
 } metaslab_group_allocator_t;
 
 /*
  * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
  * of a top-level vdev. They are linked together to form a circular linked
  * list and can belong to only one metaslab class. Metaslab groups may become
  * ineligible for allocations for a number of reasons such as limited free
  * space, fragmentation, or going offline. When this happens the allocator will
  * simply find the next metaslab group in the linked list and attempt
  * to allocate from that group instead.
  */
 struct metaslab_group {
 	kmutex_t		mg_lock;
 	avl_tree_t		mg_metaslab_tree;
 	uint64_t		mg_aliquot;
 	boolean_t		mg_allocatable;		/* can we allocate? */
 	uint64_t		mg_ms_ready;
 
 	/*
 	 * A metaslab group is considered to be initialized only after
 	 * we have updated the MOS config and added the space to the pool.
 	 * We only allow allocation attempts to a metaslab group if it
 	 * has been initialized.
 	 */
 	boolean_t		mg_initialized;
 
 	uint64_t		mg_free_capacity;	/* percentage free */
 	int64_t			mg_bias;
 	int64_t			mg_activation_count;
 	metaslab_class_t	*mg_class;
 	vdev_t			*mg_vd;
 	metaslab_group_t	*mg_prev;
 	metaslab_group_t	*mg_next;
 
 	/*
 	 * In order for the allocation throttle to function properly, we cannot
 	 * have too many IOs going to each disk by default; the throttle
 	 * operates by allocating more work to disks that finish quickly, so
 	 * allocating larger chunks to each disk reduces its effectiveness.
 	 * However, if the number of IOs going to each allocator is too small,
 	 * we will not perform proper aggregation at the vdev_queue layer,
 	 * also resulting in decreased performance. Therefore, we will use a
 	 * ramp-up strategy.
 	 *
 	 * Each allocator in each metaslab group has a current queue depth
 	 * (mga_alloc_queue_depth) and a current max queue depth
 	 * (mga_cur_max_alloc_queue_depth), and each metaslab group
 	 * has an absolute max queue depth (mg_max_alloc_queue_depth).  We
 	 * add IOs to an allocator until the mga_alloc_queue_depth for that
 	 * allocator hits the cur_max. Every time an IO completes for a given
 	 * allocator on a given metaslab group, we increment its cur_max until
 	 * it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to
 	 * help protect against disks that decrease in performance over time.
 	 *
 	 * It's possible for an allocator to handle more allocations than
 	 * its max. This can occur when gang blocks are required or when other
 	 * groups are unable to handle their share of allocations.
 	 */
 	uint64_t		mg_max_alloc_queue_depth;
 
 	/*
 	 * A metaslab group that can no longer allocate the minimum block
 	 * size will set mg_no_free_space. Once a metaslab group is out
 	 * of space then its share of work must be distributed to other
 	 * groups.
 	 */
 	boolean_t		mg_no_free_space;
 
 	uint64_t		mg_allocations;
 	uint64_t		mg_failed_allocations;
 	uint64_t		mg_fragmentation;
-	uint64_t		mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+	uint64_t		mg_histogram[ZFS_RANGE_TREE_HISTOGRAM_SIZE];
 
 	int			mg_ms_disabled;
 	boolean_t		mg_disabled_updating;
 	kmutex_t		mg_ms_disabled_lock;
 	kcondvar_t		mg_ms_disabled_cv;
 
 	int			mg_allocators;
 	metaslab_group_allocator_t	mg_allocator[];
 };
 
 /*
  * This value defines the number of elements in the ms_lbas array. The value
  * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX.
  * This is the equivalent of highbit(UINT64_MAX).
  */
 #define	MAX_LBAS	64
 
 /*
  * Each metaslab maintains a set of in-core trees to track metaslab
  * operations.  The in-core free tree (ms_allocatable) contains the list of
  * free segments which are eligible for allocation.  As blocks are
  * allocated, the allocated segments are removed from the ms_allocatable and
  * added to a per txg allocation tree (ms_allocating).  As blocks are
  * freed, they are added to the free tree (ms_freeing).  These trees
  * allow us to process all allocations and frees in syncing context
  * where it is safe to update the on-disk space maps.  An additional set
  * of in-core trees is maintained to track deferred frees
  * (ms_defer).  Once a block is freed it will move from the
  * ms_freed to the ms_defer tree.  A deferred free means that a block
  * has been freed but cannot be used by the pool until TXG_DEFER_SIZE
  * transaction groups later.  For example, a block that is freed in txg
  * 50 will not be available for reallocation until txg 52 (50 +
  * TXG_DEFER_SIZE).  This provides a safety net for uberblock rollback.
  * A pool could be safely rolled back TXG_DEFER_SIZE transaction
  * groups and ensure that no block has been reallocated.
  *
  * The simplified transition diagram looks like this:
  *
  *
  *      ALLOCATE
  *         |
  *         V
  *    free segment (ms_allocatable) -> ms_allocating[4] -> (write to space map)
  *         ^
  *         |                        ms_freeing <--- FREE
  *         |                             |
  *         |                             v
  *         |                         ms_freed
  *         |                             |
  *         +-------- ms_defer[2] <-------+-------> (write to space map)
  *
  *
  * Each metaslab's space is tracked in a single space map in the MOS,
  * which is only updated in syncing context.  Each time we sync a txg,
  * we append the allocs and frees from that txg to the space map.  The
  * pool space is only updated once all metaslabs have finished syncing.
  *
  * To load the in-core free tree we read the space map from disk.  This
  * object contains a series of alloc and free records that are combined
  * to make up the list of all free segments in this metaslab.  These
  * segments are represented in-core by the ms_allocatable and are stored
  * in a B-tree.
  *
  * As the space map grows (as a result of the appends) it will
  * eventually become space-inefficient.  When the metaslab's in-core
  * free tree is zfs_condense_pct/100 times the size of the minimal
  * on-disk representation, we rewrite it in its minimized form.  If a
  * metaslab needs to condense then we must set the ms_condensing flag to
  * ensure that allocations are not performed on the metaslab that is
  * being written.
  */
 struct metaslab {
 	/*
 	 * This is the main lock of the metaslab and its purpose is to
 	 * coordinate our allocations and frees [e.g., metaslab_block_alloc(),
 	 * metaslab_free_concrete(), ..etc] with our various syncing
 	 * procedures [e.g., metaslab_sync(), metaslab_sync_done(), ..etc].
 	 *
 	 * The lock is also used during some miscellaneous operations like
 	 * using the metaslab's histogram for the metaslab group's histogram
 	 * aggregation, or marking the metaslab for initialization.
 	 */
 	kmutex_t	ms_lock;
 
 	/*
 	 * Acquired together with the ms_lock whenever we expect to
 	 * write to metaslab data on-disk (i.e. flushing entries to
 	 * the metaslab's space map). It helps coordinate readers of
 	 * the metaslab's space map [see spa_vdev_remove_thread()]
 	 * with writers [see metaslab_sync() or metaslab_flush()].
 	 *
 	 * Note that metaslab_load(), even though a reader, uses
 	 * a completely different mechanism to deal with the reading
 	 * of the metaslab's space map based on ms_synced_length. That
 	 * said, the function still uses the ms_sync_lock after it
 	 * has read the ms_sm [see relevant comment in metaslab_load()
 	 * as to why].
 	 */
 	kmutex_t	ms_sync_lock;
 
 	kcondvar_t	ms_load_cv;
 	space_map_t	*ms_sm;
 	uint64_t	ms_id;
 	uint64_t	ms_start;
 	uint64_t	ms_size;
 	uint64_t	ms_fragmentation;
 
 	zfs_range_tree_t	*ms_allocating[TXG_SIZE];
 	zfs_range_tree_t	*ms_allocatable;
 	uint64_t	ms_allocated_this_txg;
 	uint64_t	ms_allocating_total;
 
 	/*
 	 * The following range trees are accessed only from syncing context.
 	 * ms_freeing and ms_freed only have entries while syncing, and are
 	 * empty between syncs.
 	 */
 	zfs_range_tree_t	*ms_freeing;	/* to free this syncing txg */
 	/* already freed this syncing txg */
 	zfs_range_tree_t	*ms_freed;
 	zfs_range_tree_t	*ms_defer[TXG_DEFER_SIZE];
 	/* to add to the checkpoint */
 	zfs_range_tree_t	*ms_checkpointing;
 
 	/*
 	 * The ms_trim tree is the set of allocatable segments which are
 	 * eligible for trimming. (When the metaslab is loaded, it's a
 	 * subset of ms_allocatable.)  It's kept in-core as long as the
 	 * autotrim property is set and is not vacated when the metaslab
 	 * is unloaded.  Its purpose is to aggregate freed ranges to
 	 * facilitate efficient trimming.
 	 */
 	zfs_range_tree_t	*ms_trim;
 
 	boolean_t	ms_condensing;	/* condensing? */
 	boolean_t	ms_condense_wanted;
 
 	/*
 	 * The number of consumers which have disabled the metaslab.
 	 */
 	uint64_t	ms_disabled;
 
 	/*
 	 * We must always hold the ms_lock when modifying ms_loaded
 	 * and ms_loading.
 	 */
 	boolean_t	ms_loaded;
 	boolean_t	ms_loading;
 	kcondvar_t	ms_flush_cv;
 	boolean_t	ms_flushing;
 
 	/*
 	 * The following histograms count entries that are in the
 	 * metaslab's space map (and its histogram) but are not in
 	 * ms_allocatable yet, because they are in ms_freed, ms_freeing,
 	 * or ms_defer[].
 	 *
 	 * When the metaslab is not loaded, its ms_weight needs to
 	 * reflect what is allocatable (i.e. what will be part of
 	 * ms_allocatable if it is loaded).  The weight is computed from
 	 * the spacemap histogram, but that includes ranges that are
 	 * not yet allocatable (because they are in ms_freed,
 	 * ms_freeing, or ms_defer[]).  Therefore, when calculating the
 	 * weight, we need to remove those ranges.
 	 *
 	 * The ranges in the ms_freed and ms_defer[] range trees are all
 	 * present in the spacemap.  However, the spacemap may have
 	 * multiple entries to represent a contiguous range, because it
 	 * is written across multiple sync passes, but the changes of
 	 * all sync passes are consolidated into the range trees.
 	 * Adjacent ranges that are freed in different sync passes of
 	 * one txg will be represented separately (as 2 or more entries)
 	 * in the space map (and its histogram), but these adjacent
 	 * ranges will be consolidated (represented as one entry) in the
 	 * ms_freed/ms_defer[] range trees (and their histograms).
 	 *
 	 * When calculating the weight, we can not simply subtract the
 	 * range trees' histograms from the spacemap's histogram,
 	 * because the range trees' histograms may have entries in
 	 * higher buckets than the spacemap, due to consolidation.
 	 * Instead we must subtract the exact entries that were added to
 	 * the spacemap's histogram.  ms_synchist and ms_deferhist[]
 	 * represent these exact entries, so we can subtract them from
 	 * the spacemap's histogram when calculating ms_weight.
 	 *
 	 * ms_synchist represents the same ranges as ms_freeing +
 	 * ms_freed, but without consolidation across sync passes.
 	 *
 	 * ms_deferhist[i] represents the same ranges as ms_defer[i],
 	 * but without consolidation across sync passes.
 	 */
 	uint64_t	ms_synchist[SPACE_MAP_HISTOGRAM_SIZE];
 	uint64_t	ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE];
 
 	/*
 	 * Tracks the exact amount of allocated space of this metaslab
 	 * (and specifically the metaslab's space map) up to the most
 	 * recently completed sync pass [see usage in metaslab_sync()].
 	 */
 	uint64_t	ms_allocated_space;
 	int64_t		ms_deferspace;	/* sum of ms_defer[] space	*/
 	uint64_t	ms_weight;	/* weight vs. others in group	*/
 	uint64_t	ms_activation_weight;	/* activation weight	*/
 
 	/*
 	 * Tracks whenever a metaslab is selected for loading or allocation.
 	 * We use this value to determine how long the metaslab should
 	 * stay cached.
 	 */
 	uint64_t	ms_selected_txg;
 	/*
 	 * ms_load/unload_time can be used for performance monitoring
 	 * (e.g. by dtrace or mdb).
 	 */
 	hrtime_t	ms_load_time;	/* time last loaded */
 	hrtime_t	ms_unload_time;	/* time last unloaded */
 	hrtime_t	ms_selected_time; /* time last allocated from */
 
 	uint64_t	ms_alloc_txg;	/* last successful alloc (debug only) */
 	uint64_t	ms_max_size;	/* maximum allocatable size	*/
 
 	/*
 	 * -1 if it's not active in an allocator, otherwise set to the allocator
 	 * this metaslab is active for.
 	 */
 	int		ms_allocator;
 	boolean_t	ms_primary; /* Only valid if ms_allocator is not -1 */
 
 	/*
 	 * The metaslab block allocators can optionally use a size-ordered
 	 * range tree and/or an array of LBAs. Not all allocators use
 	 * this functionality. The ms_allocatable_by_size should always
 	 * contain the same number of segments as the ms_allocatable. The
 	 * only difference is that the ms_allocatable_by_size is ordered by
 	 * segment sizes.
 	 */
 	zfs_btree_t		ms_allocatable_by_size;
 	zfs_btree_t		ms_unflushed_frees_by_size;
 	uint64_t	ms_lbas[MAX_LBAS];
 
 	metaslab_group_t *ms_group;	/* metaslab group		*/
 	avl_node_t	ms_group_node;	/* node in metaslab group tree	*/
 	txg_node_t	ms_txg_node;	/* per-txg dirty metaslab links	*/
 	avl_node_t	ms_spa_txg_node; /* node in spa_metaslabs_by_txg */
 	/*
 	 * Node in metaslab class's selected txg list
 	 */
 	multilist_node_t	ms_class_txg_node;
 
 	/*
 	 * Allocs and frees that are committed to the vdev log spacemap but
 	 * not yet to this metaslab's spacemap.
 	 */
 	zfs_range_tree_t	*ms_unflushed_allocs;
 	zfs_range_tree_t	*ms_unflushed_frees;
 
 	/*
 	 * We have flushed entries up to but not including this TXG. In
 	 * other words, all changes from this TXG and onward should not
 	 * be in this metaslab's space map and must be read from the
 	 * log space maps.
 	 */
 	uint64_t	ms_unflushed_txg;
 	boolean_t	ms_unflushed_dirty;
 
 	/* updated every time we are done syncing the metaslab's space map */
 	uint64_t	ms_synced_length;
 
 	boolean_t	ms_new;
 };
 
 typedef struct metaslab_unflushed_phys {
 	/* on-disk counterpart of ms_unflushed_txg */
 	uint64_t	msp_unflushed_txg;
 } metaslab_unflushed_phys_t;
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_METASLAB_IMPL_H */
diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h
index 4b0a3f2bfbb1..23eea3210c98 100644
--- a/include/sys/range_tree.h
+++ b/include/sys/range_tree.h
@@ -1,326 +1,326 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
  * Copyright (c) 2013, 2019 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_RANGE_TREE_H
 #define	_SYS_RANGE_TREE_H
 
 #include <sys/btree.h>
 #include <sys/dmu.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
-#define	RANGE_TREE_HISTOGRAM_SIZE	64
+#define	ZFS_RANGE_TREE_HISTOGRAM_SIZE	64
 
 typedef struct zfs_range_tree_ops zfs_range_tree_ops_t;
 
 typedef enum zfs_range_seg_type {
 	ZFS_RANGE_SEG32,
 	ZFS_RANGE_SEG64,
 	ZFS_RANGE_SEG_GAP,
 	ZFS_RANGE_SEG_NUM_TYPES,
 } zfs_range_seg_type_t;
 
 /*
  * Note: the range_tree may not be accessed concurrently; consumers
  * must provide external locking if required.
  */
 typedef struct zfs_range_tree {
 	zfs_btree_t	rt_root;	/* offset-ordered segment b-tree */
 	uint64_t	rt_space;	/* sum of all segments in the map */
 	zfs_range_seg_type_t rt_type;	/* type of zfs_range_seg_t in use */
 	/*
 	 * All data that is stored in the range tree must have a start higher
 	 * than or equal to rt_start, and all sizes and offsets must be
 	 * multiples of 1 << rt_shift.
 	 */
 	uint8_t		rt_shift;
 	uint64_t	rt_start;
 	const zfs_range_tree_ops_t *rt_ops;
 	void		*rt_arg;
 	uint64_t	rt_gap;		/* allowable inter-segment gap */
 
 	/*
 	 * The rt_histogram maintains a histogram of ranges. Each bucket,
 	 * rt_histogram[i], contains the number of ranges whose size is:
 	 * 2^i <= size of range in bytes < 2^(i+1)
 	 */
-	uint64_t	rt_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+	uint64_t	rt_histogram[ZFS_RANGE_TREE_HISTOGRAM_SIZE];
 } zfs_range_tree_t;
 
-typedef struct range_seg32 {
+typedef struct zfs_range_seg32 {
 	uint32_t	rs_start;	/* starting offset of this segment */
 	uint32_t	rs_end;		/* ending offset (non-inclusive) */
-} range_seg32_t;
+} zfs_range_seg32_t;
 
 /*
  * Extremely large metaslabs, vdev-wide trees, and dnode-wide trees may
  * require 64-bit integers for ranges.
  */
-typedef struct range_seg64 {
+typedef struct zfs_range_seg64 {
 	uint64_t	rs_start;	/* starting offset of this segment */
 	uint64_t	rs_end;		/* ending offset (non-inclusive) */
-} range_seg64_t;
+} zfs_range_seg64_t;
 
-typedef struct range_seg_gap {
+typedef struct zfs_range_seg_gap {
 	uint64_t	rs_start;	/* starting offset of this segment */
 	uint64_t	rs_end;		/* ending offset (non-inclusive) */
 	uint64_t	rs_fill;	/* actual fill if gap mode is on */
-} range_seg_gap_t;
+} zfs_range_seg_gap_t;
 
 /*
  * This type needs to be the largest of the range segs, since it will be stack
  * allocated and then cast to the actual type to do tree operations.
  */
-typedef range_seg_gap_t range_seg_max_t;
+typedef zfs_range_seg_gap_t zfs_range_seg_max_t;
 
 /*
  * This is just for clarity of code purposes, so we can make it clear that a
  * pointer is to a range seg of some type; when we need to do the actual math,
  * we'll figure out the real type.
  */
 typedef void zfs_range_seg_t;
 
 struct zfs_range_tree_ops {
 	void    (*rtop_create)(zfs_range_tree_t *rt, void *arg);
 	void    (*rtop_destroy)(zfs_range_tree_t *rt, void *arg);
 	void	(*rtop_add)(zfs_range_tree_t *rt, void *rs, void *arg);
 	void    (*rtop_remove)(zfs_range_tree_t *rt, void *rs, void *arg);
 	void	(*rtop_vacate)(zfs_range_tree_t *rt, void *arg);
 };
 
 static inline uint64_t
 zfs_rs_get_start_raw(const zfs_range_seg_t *rs, const zfs_range_tree_t *rt)
 {
 	ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES);
 	switch (rt->rt_type) {
 	case ZFS_RANGE_SEG32:
-		return (((const range_seg32_t *)rs)->rs_start);
+		return (((const zfs_range_seg32_t *)rs)->rs_start);
 	case ZFS_RANGE_SEG64:
-		return (((const range_seg64_t *)rs)->rs_start);
+		return (((const zfs_range_seg64_t *)rs)->rs_start);
 	case ZFS_RANGE_SEG_GAP:
-		return (((const range_seg_gap_t *)rs)->rs_start);
+		return (((const zfs_range_seg_gap_t *)rs)->rs_start);
 	default:
 		VERIFY(0);
 		return (0);
 	}
 }
 
 static inline uint64_t
 zfs_rs_get_end_raw(const zfs_range_seg_t *rs, const zfs_range_tree_t *rt)
 {
 	ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES);
 	switch (rt->rt_type) {
 	case ZFS_RANGE_SEG32:
-		return (((const range_seg32_t *)rs)->rs_end);
+		return (((const zfs_range_seg32_t *)rs)->rs_end);
 	case ZFS_RANGE_SEG64:
-		return (((const range_seg64_t *)rs)->rs_end);
+		return (((const zfs_range_seg64_t *)rs)->rs_end);
 	case ZFS_RANGE_SEG_GAP:
-		return (((const range_seg_gap_t *)rs)->rs_end);
+		return (((const zfs_range_seg_gap_t *)rs)->rs_end);
 	default:
 		VERIFY(0);
 		return (0);
 	}
 }
 
 static inline uint64_t
 zfs_rs_get_fill_raw(const zfs_range_seg_t *rs, const zfs_range_tree_t *rt)
 {
 	ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES);
 	switch (rt->rt_type) {
 	case ZFS_RANGE_SEG32: {
-		const range_seg32_t *r32 = (const range_seg32_t *)rs;
+		const zfs_range_seg32_t *r32 = (const zfs_range_seg32_t *)rs;
 		return (r32->rs_end - r32->rs_start);
 	}
 	case ZFS_RANGE_SEG64: {
-		const range_seg64_t *r64 = (const range_seg64_t *)rs;
+		const zfs_range_seg64_t *r64 = (const zfs_range_seg64_t *)rs;
 		return (r64->rs_end - r64->rs_start);
 	}
 	case ZFS_RANGE_SEG_GAP:
-		return (((const range_seg_gap_t *)rs)->rs_fill);
+		return (((const zfs_range_seg_gap_t *)rs)->rs_fill);
 	default:
 		VERIFY(0);
 		return (0);
 	}
 
 }
 
 static inline uint64_t
 zfs_rs_get_start(const zfs_range_seg_t *rs, const zfs_range_tree_t *rt)
 {
 	return ((zfs_rs_get_start_raw(rs, rt) << rt->rt_shift) + rt->rt_start);
 }
 
 static inline uint64_t
 zfs_rs_get_end(const zfs_range_seg_t *rs, const zfs_range_tree_t *rt)
 {
 	return ((zfs_rs_get_end_raw(rs, rt) << rt->rt_shift) + rt->rt_start);
 }
 
 static inline uint64_t
 zfs_rs_get_fill(const zfs_range_seg_t *rs, const zfs_range_tree_t *rt)
 {
 	return (zfs_rs_get_fill_raw(rs, rt) << rt->rt_shift);
 }
 
 static inline void
 zfs_rs_set_start_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t start)
 {
 	ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES);
 	switch (rt->rt_type) {
 	case ZFS_RANGE_SEG32:
 		ASSERT3U(start, <=, UINT32_MAX);
-		((range_seg32_t *)rs)->rs_start = (uint32_t)start;
+		((zfs_range_seg32_t *)rs)->rs_start = (uint32_t)start;
 		break;
 	case ZFS_RANGE_SEG64:
-		((range_seg64_t *)rs)->rs_start = start;
+		((zfs_range_seg64_t *)rs)->rs_start = start;
 		break;
 	case ZFS_RANGE_SEG_GAP:
-		((range_seg_gap_t *)rs)->rs_start = start;
+		((zfs_range_seg_gap_t *)rs)->rs_start = start;
 		break;
 	default:
 		VERIFY(0);
 	}
 }
 
 static inline void
 zfs_rs_set_end_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t end)
 {
 	ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES);
 	switch (rt->rt_type) {
 	case ZFS_RANGE_SEG32:
 		ASSERT3U(end, <=, UINT32_MAX);
-		((range_seg32_t *)rs)->rs_end = (uint32_t)end;
+		((zfs_range_seg32_t *)rs)->rs_end = (uint32_t)end;
 		break;
 	case ZFS_RANGE_SEG64:
-		((range_seg64_t *)rs)->rs_end = end;
+		((zfs_range_seg64_t *)rs)->rs_end = end;
 		break;
 	case ZFS_RANGE_SEG_GAP:
-		((range_seg_gap_t *)rs)->rs_end = end;
+		((zfs_range_seg_gap_t *)rs)->rs_end = end;
 		break;
 	default:
 		VERIFY(0);
 	}
 }
 
 static inline void
 zfs_zfs_rs_set_fill_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt,
     uint64_t fill)
 {
 	ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES);
 	switch (rt->rt_type) {
 	case ZFS_RANGE_SEG32:
 		/* fall through */
 	case ZFS_RANGE_SEG64:
 		ASSERT3U(fill, ==, zfs_rs_get_end_raw(rs, rt) -
 		    zfs_rs_get_start_raw(rs, rt));
 		break;
 	case ZFS_RANGE_SEG_GAP:
-		((range_seg_gap_t *)rs)->rs_fill = fill;
+		((zfs_range_seg_gap_t *)rs)->rs_fill = fill;
 		break;
 	default:
 		VERIFY(0);
 	}
 }
 
 static inline void
 zfs_rs_set_start(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t start)
 {
 	ASSERT3U(start, >=, rt->rt_start);
 	ASSERT(IS_P2ALIGNED(start, 1ULL << rt->rt_shift));
 	zfs_rs_set_start_raw(rs, rt, (start - rt->rt_start) >> rt->rt_shift);
 }
 
 static inline void
 zfs_rs_set_end(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t end)
 {
 	ASSERT3U(end, >=, rt->rt_start);
 	ASSERT(IS_P2ALIGNED(end, 1ULL << rt->rt_shift));
 	zfs_rs_set_end_raw(rs, rt, (end - rt->rt_start) >> rt->rt_shift);
 }
 
 static inline void
 zfs_rs_set_fill(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t fill)
 {
 	ASSERT(IS_P2ALIGNED(fill, 1ULL << rt->rt_shift));
 	zfs_zfs_rs_set_fill_raw(rs, rt, fill >> rt->rt_shift);
 }
 
 typedef void zfs_range_tree_func_t(void *arg, uint64_t start, uint64_t size);
 
 zfs_range_tree_t *zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
     zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
     uint64_t gap);
 zfs_range_tree_t *zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
     zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift);
 void zfs_range_tree_destroy(zfs_range_tree_t *rt);
 boolean_t zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start,
     uint64_t size);
 zfs_range_seg_t *zfs_range_tree_find(zfs_range_tree_t *rt, uint64_t start,
     uint64_t size);
 boolean_t zfs_range_tree_find_in(zfs_range_tree_t *rt, uint64_t start,
     uint64_t size, uint64_t *ostart, uint64_t *osize);
 void zfs_range_tree_verify_not_present(zfs_range_tree_t *rt,
     uint64_t start, uint64_t size);
 void zfs_range_tree_resize_segment(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
     uint64_t newstart, uint64_t newsize);
 uint64_t zfs_range_tree_space(zfs_range_tree_t *rt);
 uint64_t zfs_range_tree_numsegs(zfs_range_tree_t *rt);
 boolean_t zfs_range_tree_is_empty(zfs_range_tree_t *rt);
 void zfs_range_tree_swap(zfs_range_tree_t **rtsrc, zfs_range_tree_t **rtdst);
 void zfs_range_tree_stat_verify(zfs_range_tree_t *rt);
 uint64_t zfs_range_tree_min(zfs_range_tree_t *rt);
 uint64_t zfs_range_tree_max(zfs_range_tree_t *rt);
 uint64_t zfs_range_tree_span(zfs_range_tree_t *rt);
 
 void zfs_range_tree_add(void *arg, uint64_t start, uint64_t size);
 void zfs_range_tree_remove(void *arg, uint64_t start, uint64_t size);
 void zfs_range_tree_remove_fill(zfs_range_tree_t *rt, uint64_t start,
     uint64_t size);
 void zfs_range_tree_adjust_fill(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
     int64_t delta);
 void zfs_range_tree_clear(zfs_range_tree_t *rt, uint64_t start, uint64_t size);
 
 void zfs_range_tree_vacate(zfs_range_tree_t *rt, zfs_range_tree_func_t *func,
     void *arg);
 void zfs_range_tree_walk(zfs_range_tree_t *rt, zfs_range_tree_func_t *func,
     void *arg);
 zfs_range_seg_t *zfs_range_tree_first(zfs_range_tree_t *rt);
 
 void zfs_range_tree_remove_xor_add_segment(uint64_t start, uint64_t end,
     zfs_range_tree_t *removefrom, zfs_range_tree_t *addto);
 void zfs_range_tree_remove_xor_add(zfs_range_tree_t *rt,
     zfs_range_tree_t *removefrom, zfs_range_tree_t *addto);
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_RANGE_TREE_H */
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 38f62b07dc59..6ab7ac40bb07 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -1,232 +1,232 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2019, Datto Inc. All rights reserved.
  */
 
 #ifndef _SYS_VDEV_H
 #define	_SYS_VDEV_H
 
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu.h>
 #include <sys/space_map.h>
 #include <sys/metaslab.h>
 #include <sys/fs/zfs.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Kinds of dirty time log (DTL) kept for a vdev.  The enumerators are
  * used as indices, and DTL_TYPES, being last, is the number of kinds.
  */
 typedef enum vdev_dtl_type {
 	DTL_MISSING,	/* 0% replication: no copies of the data */
 	DTL_PARTIAL,	/* less than 100% replication: some copies missing */
 	DTL_SCRUB,	/* unable to fully repair during scrub/resilver */
 	DTL_OUTAGE,	/* temporarily missing (used to attempt detach) */
 	DTL_TYPES	/* count of the kinds above; sizes vdev_dtl[] */
 } vdev_dtl_type_t;
 
 extern int zfs_nocacheflush;
 
 typedef boolean_t vdev_open_children_func_t(vdev_t *vd);
 
 extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
     __attribute__((format(printf, 2, 3)));
 extern void vdev_dbgmsg_print_tree(vdev_t *, int);
 extern int vdev_open(vdev_t *);
 extern void vdev_open_children(vdev_t *);
 extern void vdev_open_children_subset(vdev_t *, vdev_open_children_func_t *);
 extern int vdev_validate(vdev_t *);
 extern int vdev_copy_path_strict(vdev_t *, vdev_t *);
 extern void vdev_copy_path_relaxed(vdev_t *, vdev_t *);
 extern void vdev_close(vdev_t *);
 extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
 extern void vdev_reopen(vdev_t *);
 extern int vdev_validate_aux(vdev_t *vd);
 extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
 extern boolean_t vdev_is_concrete(vdev_t *vd);
 extern boolean_t vdev_is_bootable(vdev_t *vd);
 extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
 extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
 extern int vdev_count_leaves(spa_t *spa);
 extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
     uint64_t txg, uint64_t size);
 extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
     uint64_t txg, uint64_t size);
 extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
 extern boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva,
     size_t psize, uint64_t phys_birth);
 extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva,
     size_t psize, uint64_t phys_birth);
 extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
     boolean_t scrub_done, boolean_t rebuild_done);
 extern boolean_t vdev_dtl_required(vdev_t *vd);
 extern boolean_t vdev_resilver_needed(vdev_t *vd,
     uint64_t *minp, uint64_t *maxp);
 extern void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj,
     dmu_tx_t *tx);
 extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx);
 extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx);
 extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx);
 extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset,
     uint64_t size);
 extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev,
     uint64_t offset, uint64_t size, dmu_tx_t *tx);
 extern boolean_t vdev_replace_in_progress(vdev_t *vdev);
 
 extern void vdev_hold(vdev_t *);
 extern void vdev_rele(vdev_t *);
 
 extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
 extern void vdev_metaslab_fini(vdev_t *vd);
 extern void vdev_metaslab_set_size(vdev_t *);
 extern void vdev_expand(vdev_t *vd, uint64_t txg);
 extern void vdev_split(vdev_t *vd);
 extern void vdev_deadman(vdev_t *vd, const char *tag);
 
-typedef void vdev_xlate_func_t(void *arg, range_seg64_t *physical_rs);
+typedef void vdev_xlate_func_t(void *arg, zfs_range_seg64_t *physical_rs);
 
-extern boolean_t vdev_xlate_is_empty(range_seg64_t *rs);
-extern void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
-    range_seg64_t *physical_rs, range_seg64_t *remain_rs);
-extern void vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
+extern boolean_t vdev_xlate_is_empty(zfs_range_seg64_t *rs);
+extern void vdev_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
+    zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs);
+extern void vdev_xlate_walk(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
     vdev_xlate_func_t *func, void *arg);
 
 extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx);
 
 extern metaslab_group_t *vdev_get_mg(vdev_t *vd, metaslab_class_t *mc);
 
 extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
 extern void vdev_clear_stats(vdev_t *vd);
 extern void vdev_stat_update(zio_t *zio, uint64_t psize);
 extern void vdev_scan_stat_init(vdev_t *vd);
 extern void vdev_propagate_state(vdev_t *vd);
 extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
     vdev_aux_t aux);
 extern boolean_t vdev_children_are_offline(vdev_t *vd);
 
 extern void vdev_space_update(vdev_t *vd,
     int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);
 
 extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space);
 
 extern uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize,
     uint64_t txg);
 extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
 
 /*
  * Allocated size of a gang block header on the given vdev.  No physical
  * birth txg is available to the caller, so the conversion is always done
  * with a txg of 0; the result therefore has to stay constant for a given
  * vdev (raidz expansion, for example, can't change it).
  */
 static inline uint64_t
 vdev_gang_header_asize(vdev_t *vd)
 {
 	uint64_t gang_hdr_psize = SPA_GANGBLOCKSIZE;
 	uint64_t gang_hdr_asize;
 	gang_hdr_asize = vdev_psize_to_asize_txg(vd, gang_hdr_psize, 0);
 	return (gang_hdr_asize);
 }
 
 extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
 extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux);
 extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
     vdev_state_t *);
 extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags);
 extern int vdev_remove_wanted(spa_t *spa, uint64_t guid);
 extern void vdev_clear(spa_t *spa, vdev_t *vd);
 
 extern boolean_t vdev_is_dead(vdev_t *vd);
 extern boolean_t vdev_readable(vdev_t *vd);
 extern boolean_t vdev_writeable(vdev_t *vd);
 extern boolean_t vdev_allocatable(vdev_t *vd);
 extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio);
 extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd);
 
 extern void vdev_queue_init(vdev_t *vd);
 extern void vdev_queue_fini(vdev_t *vd);
 extern zio_t *vdev_queue_io(zio_t *zio);
 extern void vdev_queue_io_done(zio_t *zio);
 extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
 
 extern uint32_t vdev_queue_length(vdev_t *vd);
 extern uint64_t vdev_queue_last_offset(vdev_t *vd);
 extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p);
 
 extern void vdev_config_dirty(vdev_t *vd);
 extern void vdev_config_clean(vdev_t *vd);
 extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
 
 extern void vdev_state_dirty(vdev_t *vd);
 extern void vdev_state_clean(vdev_t *vd);
 
 extern void vdev_defer_resilver(vdev_t *vd);
 extern boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx);
 
 /*
  * Values for the "flags" argument of vdev_config_generate().  Each
  * enumerator is a distinct single bit (1 << n).
  */
 typedef enum vdev_config_flag {
 	VDEV_CONFIG_SPARE = 1 << 0,
 	VDEV_CONFIG_L2CACHE = 1 << 1,
 	VDEV_CONFIG_MOS = 1 << 2,
 	VDEV_CONFIG_MISSING = 1 << 3
 } vdev_config_flag_t;
 
 extern void vdev_post_kobj_evt(vdev_t *vd);
 extern void vdev_clear_kobj_evt(vdev_t *vd);
 extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
 extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
     boolean_t getstats, vdev_config_flag_t flags);
 
 /*
  * Label routines
  */
 struct uberblock;
 extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
 extern int vdev_label_number(uint64_t psize, uint64_t offset);
 extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg);
 extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **);
 extern void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv);
 extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t
     offset, uint64_t size, zio_done_func_t *done, void *priv, int flags);
 extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *);
 extern int vdev_label_write_bootenv(vdev_t *, nvlist_t *);
 extern int vdev_uberblock_sync_list(vdev_t **, int, struct uberblock *, int);
 extern int vdev_check_boot_reserve(spa_t *, vdev_t *);
 
 /*
  * Why a label is being initialized; this is the type of the "reason"
  * argument of vdev_label_init().
  */
 typedef enum {
 	VDEV_LABEL_CREATE,	/* create/add a new device */
 	VDEV_LABEL_REPLACE,	/* replace an existing device */
 	VDEV_LABEL_SPARE,	/* add a new hot spare */
 	VDEV_LABEL_REMOVE,	/* remove an existing device */
 	VDEV_LABEL_L2CACHE,	/* add an L2ARC cache device */
 	VDEV_LABEL_SPLIT	/* generating new label for split-off dev */
 } vdev_labeltype_t;
 
 extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
 
 extern int vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl);
 extern int vdev_prop_get(vdev_t *vd, nvlist_t *nvprops, nvlist_t *outnvl);
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_VDEV_H */
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 6840ee78915e..315e2fc88410 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -1,671 +1,671 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2023, Klara Inc.
  */
 
 #ifndef _SYS_VDEV_IMPL_H
 #define	_SYS_VDEV_IMPL_H
 
 #include <sys/avl.h>
 #include <sys/bpobj.h>
 #include <sys/dmu.h>
 #include <sys/metaslab.h>
 #include <sys/nvpair.h>
 #include <sys/space_map.h>
 #include <sys/vdev.h>
 #include <sys/uberblock_impl.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/vdev_indirect_births.h>
 #include <sys/vdev_rebuild.h>
 #include <sys/vdev_removal.h>
 #include <sys/zfs_ratelimit.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Virtual device descriptors.
  *
  * All storage pool operations go through the virtual device framework,
  * which provides data replication and I/O scheduling.
  */
 
 /*
  * Forward declarations that lots of things need.
  */
 typedef struct vdev_queue vdev_queue_t;
 struct abd;
 
 extern uint_t zfs_vdev_queue_depth_pct;
 extern uint_t zfs_vdev_def_queue_depth;
 extern uint_t zfs_vdev_async_write_max_active;
 
 /*
  * Virtual device operations
  */
 typedef int	vdev_init_func_t(spa_t *spa, nvlist_t *nv, void **tsd);
 typedef void	vdev_kobj_post_evt_func_t(vdev_t *vd);
 typedef void	vdev_fini_func_t(vdev_t *vd);
 typedef int	vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size,
     uint64_t *ashift, uint64_t *pshift);
 typedef void	vdev_close_func_t(vdev_t *vd);
 typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize, uint64_t txg);
 typedef uint64_t vdev_min_asize_func_t(vdev_t *vd);
 typedef uint64_t vdev_min_alloc_func_t(vdev_t *vd);
 typedef void	vdev_io_start_func_t(zio_t *zio);
 typedef void	vdev_io_done_func_t(zio_t *zio);
 typedef void	vdev_state_change_func_t(vdev_t *vd, int, int);
 typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, const dva_t *dva,
     size_t psize, uint64_t phys_birth);
 typedef void	vdev_hold_func_t(vdev_t *vd);
 typedef void	vdev_rele_func_t(vdev_t *vd);
 
 typedef void	vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd,
     uint64_t offset, uint64_t size, void *arg);
 typedef void	vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size,
     vdev_remap_cb_t callback, void *arg);
 /*
  * Given a target vdev, translates the logical range "logical" to the
  * physical range "physical"
  */
-typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg64_t *logical,
-    range_seg64_t *physical, range_seg64_t *remain);
+typedef void vdev_xlation_func_t(vdev_t *cvd, const zfs_range_seg64_t *logical,
+    zfs_range_seg64_t *physical, zfs_range_seg64_t *remain);
 typedef uint64_t vdev_rebuild_asize_func_t(vdev_t *vd, uint64_t start,
     uint64_t size, uint64_t max_segment);
 typedef void vdev_metaslab_init_func_t(vdev_t *vd, uint64_t *startp,
     uint64_t *sizep);
 typedef void vdev_config_generate_func_t(vdev_t *vd, nvlist_t *nv);
 typedef uint64_t vdev_nparity_func_t(vdev_t *vd);
 typedef uint64_t vdev_ndisks_func_t(vdev_t *vd);
 
 /*
  * Operations vector for one vdev type (see the vdev_*_ops objects
  * declared under "Available vdev types").  Each vdev_op_* callback slot
  * is a pointer to one of the function types declared above.  The typedef
  * is const-qualified, so a vdev_ops_t cannot be modified through it.
  */
 typedef const struct vdev_ops {
 	vdev_init_func_t		*vdev_op_init;
 	vdev_fini_func_t		*vdev_op_fini;
 	vdev_open_func_t		*vdev_op_open;
 	vdev_close_func_t		*vdev_op_close;
 	vdev_asize_func_t		*vdev_op_asize;
 	vdev_min_asize_func_t		*vdev_op_min_asize;
 	vdev_min_alloc_func_t		*vdev_op_min_alloc;
 	vdev_io_start_func_t		*vdev_op_io_start;
 	vdev_io_done_func_t		*vdev_op_io_done;
 	vdev_state_change_func_t	*vdev_op_state_change;
 	vdev_need_resilver_func_t	*vdev_op_need_resilver;
 	vdev_hold_func_t		*vdev_op_hold;
 	vdev_rele_func_t		*vdev_op_rele;
 	vdev_remap_func_t		*vdev_op_remap;
 	vdev_xlation_func_t		*vdev_op_xlate;
 	vdev_rebuild_asize_func_t	*vdev_op_rebuild_asize;
 	vdev_metaslab_init_func_t	*vdev_op_metaslab_init;
 	vdev_config_generate_func_t	*vdev_op_config_generate;
 	vdev_nparity_func_t		*vdev_op_nparity;
 	vdev_ndisks_func_t		*vdev_op_ndisks;
 	vdev_kobj_post_evt_func_t	*vdev_op_kobj_evt_post;
 	/* NOTE(review): presumably the vdev type name string -- confirm */
 	char				vdev_op_type[16];
 	boolean_t			vdev_op_leaf;
 } vdev_ops_t;
 
 /*
  * Virtual device properties
  */
 /*
  * Storage for one entry of vdev_queue's vq_class[] array: either a list
  * together with its node count, or an AVL tree.  As union members the two
  * forms share the same memory, so only one is meaningful at a time.
  */
 typedef union vdev_queue_class {
 	struct {
 		ulong_t 	vqc_list_numnodes;
 		list_t		vqc_list;
 	};
 	avl_tree_t	vqc_tree;
 } vdev_queue_class_t;
 
 /*
  * I/O queue state for a vdev; struct vdev embeds one as vdev_queue.
  * vq_class[] and vq_cactive[] each have ZIO_PRIORITY_NUM_QUEUEABLE
  * entries.
  */
 struct vdev_queue {
 	vdev_t		*vq_vdev;
 	vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
 	avl_tree_t	vq_read_offset_tree;
 	avl_tree_t	vq_write_offset_tree;
 	uint64_t	vq_last_offset;
 	zio_priority_t	vq_last_prio;	/* Last sent I/O priority. */
 	uint32_t	vq_cqueued;	/* Classes with queued I/Os. */
 	uint32_t	vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE];
 	uint32_t	vq_active;	/* Number of active I/Os. */
 	uint32_t	vq_ia_active;	/* Active interactive I/Os. */
 	uint32_t	vq_nia_credit;	/* Non-interactive I/Os credit. */
 	list_t		vq_active_list;	/* List of active I/Os. */
 	hrtime_t	vq_io_complete_ts; /* time last i/o completed */
 	hrtime_t	vq_io_delta_ts;
 	zio_t		vq_io_search; /* used as local for stack reduction */
 	kmutex_t	vq_lock;
 };
 
 /*
  * Allocation bias of a vdev; struct vdev stores one of these in its
  * vdev_alloc_bias field ("metaslab allocation bias").
  */
 typedef enum vdev_alloc_bias {
 	VDEV_BIAS_NONE,
 	VDEV_BIAS_LOG,		/* dedicated to ZIL data (SLOG) */
 	VDEV_BIAS_SPECIAL,	/* dedicated to ddt, metadata, and small blks */
 	VDEV_BIAS_DEDUP		/* dedicated to dedup metadata */
 } vdev_alloc_bias_t;
 
 
 /*
  * On-disk indirect vdev state.
  *
  * An indirect vdev is described exclusively in the MOS config of a pool.
  * The config for an indirect vdev includes several fields, which are
  * accessed in memory by a vdev_indirect_config_t.
  */
 typedef struct vdev_indirect_config {
 	/*
 	 * Object (in MOS) which contains the indirect mapping. This object
 	 * contains an array of vdev_indirect_mapping_entry_phys_t ordered by
 	 * vimep_src. The bonus buffer for this object is a
 	 * vdev_indirect_mapping_phys_t. This object is allocated when a vdev
 	 * removal is initiated.
 	 *
 	 * Note that this object can be empty if none of the data on the vdev
 	 * has been copied yet.
 	 */
 	uint64_t	vic_mapping_object;
 
 	/*
 	 * Object (in MOS) which contains the birth times for the mapping
 	 * entries. This object contains an array of
 	 * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus
 	 * buffer for this object is a vdev_indirect_birth_phys_t. This object
 	 * is allocated when a vdev removal is initiated.
 	 *
 	 * Note that this object can be empty if none of the vdev has yet been
 	 * copied.
 	 */
 	uint64_t	vic_births_object;
 
 	/*
 	 * This is the vdev ID which was removed previous to this vdev, or
 	 * UINT64_MAX if there are no previously removed vdevs.
 	 */
 	uint64_t	vic_prev_indirect_vdev;
 } vdev_indirect_config_t;
 
 /*
  * Virtual device descriptor
  */
 struct vdev {
 	/*
 	 * Common to all vdev types.
 	 */
 	uint64_t	vdev_id;	/* child number in vdev parent	*/
 	uint64_t	vdev_guid;	/* unique ID for this vdev	*/
 	uint64_t	vdev_guid_sum;	/* self guid + all child guids	*/
 	uint64_t	vdev_orig_guid;	/* orig. guid prior to remove	*/
 	uint64_t	vdev_asize;	/* allocatable device capacity	*/
 	uint64_t	vdev_min_asize;	/* min acceptable asize		*/
 	uint64_t	vdev_max_asize;	/* max acceptable asize		*/
 	uint64_t	vdev_ashift;	/* block alignment shift	*/
 
 	/*
 	 * Logical block alignment shift
 	 *
 	 * The smallest sized/aligned I/O supported by the device.
 	 */
 	uint64_t	vdev_logical_ashift;
 	/*
 	 * Physical block alignment shift
 	 *
 	 * The device supports logical I/Os with vdev_logical_ashift
 	 * size/alignment, but optimum performance will be achieved by
 	 * aligning/sizing requests to vdev_physical_ashift.  Smaller
 	 * requests may be inflated or incur device level read-modify-write
 	 * operations.
 	 *
 	 * May be 0 to indicate no preference (i.e. use vdev_logical_ashift).
 	 */
 	uint64_t	vdev_physical_ashift;
 	uint64_t	vdev_state;	/* see VDEV_STATE_* #defines	*/
 	uint64_t	vdev_prevstate;	/* used when reopening a vdev	*/
 	vdev_ops_t	*vdev_ops;	/* vdev operations		*/
 	spa_t		*vdev_spa;	/* spa for this vdev		*/
 	void		*vdev_tsd;	/* type-specific data		*/
 	vdev_t		*vdev_top;	/* top-level vdev		*/
 	vdev_t		*vdev_parent;	/* parent vdev			*/
 	vdev_t		**vdev_child;	/* array of children		*/
 	uint64_t	vdev_children;	/* number of children		*/
 	vdev_stat_t	vdev_stat;	/* virtual device statistics	*/
 	vdev_stat_ex_t	vdev_stat_ex;	/* extended statistics		*/
 	boolean_t	vdev_expanding;	/* expand the vdev?		*/
 	boolean_t	vdev_reopening;	/* reopen in progress?		*/
 	boolean_t	vdev_nonrot;	/* true if solid state		*/
 	int		vdev_load_error; /* error on last load		*/
 	int		vdev_open_error; /* error on last open		*/
 	int		vdev_validate_error; /* error on last validate	*/
 	kthread_t	*vdev_open_thread; /* thread opening children	*/
 	kthread_t	*vdev_validate_thread; /* thread validating children */
 	uint64_t	vdev_crtxg;	/* txg when top-level was added */
 	uint64_t	vdev_root_zap;
 
 	/*
 	 * Top-level vdev state.
 	 */
 	uint64_t	vdev_ms_array;	/* metaslab array object	*/
 	uint64_t	vdev_ms_shift;	/* metaslab size shift		*/
 	uint64_t	vdev_ms_count;	/* number of metaslabs		*/
 	metaslab_group_t *vdev_mg;	/* metaslab group		*/
 	metaslab_group_t *vdev_log_mg;	/* embedded slog metaslab group	*/
 	metaslab_t	**vdev_ms;	/* metaslab array		*/
 	txg_list_t	vdev_ms_list;	/* per-txg dirty metaslab lists	*/
 	txg_list_t	vdev_dtl_list;	/* per-txg dirty DTL lists	*/
 	txg_node_t	vdev_txg_node;	/* per-txg dirty vdev linkage	*/
 	boolean_t	vdev_remove_wanted; /* async remove wanted?	*/
 	boolean_t	vdev_fault_wanted; /* async faulted wanted?	*/
 	list_node_t	vdev_config_dirty_node; /* config dirty list	*/
 	list_node_t	vdev_state_dirty_node; /* state dirty list	*/
 	uint64_t	vdev_deflate_ratio; /* deflation ratio (x512)	*/
 	uint64_t	vdev_islog;	/* is an intent log device	*/
 	uint64_t	vdev_noalloc;	/* device is passivated?	*/
 	uint64_t	vdev_removing;	/* device is being removed?	*/
 	uint64_t	vdev_failfast;	/* device failfast setting	*/
 	boolean_t	vdev_rz_expanding; /* raidz is being expanded?	*/
 	boolean_t	vdev_ishole;	/* is a hole in the namespace	*/
 	uint64_t	vdev_top_zap;
 	vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias	*/
 
 	/* pool checkpoint related */
 	space_map_t	*vdev_checkpoint_sm;	/* contains reserved blocks */
 
 	/* Initialize related */
 	boolean_t	vdev_initialize_exit_wanted;
 	vdev_initializing_state_t	vdev_initialize_state;
 	list_node_t	vdev_initialize_node;
 	kthread_t	*vdev_initialize_thread;
 	/* Protects vdev_initialize_thread and vdev_initialize_state. */
 	kmutex_t	vdev_initialize_lock;
 	kcondvar_t	vdev_initialize_cv;
 	uint64_t	vdev_initialize_offset[TXG_SIZE];
 	uint64_t	vdev_initialize_last_offset;
 	/* valid while initializing */
 	zfs_range_tree_t	*vdev_initialize_tree;
 	uint64_t	vdev_initialize_bytes_est;
 	uint64_t	vdev_initialize_bytes_done;
 	uint64_t	vdev_initialize_action_time;	/* start and end time */
 
 	/* TRIM related */
 	boolean_t	vdev_trim_exit_wanted;
 	boolean_t	vdev_autotrim_exit_wanted;
 	vdev_trim_state_t	vdev_trim_state;
 	list_node_t	vdev_trim_node;
 	kmutex_t	vdev_autotrim_lock;
 	kcondvar_t	vdev_autotrim_cv;
 	kcondvar_t	vdev_autotrim_kick_cv;
 	kthread_t	*vdev_autotrim_thread;
 	/* Protects vdev_trim_thread and vdev_trim_state. */
 	kmutex_t	vdev_trim_lock;
 	kcondvar_t	vdev_trim_cv;
 	kthread_t	*vdev_trim_thread;
 	uint64_t	vdev_trim_offset[TXG_SIZE];
 	uint64_t	vdev_trim_last_offset;
 	uint64_t	vdev_trim_bytes_est;
 	uint64_t	vdev_trim_bytes_done;
 	uint64_t	vdev_trim_rate;		/* requested rate (bytes/sec) */
 	uint64_t	vdev_trim_partial;	/* requested partial TRIM */
 	uint64_t	vdev_trim_secure;	/* requested secure TRIM */
 	uint64_t	vdev_trim_action_time;	/* start and end time */
 
 	/* Rebuild related */
 	boolean_t	vdev_rebuilding;
 	boolean_t	vdev_rebuild_exit_wanted;
 	boolean_t	vdev_rebuild_cancel_wanted;
 	boolean_t	vdev_rebuild_reset_wanted;
 	kmutex_t	vdev_rebuild_lock;
 	kcondvar_t	vdev_rebuild_cv;
 	kthread_t	*vdev_rebuild_thread;
 	vdev_rebuild_t	vdev_rebuild_config;
 
 	/* For limiting outstanding I/Os (initialize, TRIM) */
 	kmutex_t	vdev_initialize_io_lock;
 	kcondvar_t	vdev_initialize_io_cv;
 	uint64_t	vdev_initialize_inflight;
 	kmutex_t	vdev_trim_io_lock;
 	kcondvar_t	vdev_trim_io_cv;
 	uint64_t	vdev_trim_inflight[3];
 
 	/*
 	 * Values stored in the config for an indirect or removing vdev.
 	 */
 	vdev_indirect_config_t	vdev_indirect_config;
 
 	/*
 	 * The vdev_indirect_rwlock protects the vdev_indirect_mapping
 	 * pointer from changing on indirect vdevs (when it is condensed).
 	 * Note that removing (not yet indirect) vdevs have different
 	 * access patterns (the mapping is not accessed from open context,
 	 * e.g. from zio_read) and locking strategy (e.g. svr_lock).
 	 */
 	krwlock_t vdev_indirect_rwlock;
 	vdev_indirect_mapping_t *vdev_indirect_mapping;
 	vdev_indirect_births_t *vdev_indirect_births;
 
 	/*
 	 * In memory data structures used to manage the obsolete sm, for
 	 * indirect or removing vdevs.
 	 *
 	 * The vdev_obsolete_segments is the in-core record of the segments
 	 * that are no longer referenced anywhere in the pool (due to
 	 * being freed or remapped and not referenced by any snapshots).
 	 * During a sync, segments are added to vdev_obsolete_segments
 	 * via vdev_indirect_mark_obsolete(); at the end of each sync
 	 * pass, this is appended to vdev_obsolete_sm via
 	 * vdev_indirect_sync_obsolete().  The vdev_obsolete_lock
 	 * protects against concurrent modifications of vdev_obsolete_segments
 	 * from multiple zio threads.
 	 */
 	kmutex_t	vdev_obsolete_lock;
 	zfs_range_tree_t	*vdev_obsolete_segments;
 	space_map_t	*vdev_obsolete_sm;
 
 	/*
 	 * Protects the vdev_scan_io_queue field itself as well as the
 	 * structure's contents (when present).
 	 */
 	kmutex_t			vdev_scan_io_queue_lock;
 	struct dsl_scan_io_queue	*vdev_scan_io_queue;
 
 	/*
 	 * Leaf vdev state.
 	 */
 	zfs_range_tree_t	*vdev_dtl[DTL_TYPES]; /* dirty time logs */
 	space_map_t	*vdev_dtl_sm;	/* dirty time log space map	*/
 	txg_node_t	vdev_dtl_node;	/* per-txg dirty DTL linkage	*/
 	uint64_t	vdev_dtl_object; /* DTL object			*/
 	uint64_t	vdev_psize;	/* physical device capacity	*/
 	uint64_t	vdev_wholedisk;	/* true if this is a whole disk */
 	uint64_t	vdev_offline;	/* persistent offline state	*/
 	uint64_t	vdev_faulted;	/* persistent faulted state	*/
 	uint64_t	vdev_degraded;	/* persistent degraded state	*/
 	uint64_t	vdev_removed;	/* persistent removed state	*/
 	uint64_t	vdev_resilver_txg; /* persistent resilvering state */
 	uint64_t	vdev_rebuild_txg; /* persistent rebuilding state */
 	char		*vdev_path;	/* vdev path (if any)		*/
 	char		*vdev_devid;	/* vdev devid (if any)		*/
 	char		*vdev_physpath;	/* vdev device path (if any)	*/
 	char		*vdev_enc_sysfs_path;	/* enclosure sysfs path */
 	char		*vdev_fru;	/* physical FRU location	*/
 	uint64_t	vdev_not_present; /* not present during import	*/
 	uint64_t	vdev_unspare;	/* unspare when resilvering done */
 	boolean_t	vdev_nowritecache; /* true if flushwritecache failed */
 	boolean_t	vdev_has_trim;	/* TRIM is supported		*/
 	boolean_t	vdev_has_securetrim; /* secure TRIM is supported */
 	boolean_t	vdev_checkremove; /* temporary online test	*/
 	boolean_t	vdev_forcefault; /* force online fault		*/
 	boolean_t	vdev_splitting;	/* split or repair in progress  */
 	boolean_t	vdev_delayed_close; /* delayed device close?	*/
 	boolean_t	vdev_tmpoffline; /* device taken offline temporarily? */
 	boolean_t	vdev_detached;	/* device detached?		*/
 	boolean_t	vdev_cant_read;	/* vdev is failing all reads	*/
 	boolean_t	vdev_cant_write; /* vdev is failing all writes	*/
 	boolean_t	vdev_isspare;	/* was a hot spare		*/
 	boolean_t	vdev_isl2cache;	/* was a l2cache device		*/
 	boolean_t	vdev_copy_uberblocks;  /* post expand copy uberblocks */
 	boolean_t	vdev_resilver_deferred;  /* resilver deferred */
 	boolean_t	vdev_kobj_flag; /* kobj event record */
 	boolean_t	vdev_attaching; /* vdev attach ashift handling */
 	vdev_queue_t	vdev_queue;	/* I/O deadline schedule queue	*/
 	spa_aux_vdev_t	*vdev_aux;	/* for l2cache and spares vdevs	*/
 	zio_t		*vdev_probe_zio; /* root of current probe	*/
 	vdev_aux_t	vdev_label_aux;	/* on-disk aux state		*/
 	uint64_t	vdev_leaf_zap;
 	hrtime_t	vdev_mmp_pending; /* 0 if write finished	*/
 	uint64_t	vdev_mmp_kstat_id;	/* to find kstat entry */
 	uint64_t	vdev_expansion_time;	/* vdev's last expansion time */
 	list_node_t	vdev_leaf_node;		/* leaf vdev list */
 
 	/*
 	 * For DTrace to work in userland (libzpool) context, these fields must
 	 * remain at the end of the structure.  DTrace will use the kernel's
 	 * CTF definition for 'struct vdev', and since the size of a kmutex_t is
 	 * larger in userland, the offsets for the rest of the fields would be
 	 * incorrect.
 	 */
 	kmutex_t	vdev_dtl_lock;	/* vdev_dtl_{map,resilver}	*/
 	kmutex_t	vdev_stat_lock;	/* vdev_stat			*/
 	kmutex_t	vdev_probe_lock; /* protects vdev_probe_zio	*/
 
 	/*
 	 * We rate limit ZIO delay, deadman, and checksum events, since they
 	 * can flood ZED with tons of events when a drive is acting up.
 	 *
 	 * We also rate limit Direct I/O write verify errors, since a user might
 	 * be continually manipulating a buffer that can flood ZED with tons of
 	 * events.
 	 */
 	zfs_ratelimit_t vdev_delay_rl;
 	zfs_ratelimit_t vdev_deadman_rl;
 	zfs_ratelimit_t vdev_dio_verify_rl;
 	zfs_ratelimit_t vdev_checksum_rl;
 
 	/*
 	 * Vdev properties for tuning ZED or zfsd
 	 */
 	uint64_t	vdev_checksum_n;
 	uint64_t	vdev_checksum_t;
 	uint64_t	vdev_io_n;
 	uint64_t	vdev_io_t;
 	uint64_t	vdev_slow_io_n;
 	uint64_t	vdev_slow_io_t;
 };
 
 #define	VDEV_PAD_SIZE		(8 << 10)
 /*
  * 2 padding areas (vl_pad1 and vl_be) to skip.  The product is
  * parenthesized so that it stays intact when the macro is used inside a
  * larger expression (e.g. as a divisor).
  */
 #define	VDEV_SKIP_SIZE		(VDEV_PAD_SIZE * 2)
 #define	VDEV_PHYS_SIZE		(112 << 10)
 #define	VDEV_UBERBLOCK_RING	(128 << 10)
 
 /*
  * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock
  * ring when MMP is enabled.
  */
 #define	MMP_BLOCKS_PER_LABEL	1
 
 /* The largest uberblock we support is 8k. */
 #define	MAX_UBERBLOCK_SHIFT (13)
 #define	VDEV_UBERBLOCK_SHIFT(vd)	\
 	MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \
 	    MAX_UBERBLOCK_SHIFT)
 #define	VDEV_UBERBLOCK_COUNT(vd)	\
 	(VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
 #define	VDEV_UBERBLOCK_OFFSET(vd, n)	\
 	offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
 #define	VDEV_UBERBLOCK_SIZE(vd)		(1ULL << VDEV_UBERBLOCK_SHIFT(vd))
 
 /*
  * The vl_vdev_phys member of vdev_label_t.  vp_nvlist takes up
  * VDEV_PHYS_SIZE bytes minus the size of the zio_eck_t, vp_zbt, that
  * follows it.
  */
 typedef struct vdev_phys {
 	char		vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)];
 	zio_eck_t	vp_zbt;
 } vdev_phys_t;
 
 typedef enum vbe_vers {
 	/*
 	 * The bootenv file is stored as ascii text in the envblock.
 	 * It is used by the GRUB bootloader used on Linux to store the
 	 * contents of the grubenv file. The file is stored as raw ASCII,
 	 * and is protected by an embedded checksum. By default, GRUB will
 	 * check if the boot filesystem supports storing the environment data
 	 * in a special location, and if so, will invoke filesystem specific
 	 * logic to retrieve it. This can be overridden by a variable, should
 	 * the user so desire.
 	 */
 	VB_RAW = 0,
 
 	/*
 	 * The bootenv file is converted to an nvlist and then packed into the
 	 * envblock.
 	 */
 	VB_NVLIST = 1
 } vbe_vers_t;
 
 /*
  * Boot environment block; vdev_label_t embeds one as vl_be.  vbe_bootenv
  * is sized as VDEV_PAD_SIZE minus the version word and the trailing
  * zio_eck_t, and the _Static_assert below enforces that the whole struct
  * is exactly VDEV_PAD_SIZE bytes.
  */
 typedef struct vdev_boot_envblock {
 	/* NOTE(review): presumably holds a vbe_vers_t value -- confirm */
 	uint64_t	vbe_version;
 	char		vbe_bootenv[VDEV_PAD_SIZE - sizeof (uint64_t) -
 			sizeof (zio_eck_t)];
 	zio_eck_t	vbe_zbt;
 } vdev_boot_envblock_t;
 _Static_assert(sizeof (vdev_boot_envblock_t) == VDEV_PAD_SIZE,
 	"vdev_boot_envblock_t wrong size");
 
 /*
  * Layout of one vdev label: a pad area, the boot environment block, the
  * vdev_phys_t, and the uberblock ring, in that order.  sizeof
  * (vdev_label_t) is what VDEV_BOOT_OFFSET and the VDEV_LABEL_*_SIZE
  * macros are computed from.
  */
 typedef struct vdev_label {
 	char		vl_pad1[VDEV_PAD_SIZE];			/*  8K */
 	vdev_boot_envblock_t	vl_be;				/*  8K */
 	vdev_phys_t	vl_vdev_phys;				/* 112K	*/
 	char		vl_uberblock[VDEV_UBERBLOCK_RING];	/* 128K	*/
 } vdev_label_t;						/* 256K total */
 
 /*
  * vdev_dirty() flags
  */
 #define	VDD_METASLAB	0x01
 #define	VDD_DTL		0x02
 
 /* Offset of embedded boot loader region on each label */
 #define	VDEV_BOOT_OFFSET	(2 * sizeof (vdev_label_t))
 /*
  * Size of embedded boot loader region on each label.
  * The total size of the first two labels plus the boot area is 4MB.
  * On RAIDZ, this space is overwritten during RAIDZ expansion.
  */
 #define	VDEV_BOOT_SIZE		(7ULL << 19)			/* 3.5M */
 
 /*
  * Size of label regions at the start and end of each leaf device.
  */
 #define	VDEV_LABEL_START_SIZE	(2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
 #define	VDEV_LABEL_END_SIZE	(2 * sizeof (vdev_label_t))
 #define	VDEV_LABELS		4
 #define	VDEV_BEST_LABEL		VDEV_LABELS
 #define	VDEV_OFFSET_IS_LABEL(vd, off)                           \
 	(((off) < VDEV_LABEL_START_SIZE) ||                     \
 	((off) >= ((vd)->vdev_psize - VDEV_LABEL_END_SIZE)))
 
 #define	VDEV_ALLOC_LOAD		0
 #define	VDEV_ALLOC_ADD		1
 #define	VDEV_ALLOC_SPARE	2
 #define	VDEV_ALLOC_L2CACHE	3
 #define	VDEV_ALLOC_ROOTPOOL	4
 #define	VDEV_ALLOC_SPLIT	5
 #define	VDEV_ALLOC_ATTACH	6
 
 /*
  * Allocate or free a vdev
  */
 extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid,
     vdev_ops_t *ops);
 extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config,
     vdev_t *parent, uint_t id, int alloctype);
 extern void vdev_free(vdev_t *vd);
 
 /*
  * Add or remove children and parents
  */
 extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd);
 extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd);
 extern void vdev_compact_children(vdev_t *pvd);
 extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops);
 extern void vdev_remove_parent(vdev_t *cvd);
 
 /*
  * vdev load and sync
  */
 extern boolean_t vdev_log_state_valid(vdev_t *vd);
 extern int vdev_load(vdev_t *vd);
 extern int vdev_dtl_load(vdev_t *vd);
 extern void vdev_sync(vdev_t *vd, uint64_t txg);
 extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
 extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg);
 extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg);
 
 /*
  * Available vdev types.
  */
 extern vdev_ops_t vdev_root_ops;
 extern vdev_ops_t vdev_mirror_ops;
 extern vdev_ops_t vdev_replacing_ops;
 extern vdev_ops_t vdev_raidz_ops;
 extern vdev_ops_t vdev_draid_ops;
 extern vdev_ops_t vdev_draid_spare_ops;
 extern vdev_ops_t vdev_disk_ops;
 extern vdev_ops_t vdev_file_ops;
 extern vdev_ops_t vdev_missing_ops;
 extern vdev_ops_t vdev_hole_ops;
 extern vdev_ops_t vdev_spare_ops;
 extern vdev_ops_t vdev_indirect_ops;
 
 /*
  * Common size functions
  */
-extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
-    range_seg64_t *physical_rs, range_seg64_t *remain_rs);
+extern void vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
+    zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs);
 extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg);
 extern uint64_t vdev_default_min_asize(vdev_t *vd);
 extern uint64_t vdev_get_min_asize(vdev_t *vd);
 extern void vdev_set_min_asize(vdev_t *vd);
 extern uint64_t vdev_get_min_alloc(vdev_t *vd);
 extern uint64_t vdev_get_nparity(vdev_t *vd);
 extern uint64_t vdev_get_ndisks(vdev_t *vd);
 
 /*
  * Global variables
  */
 extern int zfs_vdev_standard_sm_blksz;
 
 /*
  * Functions from vdev_indirect.c
  */
 extern void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx);
 extern boolean_t vdev_indirect_should_condense(vdev_t *vd);
 extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx);
 extern int vdev_obsolete_sm_object(vdev_t *vd, uint64_t *sm_obj);
 extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise);
 
 /*
  * Other miscellaneous functions
  */
 int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj);
 void vdev_metaslab_group_create(vdev_t *vd);
 uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b);
 #if defined(__linux__)
 int param_get_raidz_impl(char *buf, zfs_kernel_param_t *kp);
 #endif
 int param_set_raidz_impl(ZFS_MODULE_PARAM_ARGS);
 
 /*
  * Vdev ashift optimization tunables
  */
 extern uint_t zfs_vdev_min_auto_ashift;
 extern uint_t zfs_vdev_max_auto_ashift;
 int param_set_min_auto_ashift(ZFS_MODULE_PARAM_ARGS);
 int param_set_max_auto_ashift(ZFS_MODULE_PARAM_ARGS);
 
 /*
  * VDEV checksum verification for Direct I/O writes
  */
 extern uint_t zfs_vdev_direct_write_verify;
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_VDEV_IMPL_H */
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index bc5c3cb9a670..5977f8c82b45 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -1,5352 +1,5352 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2021 by Delphix. All rights reserved.
  * Copyright 2016 Gary Mills
  * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
  * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
  * Copyright 2019 Joyent, Inc.
  */
 
 #include <sys/dsl_scan.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dnode.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
 #include <sys/arc_impl.h>
 #include <sys/zap.h>
 #include <sys/zio.h>
 #include <sys/zfs_context.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/zil_impl.h>
 #include <sys/zio_checksum.h>
 #include <sys/brt.h>
 #include <sys/ddt.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/zfeature.h>
 #include <sys/abd.h>
 #include <sys/range_tree.h>
 #include <sys/dbuf.h>
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
 
 /*
  * Grand theory statement on scan queue sorting
  *
  * Scanning is implemented by recursively traversing all indirection levels
  * in an object and reading all blocks referenced from said objects. This
  * results in us approximately traversing the object from lowest logical
  * offset to the highest. For best performance, we would want the logical
  * blocks to be physically contiguous. However, this is frequently not the
  * case with pools given the allocation patterns of copy-on-write filesystems.
  * So instead, we put the I/Os into a reordering queue and issue them in a
  * way that will most benefit physical disks (LBA-order).
  *
  * Queue management:
  *
  * Ideally, we would want to scan all metadata and queue up all block I/O
  * prior to starting to issue it, because that allows us to do an optimal
  * sorting job. This can however consume large amounts of memory. Therefore
  * we continuously monitor the size of the queues and constrain them to 5%
  * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this
  * limit, we clear out a few of the largest extents at the head of the queues
  * to make room for more scanning. Hopefully, these extents will be fairly
  * large and contiguous, allowing us to approach sequential I/O throughput
  * even without a fully sorted tree.
  *
  * Metadata scanning takes place in dsl_scan_visit(), which is called from
  * dsl_scan_sync() every spa_sync(). If we have either fully scanned all
  * metadata on the pool, or we need to make room in memory because our
  * queues are too large, dsl_scan_visit() is postponed and
  * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies
  * that metadata scanning and queued I/O issuing are mutually exclusive. This
  * allows us to provide maximum sequential I/O throughput for the majority of
  * I/O's issued since sequential I/O performance is significantly negatively
  * impacted if it is interleaved with random I/O.
  *
  * Implementation Notes
  *
  * One side effect of the queued scanning algorithm is that the scanning code
  * needs to be notified whenever a block is freed. This is needed to allow
  * the scanning code to remove these I/Os from the issuing queue. Additionally,
  * we do not attempt to queue gang blocks to be issued sequentially since this
  * is very hard to do and would have an extremely limited performance benefit.
  * Instead, we simply issue gang I/Os as soon as we find them using the legacy
  * algorithm.
  *
  * Backwards compatibility
  *
  * This new algorithm is backwards compatible with the legacy on-disk data
  * structures (and therefore does not require a new feature flag).
  * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan
  * will stop scanning metadata (in logical order) and wait for all outstanding
  * sorted I/O to complete. Once this is done, we write out a checkpoint
  * bookmark, indicating that we have scanned everything logically before it.
  * If the pool is imported on a machine without the new sorting algorithm,
  * the scan simply resumes from the last checkpoint using the legacy algorithm.
  */
 
 typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
     const zbookmark_phys_t *);
 
 static scan_cb_t dsl_scan_scrub_cb;
 
 static int scan_ds_queue_compare(const void *a, const void *b);
 static int scan_prefetch_queue_compare(const void *a, const void *b);
 static void scan_ds_queue_clear(dsl_scan_t *scn);
 static void scan_ds_prefetch_queue_clear(dsl_scan_t *scn);
 static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
     uint64_t *txg);
 static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
 static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
 static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
 static uint64_t dsl_scan_count_data_disks(spa_t *spa);
 static void read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb);
 
 extern uint_t zfs_vdev_async_write_active_min_dirty_percent;
 /*
  * When non-zero, dsl_scan_setup_sync() allocates dp->dp_blkstats if it is
  * missing and clears its zab_type member at the start of a scan.  When
  * zero, any existing dp->dp_blkstats is freed instead.
  */
 static int zfs_scan_blkstats = 0;
 
 /*
  * 'zpool status' uses bytes processed per pass to report throughput and
  * estimate time remaining.  We define a pass to start when the scanning
  * phase completes for a sequential resilver.  Optionally, this value
  * may be used to reset the pass statistics every N txgs to provide an
  * estimated completion time based on currently observed performance.
  */
 static uint_t zfs_scan_report_txgs = 0;
 
 /*
  * By default zfs will check to ensure it is not over the hard memory
  * limit before each txg. If finer-grained control of this is needed
  * this value can be set to 1 to enable checking before scanning each
  * block.
  */
 static int zfs_scan_strict_mem_lim = B_FALSE;
 
 /*
  * Maximum number of bytes in flight concurrently per leaf vdev. We attempt
  * to strike a balance here between keeping the vdev queues full of I/Os
  * at all times and not overflowing the queues to cause long latency,
  * which would cause long txg sync times. No matter what, we will not
  * overload the drives with I/O, since that is protected by
  * zfs_vdev_scrub_max_active.
  */
 static uint64_t zfs_scan_vdev_limit = 16 << 20;
 
 static uint_t zfs_scan_issue_strategy = 0;
 
 /* don't queue & sort zios, go direct */
 static int zfs_scan_legacy = B_FALSE;
 static uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */
 
 /*
  * fill_weight is non-tunable at runtime, so we copy it at module init from
  * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would
  * break queue sorting.
  */
 static uint_t zfs_scan_fill_weight = 3;
 static uint64_t fill_weight;
 
 /* See dsl_scan_should_clear() for details on the memory limit tunables */
 static const uint64_t zfs_scan_mem_lim_min = 16 << 20;	/* bytes */
 static const uint64_t zfs_scan_mem_lim_soft_max = 128 << 20;	/* bytes */
 
 
 /* fraction of physmem */
 static uint_t zfs_scan_mem_lim_fact = 20;
 
 /* fraction of mem lim above */
 static uint_t zfs_scan_mem_lim_soft_fact = 20;
 
 /* minimum milliseconds to scrub per txg */
 static uint_t zfs_scrub_min_time_ms = 1000;
 
 /* minimum milliseconds to obsolete per txg */
 static uint_t zfs_obsolete_min_time_ms = 500;
 
 /* minimum milliseconds to free per txg */
 static uint_t zfs_free_min_time_ms = 1000;
 
 /* minimum milliseconds to resilver per txg */
 static uint_t zfs_resilver_min_time_ms = 3000;
 
 static uint_t zfs_scan_checkpoint_intval = 7200; /* in seconds */
 int zfs_scan_suspend_progress = 0; /* set to prevent scans from progressing */
 static int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
 static int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
 static const ddt_class_t zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
 /* max number of blocks to free in a single TXG */
 static uint64_t zfs_async_block_max_blocks = UINT64_MAX;
 /* max number of dedup blocks to free in a single TXG */
 static uint64_t zfs_max_async_dedup_frees = 100000;
 
 /* set to disable resilver deferring */
 static int zfs_resilver_disable_defer = B_FALSE;
 
 /* Don't defer a resilver if the one in progress only got this far: */
 static uint_t zfs_resilver_defer_percent = 10;
 
 /*
  * We wait a few txgs after importing a pool to begin scanning so that
  * the import / mounting code isn't held up by scrub / resilver IO.
  * Unfortunately, it is a bit difficult to determine exactly how long
  * this will take since userspace will trigger fs mounts asynchronously
  * and the kernel will create zvol minors asynchronously. As a result,
  * the value provided here is a bit arbitrary, but represents a
  * reasonable estimate of how many txgs it will take to finish fully
  * importing a pool.
  */
 #define	SCAN_IMPORT_WAIT_TXGS 		5
 
 #define	DSL_SCAN_IS_SCRUB_RESILVER(scn) \
 	((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
 	(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
 
 #define	DSL_SCAN_IS_SCRUB(scn)		\
 	((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB)
 
 /*
  * Enable/disable the processing of the free_bpobj object.
  */
 static int zfs_free_bpobj_enabled = 1;
 
 /* Error blocks to be scrubbed in one txg. */
 static uint_t zfs_scrub_error_blocks_per_txg = 1 << 12;
 
 /*
  * Per-function scan callbacks, indexed by the scan function value.
  * The order has to match pool_scan_type.  Slots without an explicit
  * initializer are zero-initialized, i.e. NULL.
  */
 static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
 	NULL,
 	dsl_scan_scrub_cb,	/* POOL_SCAN_SCRUB */
 	dsl_scan_scrub_cb,	/* POOL_SCAN_RESILVER */
 };
 
 /* In core node for the scn->scn_queue. Represents a dataset to be scanned */
 typedef struct {
 	/* dataset object number, as given to scan_ds_queue_insert() */
 	uint64_t	sds_dsobj;
 	/* txg value given to scan_ds_queue_insert() for this dataset */
 	uint64_t	sds_txg;
 	/* link into scn->scn_queue (see avl_create() in dsl_scan_init()) */
 	avl_node_t	sds_node;
 } scan_ds_t;
 
 /*
  * This controls what conditions are placed on dsl_scan_sync_state():
  * SYNC_OPTIONAL) write out scn_phys iff scn_queues_pending == 0
  * SYNC_MANDATORY) write out scn_phys always. scn_queues_pending must be 0.
  * SYNC_CACHED) if scn_queues_pending == 0, write out scn_phys. Otherwise
  *	write out the scn_phys_cached version.
  * See dsl_scan_sync_state for details.
  */
 typedef enum {
 	SYNC_OPTIONAL,	/* nothing is written while I/Os are still queued */
 	SYNC_MANDATORY,	/* asserts that no I/Os are queued */
 	SYNC_CACHED	/* rewrites scn_phys_cached while I/Os are queued */
 } state_sync_type_t;
 
 /*
  * This struct represents the minimum information needed to reconstruct a
  * zio for sequential scanning. This is useful because many of these will
  * accumulate in the sequential IO queues before being issued, so saving
  * memory matters here.
  *
  * bp2sio() fills one of these in from a blkptr_t and sio2bp() performs the
  * reverse conversion.
  */
 typedef struct scan_io {
 	/* fields from blkptr_t */
 	uint64_t		sio_blk_prop;
 	uint64_t		sio_phys_birth;
 	uint64_t		sio_birth;
 	zio_cksum_t		sio_cksum;
 	/*
 	 * Number of valid entries in sio_dva[]. Also selects the kmem
 	 * cache the sio belongs to (see sio_alloc() and sio_free()).
 	 */
 	uint32_t		sio_nr_dvas;
 
 	/* fields from zio_t */
 	uint32_t		sio_flags;
 	zbookmark_phys_t	sio_zb;
 
 	/* members for queue sorting */
 	union {
 		avl_node_t	sio_addr_node; /* link into issuing queue */
 		list_node_t	sio_list_node; /* link for issuing to disk */
 	} sio_nodes;
 
 	/*
 	 * There may be up to SPA_DVAS_PER_BP DVAs here from the bp,
 	 * depending on how many were in the original bp. Only the
 	 * first DVA is really used for sorting and issuing purposes.
 	 * The other DVAs (if provided) simply exist so that the zio
 	 * layer can find additional copies to repair from in the
 	 * event of an error. This array must go at the end of the
 	 * struct to allow this for the variable number of elements.
 	 */
 	dva_t			sio_dva[];
 } scan_io_t;
 
 /*
  * Accessors for the offset and allocated size of an sio. They all operate
  * on sio_dva[0], the DVA that is used for sorting and issuing.
  * SIO_GET_END_OFFSET() is the offset just past the end of that DVA, and
  * SIO_GET_MUSED() is the number of bytes of memory the sio occupies,
  * including its trailing array of sio_nr_dvas DVAs.
  */
 #define	SIO_SET_OFFSET(sio, x)		DVA_SET_OFFSET(&(sio)->sio_dva[0], x)
 #define	SIO_SET_ASIZE(sio, x)		DVA_SET_ASIZE(&(sio)->sio_dva[0], x)
 #define	SIO_GET_OFFSET(sio)		DVA_GET_OFFSET(&(sio)->sio_dva[0])
 #define	SIO_GET_ASIZE(sio)		DVA_GET_ASIZE(&(sio)->sio_dva[0])
 #define	SIO_GET_END_OFFSET(sio)		\
 	(SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio))
 #define	SIO_GET_MUSED(sio)		\
 	(sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t)))
 
 /*
  * Scan I/O queue of a single top-level vdev, reached through
  * vd->vdev_scan_io_queue. dsl_scan_sync_state() inspects the sorting
  * trees while holding vd->vdev_scan_io_queue_lock.
  */
 struct dsl_scan_io_queue {
 	dsl_scan_t	*q_scn; /* associated dsl_scan_t */
 	vdev_t		*q_vd; /* top-level vdev that this queue represents */
 	zio_t		*q_zio; /* scn_zio_root child for waiting on IO */
 
 	/* trees used for sorting I/Os and extents of I/Os */
 	zfs_range_tree_t	*q_exts_by_addr;
 	zfs_btree_t	q_exts_by_size;
 	avl_tree_t	q_sios_by_addr;
 	uint64_t	q_sio_memused;
 	uint64_t	q_last_ext_addr;
 
 	/* members for zio rate limiting */
 	uint64_t	q_maxinflight_bytes;
 	uint64_t	q_inflight_bytes;
 	kcondvar_t	q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */
 
 	/* per txg statistics */
 	uint64_t	q_total_seg_size_this_txg;
 	uint64_t	q_segs_this_txg;
 	uint64_t	q_total_zio_size_this_txg;
 	uint64_t	q_zios_this_txg;
 };
 
 /* private data for dsl_scan_prefetch_cb() */
 typedef struct scan_prefetch_ctx {
 	zfs_refcount_t spc_refcnt;	/* refcount for memory management */
 	dsl_scan_t *spc_scn;		/* dsl_scan_t for the pool */
 	boolean_t spc_root;		/* is this prefetch for an objset? */
 	uint8_t spc_indblkshift;	/* dn_indblkshift of current dnode */
 	uint16_t spc_datablkszsec;	/* dn_datablkszsec of current dnode */
 } scan_prefetch_ctx_t;
 
 /*
  * private data for dsl_scan_prefetch()
  *
  * Each instance is one entry of scn->scn_prefetch_queue, the AVL tree that
  * dsl_scan_init() creates with scan_prefetch_queue_compare() as its
  * comparator.
  */
 typedef struct scan_prefetch_issue_ctx {
 	avl_node_t spic_avl_node;	/* link into scn->scn_prefetch_queue */
 	scan_prefetch_ctx_t *spic_spc;	/* spc for the callback */
 	blkptr_t spic_bp;		/* bp to prefetch */
 	zbookmark_phys_t spic_zb;	/* bookmark to prefetch */
 } scan_prefetch_issue_ctx_t;
 
 static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
     const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue);
 static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue,
     scan_io_t *sio);
 
 static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
 static void scan_io_queues_destroy(dsl_scan_t *scn);
 
 /*
  * kmem caches for scan_io_t, one per possible DVA count: sio_cache[n - 1]
  * serves sios that carry n DVAs. Created in scan_init() and destroyed in
  * scan_fini().
  */
 static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP];
 
 /*
  * Return an sio to the kmem cache it came from. The cache is chosen from
  * sio->sio_nr_dvas, so that field must hold the DVA count the sio was
  * allocated with.
  */
 static void
 sio_free(scan_io_t *sio)
 {
 	uint32_t cache_idx;
 
 	ASSERT3U(sio->sio_nr_dvas, >, 0);
 	ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
 
 	/* The caches are indexed by (DVA count - 1). */
 	cache_idx = sio->sio_nr_dvas - 1;
 	kmem_cache_free(sio_cache[cache_idx], sio);
 }
 
 /*
  * Allocate an sio from the cache sized for nr_dvas DVAs. This function
  * does not fill in any field; the caller must store the DVA count in
  * sio->sio_nr_dvas so that sio_free() can pick the matching cache.
  */
 static scan_io_t *
 sio_alloc(unsigned short nr_dvas)
 {
 	int cache_idx;
 
 	ASSERT3U(nr_dvas, >, 0);
 	ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP);
 
 	/* The caches are indexed by (DVA count - 1). */
 	cache_idx = nr_dvas - 1;
 	return (kmem_cache_alloc(sio_cache[cache_idx], KM_SLEEP));
 }
 
 /*
  * One-time setup: latch the fill weight tunable and create the sio caches.
  */
 void
 scan_init(void)
 {
 	/*
 	 * ext_size_compare() weights segments by how sparse they are
 	 * using this value. It must stay fixed for the duration of a
 	 * scan, and the tree comparison functions have no way to receive
 	 * extra context, so the tunable is copied into a global here and
 	 * can only take effect at module initialization time.
 	 */
 	fill_weight = zfs_scan_fill_weight;
 
 	/* Cache idx serves sios that carry (idx + 1) DVAs. */
 	for (int idx = 0; idx < SPA_DVAS_PER_BP; idx++) {
 		char cache_name[36];
 		size_t sio_size = sizeof (scan_io_t) +
 		    ((idx + 1) * sizeof (dva_t));
 
 		(void) snprintf(cache_name, sizeof (cache_name),
 		    "sio_cache_%d", idx);
 		sio_cache[idx] = kmem_cache_create(cache_name, sio_size,
 		    0, NULL, NULL, NULL, NULL, NULL, 0);
 	}
 }
 
 /* Destroy every sio cache that scan_init() created. */
 void
 scan_fini(void)
 {
 	int idx = 0;
 
 	while (idx < SPA_DVAS_PER_BP) {
 		kmem_cache_destroy(sio_cache[idx]);
 		idx++;
 	}
 }
 
 /* Report whether scn_phys records the scan state as DSS_SCANNING. */
 static inline boolean_t
 dsl_scan_is_running(const dsl_scan_t *scn)
 {
 	boolean_t running = (scn->scn_phys.scn_state == DSS_SCANNING);
 
 	return (running);
 }
 
 /* Report whether the pool's running scan, if any, is a resilver. */
 boolean_t
 dsl_scan_resilvering(dsl_pool_t *dp)
 {
 	boolean_t resilvering = dsl_scan_is_running(dp->dp_scan);
 
 	/* Only look at the scan function when a scan is running. */
 	if (resilvering) {
 		resilvering =
 		    (dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
 	}
 
 	return (resilvering);
 }
 
 /*
  * Rebuild a block pointer from an sio. Everything that the sio does not
  * carry is left zeroed, except blk_fill which is forced to 1.
  */
 static inline void
 sio2bp(const scan_io_t *sio, blkptr_t *bp)
 {
 	size_t dva_bytes;
 
 	memset(bp, 0, sizeof (blkptr_t));
 
 	bp->blk_prop = sio->sio_blk_prop;
 	BP_SET_PHYSICAL_BIRTH(bp, sio->sio_phys_birth);
 	BP_SET_LOGICAL_BIRTH(bp, sio->sio_birth);
 	bp->blk_cksum = sio->sio_cksum;
 	/* we always only work with data pointers */
 	bp->blk_fill = 1;
 
 	ASSERT3U(sio->sio_nr_dvas, >, 0);
 	ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
 
 	/* Copy back only as many DVAs as the sio holds. */
 	dva_bytes = sio->sio_nr_dvas * sizeof (dva_t);
 	memcpy(bp->blk_dva, sio->sio_dva, dva_bytes);
 }
 
 /*
  * Capture the parts of a block pointer that are needed to reissue it
  * later. dva_i is the index of the DVA in bp that should become the
  * primary (first) DVA of the sio.
  */
 static inline void
 bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
 {
 	int src = dva_i;
 
 	sio->sio_nr_dvas = BP_GET_NDVAS(bp);
 	sio->sio_cksum = bp->blk_cksum;
 	sio->sio_birth = BP_GET_LOGICAL_BIRTH(bp);
 	sio->sio_phys_birth = BP_GET_PHYSICAL_BIRTH(bp);
 	sio->sio_blk_prop = bp->blk_prop;
 
 	/*
 	 * Keep every copy of the block so the self healing code can fall
 	 * back to an alternate one if the first is corrupted. The copies
 	 * are rotated so that the DVA at index dva_i lands in slot 0,
 	 * which is the one that gets issued.
 	 */
 	for (int dst = 0; dst < sio->sio_nr_dvas; dst++) {
 		sio->sio_dva[dst] = bp->blk_dva[src % sio->sio_nr_dvas];
 		src++;
 	}
 }
 
 /*
  * Allocate the pool's in-core dsl_scan_t and load any persistent scan
  * state from the pool directory object.
  *
  * dp:  pool whose dp_scan is being set up
  * txg: txg stored in scn_restart_txg when the scan has to be restarted
  *
  * Returns 0 on success, which includes the case where no scan record
  * exists (ENOENT from the lookup); in that case the function returns
  * before the dataset queue, DDT walk and scan stats are initialized.
  * Otherwise returns the error from zap_lookup(). EOVERFLOW is returned
  * with spa_errata set when the #2094 signature is found and cannot be
  * fixed up in core.
  *
  * On a non-zero return dp->dp_scan is left allocated; presumably the
  * caller releases it through dsl_scan_fini() -- confirm against callers.
  */
 int
 dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 {
 	int err;
 	dsl_scan_t *scn;
 	spa_t *spa = dp->dp_spa;
 	uint64_t f;
 
 	scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
 	scn->scn_dp = dp;
 
 	/*
 	 * It's possible that we're resuming a scan after a reboot so
 	 * make sure that the scan_async_destroying flag is initialized
 	 * appropriately.
 	 */
 	ASSERT(!scn->scn_async_destroying);
 	scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
 	    SPA_FEATURE_ASYNC_DESTROY);
 
 	/*
 	 * Calculate the max number of in-flight bytes for pool-wide
 	 * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max).
 	 * Limits for the issuing phase are done per top-level vdev and
 	 * are handled separately.
 	 */
 	scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20,
 	    zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));
 
 	avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
 	    offsetof(scan_ds_t, sds_node));
 	mutex_init(&scn->scn_queue_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
 	    sizeof (scan_prefetch_issue_ctx_t),
 	    offsetof(scan_prefetch_issue_ctx_t, spic_avl_node));
 
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    "scrub_func", sizeof (uint64_t), 1, &f);
 	if (err == 0) {
 		/*
 		 * There was an old-style scrub in progress.  Restart a
 		 * new-style scrub from the beginning.
 		 */
 		scn->scn_restart_txg = txg;
 		/* %llu takes an unsigned argument, hence u_longlong_t. */
 		zfs_dbgmsg("old-style scrub was in progress for %s; "
 		    "restarting new-style scrub in txg %llu",
 		    spa->spa_name,
 		    (u_longlong_t)scn->scn_restart_txg);
 
 		/*
 		 * Load the queue obj from the old location so that it
 		 * can be freed by dsl_scan_done().
 		 */
 		(void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    "scrub_queue", sizeof (uint64_t), 1,
 		    &scn->scn_phys.scn_queue_obj);
 	} else {
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_ERRORSCRUB, sizeof (uint64_t),
 		    ERRORSCRUB_PHYS_NUMINTS, &scn->errorscrub_phys);
 
 		/* A missing error scrub record is not an error. */
 		if (err != 0 && err != ENOENT)
 			return (err);
 
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
 		    &scn->scn_phys);
 
 		/*
 		 * Detect if the pool contains the signature of #2094.  If it
 		 * does properly update the scn->scn_phys structure and notify
 		 * the administrator by setting an errata for the pool.
 		 */
 		if (err == EOVERFLOW) {
 			uint64_t zaptmp[SCAN_PHYS_NUMINTS + 1];
 			VERIFY3S(SCAN_PHYS_NUMINTS, ==, 24);
 			VERIFY3S(offsetof(dsl_scan_phys_t, scn_flags), ==,
 			    (23 * sizeof (uint64_t)));
 
 			/* Retry the lookup with room for one extra integer. */
 			err = zap_lookup(dp->dp_meta_objset,
 			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN,
 			    sizeof (uint64_t), SCAN_PHYS_NUMINTS + 1, &zaptmp);
 			if (err == 0) {
 				uint64_t overflow = zaptmp[SCAN_PHYS_NUMINTS];
 
 				if (overflow & ~DSL_SCAN_FLAGS_MASK ||
 				    scn->scn_async_destroying) {
 					spa->spa_errata =
 					    ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY;
 					return (EOVERFLOW);
 				}
 
 				memcpy(&scn->scn_phys, zaptmp,
 				    SCAN_PHYS_NUMINTS * sizeof (uint64_t));
 				scn->scn_phys.scn_flags = overflow;
 
 				/* Required scrub already in progress. */
 				if (scn->scn_phys.scn_state == DSS_FINISHED ||
 				    scn->scn_phys.scn_state == DSS_CANCELED)
 					spa->spa_errata =
 					    ZPOOL_ERRATA_ZOL_2094_SCRUB;
 			}
 		}
 
 		if (err == ENOENT)
 			return (0);
 		else if (err)
 			return (err);
 
 		/*
 		 * We might be restarting after a reboot, so jump the issued
 		 * counter to how far we've scanned. We know we're consistent
 		 * up to here.
 		 */
 		scn->scn_issued_before_pass = scn->scn_phys.scn_examined -
 		    scn->scn_phys.scn_skipped;
 
 		if (dsl_scan_is_running(scn) &&
 		    spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
 			/*
 			 * A new-type scrub was in progress on an old
 			 * pool, and the pool was accessed by old
 			 * software.  Restart from the beginning, since
 			 * the old software may have changed the pool in
 			 * the meantime.
 			 */
 			scn->scn_restart_txg = txg;
 			zfs_dbgmsg("new-style scrub for %s was modified "
 			    "by old software; restarting in txg %llu",
 			    spa->spa_name,
 			    (u_longlong_t)scn->scn_restart_txg);
 		} else if (dsl_scan_resilvering(dp)) {
 			/*
 			 * If a resilver is in progress and there are already
 			 * errors, restart it instead of finishing this scan and
 			 * then restarting it. If there haven't been any errors
 			 * then remember that the incore DTL is valid.
 			 */
 			if (scn->scn_phys.scn_errors > 0) {
 				scn->scn_restart_txg = txg;
 				zfs_dbgmsg("resilver can't excise DTL_MISSING "
 				    "when finished; restarting on %s in txg "
 				    "%llu",
 				    spa->spa_name,
 				    (u_longlong_t)scn->scn_restart_txg);
 			} else {
 				/* it's safe to excise DTL when finished */
 				spa->spa_scrub_started = B_TRUE;
 			}
 		}
 	}
 
 	memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys));
 
 	/* reload the queue into the in-core state */
 	if (scn->scn_phys.scn_queue_obj != 0) {
 		zap_cursor_t zc;
 		zap_attribute_t *za = zap_attribute_alloc();
 
 		/* Entry name is the dataset object, its value the txg. */
 		for (zap_cursor_init(&zc, dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj);
 		    zap_cursor_retrieve(&zc, za) == 0;
 		    (void) zap_cursor_advance(&zc)) {
 			scan_ds_queue_insert(scn,
 			    zfs_strtonum(za->za_name, NULL),
 			    za->za_first_integer);
 		}
 		zap_cursor_fini(&zc);
 		zap_attribute_free(za);
 	}
 
 	ddt_walk_init(spa, scn->scn_phys.scn_max_txg);
 
 	spa_scan_stat_init(spa);
 	vdev_scan_stat_init(spa->spa_root_vdev);
 
 	return (0);
 }
 
 /*
  * Release the pool's in-core scan state: its taskq (if any), both queues,
  * the queue lock and the dsl_scan_t itself.
  */
 void
 dsl_scan_fini(dsl_pool_t *dp)
 {
 	dsl_scan_t *scn = dp->dp_scan;
 
 	/* Nothing to do when no scan state is attached to the pool. */
 	if (scn == NULL)
 		return;
 
 	if (scn->scn_taskq != NULL)
 		taskq_destroy(scn->scn_taskq);
 
 	/* Each AVL tree is emptied before it is destroyed. */
 	scan_ds_queue_clear(scn);
 	avl_destroy(&scn->scn_queue);
 	mutex_destroy(&scn->scn_queue_lock);
 	scan_ds_prefetch_queue_clear(scn);
 	avl_destroy(&scn->scn_prefetch_queue);
 
 	kmem_free(scn, sizeof (dsl_scan_t));
 	dp->dp_scan = NULL;
 }
 
 /*
  * Report whether a restart is pending (scn_restart_txg is non-zero) and
  * its txg is no later than the txg of tx.
  */
 static boolean_t
 dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	boolean_t requested = (scn->scn_restart_txg != 0);
 
 	return (requested && scn->scn_restart_txg <= tx->tx_txg);
 }
 
 /*
  * Report whether a scan restart is pending on the pool or the
  * SPA_ASYNC_RESILVER async task has been requested.
  */
 boolean_t
 dsl_scan_resilver_scheduled(dsl_pool_t *dp)
 {
 	boolean_t scheduled;
 
 	scheduled = (dp->dp_scan != NULL &&
 	    dp->dp_scan->scn_restart_txg != 0);
 
 	/* The async task mask is consulted only if no restart is pending. */
 	if (!scheduled) {
 		scheduled =
 		    ((spa_async_tasks(dp->dp_spa) & SPA_ASYNC_RESILVER) != 0);
 	}
 
 	return (scheduled);
 }
 
 /* Report whether a scan is in progress and its function is a scrub. */
 boolean_t
 dsl_scan_scrubbing(const dsl_pool_t *dp)
 {
 	dsl_scan_phys_t *phys = &dp->dp_scan->scn_phys;
 	boolean_t scanning = (phys->scn_state == DSS_SCANNING);
 
 	return (scanning && phys->scn_func == POOL_SCAN_SCRUB);
 }
 
 /* Report whether the error scrub record shows one in progress. */
 boolean_t
 dsl_errorscrubbing(const dsl_pool_t *dp)
 {
 	dsl_errorscrub_phys_t *ephys = &dp->dp_scan->errorscrub_phys;
 	boolean_t active = (ephys->dep_state == DSS_ERRORSCRUBBING);
 
 	return (active && ephys->dep_func == POOL_SCAN_ERRORSCRUB);
 }
 
 /* Report whether an error scrub is in progress with pause flags set. */
 boolean_t
 dsl_errorscrub_is_paused(const dsl_scan_t *scn)
 {
 	boolean_t paused = dsl_errorscrubbing(scn->scn_dp);
 
 	/* The pause flags only matter while an error scrub is active. */
 	if (paused)
 		paused = (scn->errorscrub_phys.dep_paused_flags != 0);
 
 	return (paused);
 }
 
 /* Report whether a scrub is in progress with DSF_SCRUB_PAUSED set. */
 boolean_t
 dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
 {
 	boolean_t paused = dsl_scan_scrubbing(scn->scn_dp);
 
 	/* The pause flag only matters while a scrub is active. */
 	if (paused)
 		paused = ((scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED) != 0);
 
 	return (paused);
 }
 
 /*
  * Write the error scrub record to the pool directory object, after
  * refreshing its saved cursor from the in-core zap cursor.
  */
 static void
 dsl_errorscrub_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	uint64_t cursor = zap_cursor_serialize(&scn->errorscrub_cursor);
 
 	scn->errorscrub_phys.dep_cursor = cursor;
 
 	VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ERRORSCRUB, sizeof (uint64_t), ERRORSCRUB_PHYS_NUMINTS,
 	    &scn->errorscrub_phys, tx));
 }
 
 /*
  * Start a new error scrub: reset the error scrub record, point the
  * in-core cursor at spa_errlog_last, write the record out and log the
  * event. arg points to the pool_scan_func_t to record.
  */
 static void
 dsl_errorscrub_setup_sync(void *arg, dmu_tx_t *tx)
 {
 	pool_scan_func_t *funcp = arg;
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 	dsl_pool_t *dp = scn->scn_dp;
 	spa_t *spa = dp->dp_spa;
 
 	ASSERT(!dsl_scan_is_running(scn));
 	ASSERT(!dsl_errorscrubbing(scn->scn_dp));
 	ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
 
 	/* Begin from a zeroed record, then describe the new error scrub. */
 	memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys));
 	scn->errorscrub_phys.dep_cursor = 0;
 	scn->errorscrub_phys.dep_errors = 0;
 	scn->errorscrub_phys.dep_examined = 0;
 	scn->errorscrub_phys.dep_to_examine = spa_get_last_errlog_size(spa);
 	scn->errorscrub_phys.dep_start_time = gethrestime_sec();
 	scn->errorscrub_phys.dep_state = DSS_ERRORSCRUBBING;
 	scn->errorscrub_phys.dep_func = *funcp;
 
 	/* The saved cursor is zero at this point. */
 	zap_cursor_init_serialized(&scn->errorscrub_cursor,
 	    spa->spa_meta_objset, spa->spa_errlog_last,
 	    scn->errorscrub_phys.dep_cursor);
 
 	vdev_config_dirty(spa->spa_root_vdev);
 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_START);
 
 	dsl_errorscrub_sync_state(scn, tx);
 
 	spa_history_log_internal(spa, "error scrub setup", tx,
 	    "func=%u mintxg=%u maxtxg=%llu",
 	    *funcp, 0, (u_longlong_t)tx->tx_txg);
 }
 
 /*
  * Check whether an error scrub may start. Returns EBUSY while a scan or
  * another error scrub is active, ECANCELED when the last error log is
  * empty, and 0 otherwise. arg is unused.
  */
 static int
 dsl_errorscrub_setup_check(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 
 	if (dsl_scan_is_running(scn))
 		return (SET_ERROR(EBUSY));
 	if (dsl_errorscrubbing(scn->scn_dp))
 		return (SET_ERROR(EBUSY));
 
 	/* With an empty last error log there is nothing to scrub. */
 	return (spa_get_last_errlog_size(scn->scn_dp->dp_spa) == 0 ?
 	    ECANCELED : 0);
 }
 
 /*
  * Writes out a persistent dsl_scan_phys_t record to the pool directory.
  * Because we can be running in the block sorting algorithm, we do not always
  * want to write out the record, only when it is "safe" to do so. This safety
  * condition is achieved by making sure that the sorting queues are empty
  * (scn_queues_pending == 0). When this condition is not true, the sync'd state
  * is inconsistent with how much actual scanning progress has been made. The
  * kind of sync to be performed is specified by the sync_type argument. If the
  * sync is optional, we only sync if the queues are empty. If the sync is
  * mandatory, we do a hard ASSERT to make sure that the queues are empty. The
  * third possible state is a "cached" sync. This is done in response to:
  * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been
  *	destroyed, so we wouldn't be able to restart scanning from it.
  * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been
  *	superseded by a newer snapshot.
  * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been
  *	swapped with its clone.
  * In all cases, a cached sync simply rewrites the last record we've written,
  * just slightly modified. For the modifications that are performed to the
  * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed,
  * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped.
  */
 static void
 dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 
 	ASSERT(sync_type != SYNC_MANDATORY || scn->scn_queues_pending == 0);
 
 	if (scn->scn_queues_pending != 0) {
 		/*
 		 * I/Os are still queued, so scn_phys does not match the
 		 * progress actually made. Only a cached sync writes
 		 * anything here, and it writes the cached record.
 		 */
 		if (sync_type == SYNC_CACHED) {
 			VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
 			    DMU_POOL_DIRECTORY_OBJECT,
 			    DMU_POOL_SCAN, sizeof (uint64_t),
 			    SCAN_PHYS_NUMINTS,
 			    &scn->scn_phys_cached, tx));
 		}
 		return;
 	}
 
 	/* With nothing pending, every per-vdev queue has to be empty. */
 	for (int i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
 		vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
 		dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue;
 
 		if (q != NULL) {
 			mutex_enter(&vd->vdev_scan_io_queue_lock);
 			ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL);
 			ASSERT3P(zfs_btree_first(&q->q_exts_by_size, NULL), ==,
 			    NULL);
 			ASSERT3P(zfs_range_tree_first(q->q_exts_by_addr), ==,
 			    NULL);
 			mutex_exit(&vd->vdev_scan_io_queue_lock);
 		}
 	}
 
 	/* Write the dataset queue (if there is one), then the record. */
 	if (scn->scn_phys.scn_queue_obj != 0)
 		scan_ds_queue_sync(scn, tx);
 
 	VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
 	    &scn->scn_phys, tx));
 
 	/* Remember what was written for later cached syncs. */
 	memcpy(&scn->scn_phys_cached, &scn->scn_phys,
 	    sizeof (scn->scn_phys));
 
 	if (scn->scn_checkpointing) {
 		zfs_dbgmsg("finish scan checkpoint for %s",
 		    spa->spa_name);
 	}
 
 	scn->scn_checkpointing = B_FALSE;
 	scn->scn_last_checkpoint = ddi_get_lbolt();
 }
 
 /*
  * Check whether a new scan may start. Returns EBUSY while a scan, a
  * rebuild or an error scrub is active, and 0 otherwise. arg is unused.
  */
 int
 dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 	vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
 	int err = 0;
 
 	if (dsl_scan_is_running(scn))
 		err = SET_ERROR(EBUSY);
 	else if (vdev_rebuild_active(rvd))
 		err = SET_ERROR(EBUSY);
 	else if (dsl_errorscrubbing(scn->scn_dp))
 		err = SET_ERROR(EBUSY);
 
 	return (err);
 }
 
 /*
  * Start a new scan of type setup_sync_arg->func. Resets scn_phys and the
  * error scrub record, chooses the txg range, resets the in-core scan
  * bookkeeping, creates the on-disk queue object and writes the initial
  * state out with a mandatory sync.
  *
  * arg is a setup_sync_arg_t. A txgend of 0 selects the txg of tx as the
  * upper bound of the scan.
  */
 void
 dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 {
 	setup_sync_arg_t *setup_sync_arg = (setup_sync_arg_t *)arg;
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 	dmu_object_type_t ot = 0;
 	dsl_pool_t *dp = scn->scn_dp;
 	spa_t *spa = dp->dp_spa;
 
 	ASSERT(!dsl_scan_is_running(scn));
 	ASSERT3U(setup_sync_arg->func, >, POOL_SCAN_NONE);
 	ASSERT3U(setup_sync_arg->func, <, POOL_SCAN_FUNCS);
 	memset(&scn->scn_phys, 0, sizeof (scn->scn_phys));
 
 	/*
 	 * If we are starting a fresh scrub, we erase the error scrub
 	 * information from disk.
 	 */
 	memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys));
 	dsl_errorscrub_sync_state(scn, tx);
 
 	scn->scn_phys.scn_func = setup_sync_arg->func;
 	scn->scn_phys.scn_state = DSS_SCANNING;
 	scn->scn_phys.scn_min_txg = setup_sync_arg->txgstart;
 	if (setup_sync_arg->txgend == 0) {
 		scn->scn_phys.scn_max_txg = tx->tx_txg;
 	} else {
 		scn->scn_phys.scn_max_txg = setup_sync_arg->txgend;
 	}
 	scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
 	scn->scn_phys.scn_start_time = gethrestime_sec();
 	scn->scn_phys.scn_errors = 0;
 	scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
 	/* reset the in-core (not persisted in scn_phys) scan bookkeeping */
 	scn->scn_issued_before_pass = 0;
 	scn->scn_restart_txg = 0;
 	scn->scn_done_txg = 0;
 	scn->scn_last_checkpoint = 0;
 	scn->scn_checkpointing = B_FALSE;
 	spa_scan_stat_init(spa);
 	vdev_scan_stat_init(spa->spa_root_vdev);
 
 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
 		scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
 
 		/* rewrite all disk labels */
 		vdev_config_dirty(spa->spa_root_vdev);
 
 		/*
 		 * vdev_resilver_needed() is handed pointers to the txg
 		 * bounds, so it may adjust them. Its result selects which
 		 * start event is posted.
 		 */
 		if (vdev_resilver_needed(spa->spa_root_vdev,
 		    &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
 			nvlist_t *aux = fnvlist_alloc();
 			fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
 			    "healing");
 			spa_event_notify(spa, NULL, aux,
 			    ESC_ZFS_RESILVER_START);
 			nvlist_free(aux);
 		} else {
 			spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START);
 		}
 
 		spa->spa_scrub_started = B_TRUE;
 		/*
 		 * If this is an incremental scrub, limit the DDT scrub phase
 		 * to just the auto-ditto class (for correctness); the rest
 		 * of the scrub should go faster using top-down pruning.
 		 */
 		if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
 			scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
 
 		/*
 		 * When starting a resilver clear any existing rebuild state.
 		 * This is required to prevent stale rebuild status from
 		 * being reported when a rebuild is run, then a resilver and
 		 * finally a scrub.  In which case only the scrub status
 		 * should be reported by 'zpool status'.
 		 */
 		if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
 			vdev_t *rvd = spa->spa_root_vdev;
 			for (uint64_t i = 0; i < rvd->vdev_children; i++) {
 				vdev_t *vd = rvd->vdev_child[i];
 				vdev_rebuild_clear_sync(
 				    (void *)(uintptr_t)vd->vdev_id, tx);
 			}
 		}
 	}
 
 	/* back to the generic stuff */
 
 	/*
 	 * Block statistics are kept only while zfs_scan_blkstats is set:
 	 * allocate (if needed) and clear them, otherwise free any that exist.
 	 */
 	if (zfs_scan_blkstats) {
 		if (dp->dp_blkstats == NULL) {
 			dp->dp_blkstats =
 			    vmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
 		}
 		memset(&dp->dp_blkstats->zab_type, 0,
 		    sizeof (dp->dp_blkstats->zab_type));
 	} else {
 		if (dp->dp_blkstats) {
 			vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
 			dp->dp_blkstats = NULL;
 		}
 	}
 
 	/*
 	 * Pools older than SPA_VERSION_DSL_SCRUB get the queue object
 	 * created as DMU_OT_ZAP_OTHER instead of DMU_OT_SCAN_QUEUE.
 	 */
 	if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
 		ot = DMU_OT_ZAP_OTHER;
 
 	scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
 	    ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
 
 	memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys));
 
 	ddt_walk_init(spa, scn->scn_phys.scn_max_txg);
 
 	dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
 
 	spa_history_log_internal(spa, "scan setup", tx,
 	    "func=%u mintxg=%llu maxtxg=%llu",
 	    setup_sync_arg->func, (u_longlong_t)scn->scn_phys.scn_min_txg,
 	    (u_longlong_t)scn->scn_phys.scn_max_txg);
 }
 
 /*
  * Called by ZFS_IOC_POOL_SCRUB and ZFS_IOC_POOL_SCAN ioctl to start a scrub,
  * error scrub or resilver. Can also be called to resume a paused scrub or
  * error scrub.
  *
  * A non-zero txgstart or txgend is only accepted for POOL_SCAN_SCRUB; any
  * other function combined with a txg range fails with EINVAL.
  *
  * Returns 0 on success, ECANCELED when a paused scrub or error scrub was
  * resumed instead of a new one being started, or the error reported by the
  * pause/resume or setup sync task.
  */
 int
 dsl_scan(dsl_pool_t *dp, pool_scan_func_t func, uint64_t txgstart,
     uint64_t txgend)
 {
 	spa_t *spa = dp->dp_spa;
 	dsl_scan_t *scn = dp->dp_scan;
 	setup_sync_arg_t setup_sync_arg;
 
 	if (func != POOL_SCAN_SCRUB && (txgstart != 0 || txgend != 0)) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Purge all vdev caches and probe all devices.  We do this here
 	 * rather than in sync context because this requires a writer lock
 	 * on the spa_config lock, which we can't do from sync context.  The
 	 * spa_scrub_reopen flag indicates that vdev_open() should not
 	 * attempt to start another scrub.
 	 */
 	spa_vdev_state_enter(spa, SCL_NONE);
 	spa->spa_scrub_reopen = B_TRUE;
 	vdev_reopen(spa->spa_root_vdev);
 	spa->spa_scrub_reopen = B_FALSE;
 	(void) spa_vdev_state_exit(spa, NULL, 0);
 
 	/* A resilver is only scheduled here; no sync task is dispatched. */
 	if (func == POOL_SCAN_RESILVER) {
 		dsl_scan_restart_resilver(spa->spa_dsl_pool, 0);
 		return (0);
 	}
 
 	if (func == POOL_SCAN_ERRORSCRUB) {
 		if (dsl_errorscrub_is_paused(dp->dp_scan)) {
 			/*
 			 * got error scrub start cmd, resume paused error scrub.
 			 */
 			int err = dsl_scrub_set_pause_resume(scn->scn_dp,
 			    POOL_SCRUB_NORMAL);
 			if (err == 0) {
 				spa_event_notify(spa, NULL, NULL,
 				    ESC_ZFS_ERRORSCRUB_RESUME);
 				return (SET_ERROR(ECANCELED));
 			}
 			return (SET_ERROR(err));
 		}
 
 		return (dsl_sync_task(spa_name(dp->dp_spa),
 		    dsl_errorscrub_setup_check, dsl_errorscrub_setup_sync,
 		    &func, 0, ZFS_SPACE_CHECK_RESERVED));
 	}
 
 	if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
 		/* got scrub start cmd, resume paused scrub */
 		int err = dsl_scrub_set_pause_resume(scn->scn_dp,
 		    POOL_SCRUB_NORMAL);
 		if (err == 0) {
 			spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME);
 			return (SET_ERROR(ECANCELED));
 		}
 		return (SET_ERROR(err));
 	}
 
 	/* Start a fresh scan; the arguments are consumed by the sync task. */
 	setup_sync_arg.func = func;
 	setup_sync_arg.txgstart = txgstart;
 	setup_sync_arg.txgend = txgend;
 
 	return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
 	    dsl_scan_setup_sync, &setup_sync_arg, 0,
 	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
 /*
  * Finish an error scrub.  Logs the outcome to the pool history, stores the
  * final state (finished or canceled) and end time in errorscrub_phys, rotates
  * the error log and tears down the error scrub cursor.  The finish event is
  * only posted when 'complete' is set.
  */
 static void
 dsl_errorscrub_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 	const char *outcome;
 
 	if (complete) {
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_FINISH);
 		outcome = "error scrub done";
 	} else {
 		outcome = "error scrub canceled";
 	}
 	spa_history_log_internal(spa, outcome, tx,
 	    "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
 
 	if (complete)
 		scn->errorscrub_phys.dep_state = DSS_FINISHED;
 	else
 		scn->errorscrub_phys.dep_state = DSS_CANCELED;
 
 	spa->spa_scrub_active = B_FALSE;
 	spa_errlog_rotate(spa);
 	scn->errorscrub_phys.dep_end_time = gethrestime_sec();
 	zap_cursor_fini(&scn->errorscrub_cursor);
 
 	/* This errata is cleared once an error scrub has ended. */
 	if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB)
 		spa->spa_errata = 0;
 
 	ASSERT(!dsl_errorscrubbing(scn->scn_dp));
 }
 
 /*
  * Tear down the current scan in sync context.
  *
  * Removes old-style scrub entries from the pool directory, frees the scan
  * queue object and the in-memory dataset/prefetch queues, and clears the
  * paused flag.  If the scan is still marked running it is then moved to
  * DSS_FINISHED ('complete' set) or DSS_CANCELED, the result is written to
  * the pool history, and, for scrub/resilver scans, DTLs are reassessed and
  * follow-up async work is requested.
  */
 static void
 dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 {
 	/* NULL-terminated list of legacy pool-directory keys to remove. */
 	static const char *old_names[] = {
 		"scrub_bookmark",
 		"scrub_ddt_bookmark",
 		"scrub_ddt_class_max",
 		"scrub_queue",
 		"scrub_min_txg",
 		"scrub_max_txg",
 		"scrub_func",
 		"scrub_errors",
 		NULL
 	};
 
 	dsl_pool_t *dp = scn->scn_dp;
 	spa_t *spa = dp->dp_spa;
 	int i;
 
 	/* Remove any remnants of an old-style scrub. */
 	for (i = 0; old_names[i]; i++) {
 		/* The result is deliberately ignored (best effort). */
 		(void) zap_remove(dp->dp_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
 	}
 
 	/* Release the on-disk queue object, then the in-memory queues. */
 	if (scn->scn_phys.scn_queue_obj != 0) {
 		VERIFY0(dmu_object_free(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, tx));
 		scn->scn_phys.scn_queue_obj = 0;
 	}
 	scan_ds_queue_clear(scn);
 	scan_ds_prefetch_queue_clear(scn);
 
 	scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
 
 	/*
 	 * If we were "restarted" from a stopped state, don't bother
 	 * with anything else.
 	 */
 	if (!dsl_scan_is_running(scn)) {
 		ASSERT(!scn->scn_is_sorted);
 		return;
 	}
 
 	/* A sorted scan also owns per-vdev I/O queues and possibly a taskq. */
 	if (scn->scn_is_sorted) {
 		scan_io_queues_destroy(scn);
 		scn->scn_is_sorted = B_FALSE;
 
 		if (scn->scn_taskq != NULL) {
 			taskq_destroy(scn->scn_taskq);
 			scn->scn_taskq = NULL;
 		}
 	}
 
 	scn->scn_phys.scn_state = complete ? DSS_FINISHED : DSS_CANCELED;
 
 	spa_notify_waiters(spa);
 
 	/* Record in the pool history how the scan ended. */
 	if (dsl_scan_restarting(scn, tx)) {
 		spa_history_log_internal(spa, "scan aborted, restarting", tx,
 		    "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
 	} else if (!complete) {
 		spa_history_log_internal(spa, "scan cancelled", tx,
 		    "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
 	} else {
 		spa_history_log_internal(spa, "scan done", tx,
 		    "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
 		/* Persist the max txg covered by a completed scrub. */
 		if (DSL_SCAN_IS_SCRUB(scn)) {
 			VERIFY0(zap_update(dp->dp_meta_objset,
 			    DMU_POOL_DIRECTORY_OBJECT,
 			    DMU_POOL_LAST_SCRUBBED_TXG,
 			    sizeof (uint64_t), 1,
 			    &scn->scn_phys.scn_max_txg, tx));
 			spa->spa_scrubbed_last_txg = scn->scn_phys.scn_max_txg;
 		}
 	}
 
 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
 		spa->spa_scrub_active = B_FALSE;
 
 		/*
 		 * If the scrub/resilver completed, update all DTLs to
 		 * reflect this.  Whether it succeeded or not, vacate
 		 * all temporary scrub DTLs.
 		 *
 		 * As the scrub does not currently support traversing
 		 * data that have been freed but are part of a checkpoint,
 		 * we don't mark the scrub as done in the DTLs as faults
 		 * may still exist in those vdevs.
 		 */
 		if (complete &&
 		    !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 			vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
 			    scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE);
 
 			/*
 			 * A non-zero scn_min_txg is reported as a finished
 			 * healing resilver, otherwise as a finished scrub.
 			 */
 			if (scn->scn_phys.scn_min_txg) {
 				nvlist_t *aux = fnvlist_alloc();
 				fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
 				    "healing");
 				spa_event_notify(spa, NULL, aux,
 				    ESC_ZFS_RESILVER_FINISH);
 				nvlist_free(aux);
 			} else {
 				spa_event_notify(spa, NULL, NULL,
 				    ESC_ZFS_SCRUB_FINISH);
 			}
 		} else {
 			vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
 			    0, B_TRUE, B_FALSE);
 		}
 		spa_errlog_rotate(spa);
 
 		/*
 		 * Don't clear flag until after vdev_dtl_reassess to ensure that
 		 * DTL_MISSING will get updated when possible.
 		 */
 		spa->spa_scrub_started = B_FALSE;
 
 		/*
 		 * We may have finished replacing a device.
 		 * Let the async thread assess this and handle the detach.
 		 */
 		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 
 		/*
 		 * Clear any resilver_deferred flags in the config.
 		 * If there are drives that need resilvering, kick
 		 * off an asynchronous request to start resilver.
 		 * vdev_clear_resilver_deferred() may update the config
 		 * before the resilver can restart. In the event of
 		 * a crash during this period, the spa loading code
 		 * will find the drives that need to be resilvered
 		 * and start the resilver then.
 		 */
 		if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER) &&
 		    vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) {
 			spa_history_log_internal(spa,
 			    "starting deferred resilver", tx, "errors=%llu",
 			    (u_longlong_t)spa_approx_errlog_size(spa));
 			spa_async_request(spa, SPA_ASYNC_RESILVER);
 		}
 
 		/* Clear recent error events (i.e. duplicate events tracking) */
 		if (complete)
 			zfs_ereport_clear(spa, NULL);
 	}
 
 	scn->scn_phys.scn_end_time = gethrestime_sec();
 
 	if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB)
 		spa->spa_errata = 0;
 
 	ASSERT(!dsl_scan_is_running(scn));
 }
 
 /*
  * Sync-task check for pausing or resuming an error scrub.  'arg' points to
  * the requested pool_scrub_cmd_t.
  *
  * Returns 0 if the request may proceed, ENOENT when asked to pause while no
  * error scrub is in progress, EBUSY when it is already paused, and ENOTSUP
  * for any command other than pause or normal.
  */
 static int
 dsl_errorscrub_pause_resume_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_pool_t *pool = dmu_tx_pool(tx);
 	pool_scrub_cmd_t *request = arg;
 
 	switch (*request) {
 	case POOL_SCRUB_PAUSE:
 		/* nothing to pause unless an error scrub is running */
 		if (!dsl_errorscrubbing(pool))
 			return (SET_ERROR(ENOENT));
 
 		/* pausing twice is refused */
 		if (dsl_errorscrub_is_paused(pool->dp_scan))
 			return (SET_ERROR(EBUSY));
 		return (0);
 	case POOL_SCRUB_NORMAL:
 		return (0);
 	default:
 		return (SET_ERROR(ENOTSUP));
 	}
 }
 
 /*
  * Sync-task body that pauses or resumes an error scrub.  'arg' points to
  * the requested pool_scrub_cmd_t; anything other than POOL_SCRUB_PAUSE is
  * asserted to be POOL_SCRUB_NORMAL and treated as a resume.
  */
 static void
 dsl_errorscrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
 {
 	pool_scrub_cmd_t *request = arg;
 	dsl_pool_t *pool = dmu_tx_pool(tx);
 	dsl_scan_t *scan = pool->dp_scan;
 	spa_t *spa = pool->dp_spa;
 
 	if (*request == POOL_SCRUB_PAUSE) {
 		/* Remember when the pause began and persist the flag. */
 		spa->spa_scan_pass_errorscrub_pause = gethrestime_sec();
 		scan->errorscrub_phys.dep_paused_flags = B_TRUE;
 		dsl_errorscrub_sync_state(scan, tx);
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_PAUSED);
 		return;
 	}
 
 	ASSERT3U(*request, ==, POOL_SCRUB_NORMAL);
 
 	/* Resuming is a no-op unless the error scrub is actually paused. */
 	if (!dsl_errorscrub_is_paused(scan))
 		return;
 
 	/*
 	 * Accumulate the time spent paused in this pass so the error scrub
 	 * rate reported by 'zpool status' can be adjusted for it.
 	 */
 	spa->spa_scan_pass_errorscrub_spent_paused +=
 	    gethrestime_sec() - spa->spa_scan_pass_errorscrub_pause;
 	spa->spa_scan_pass_errorscrub_pause = 0;
 
 	scan->errorscrub_phys.dep_paused_flags = B_FALSE;
 
 	/* Re-create the cursor from the serialized position. */
 	zap_cursor_init_serialized(&scan->errorscrub_cursor,
 	    spa->spa_meta_objset, spa->spa_errlog_last,
 	    scan->errorscrub_phys.dep_cursor);
 
 	dsl_errorscrub_sync_state(scan, tx);
 }
 
 /*
  * Sync-task check for canceling an error scrub: succeeds only while an
  * error scrub is in progress, otherwise returns ENOENT.  'arg' is unused.
  */
 static int
 dsl_errorscrub_cancel_check(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	dsl_scan_t *scan = dmu_tx_pool(tx)->dp_scan;
 
 	if (dsl_errorscrubbing(scan->scn_dp))
 		return (0);
 
 	/* no error scrub is running, so there is nothing to cancel */
 	return (SET_ERROR(ENOENT));
 }
 
 /*
  * Sync-task body that cancels the error scrub: marks it done as incomplete,
  * syncs the error scrub state and posts the abort event.  'arg' is unused.
  */
 static void
 dsl_errorscrub_cancel_sync(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	dsl_pool_t *pool = dmu_tx_pool(tx);
 	dsl_scan_t *scan = pool->dp_scan;
 
 	dsl_errorscrub_done(scan, B_FALSE, tx);
 	dsl_errorscrub_sync_state(scan, tx);
 
 	spa_t *spa = scan->scn_dp->dp_spa;
 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_ABORT);
 }
 
 /*
  * Sync-task check for canceling a scan: succeeds only while a scan is
  * running, otherwise returns ENOENT.  'arg' is unused.
  */
 static int
 dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	dsl_scan_t *scan = dmu_tx_pool(tx)->dp_scan;
 
 	if (dsl_scan_is_running(scan))
 		return (0);
 
 	return (SET_ERROR(ENOENT));
 }
 
 /*
  * Sync-task body that cancels the running scan: marks it done as incomplete,
  * forces the scan state to be synced and posts the scrub abort event.
  * 'arg' is unused.
  */
 static void
 dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	dsl_pool_t *pool = dmu_tx_pool(tx);
 	dsl_scan_t *scan = pool->dp_scan;
 
 	dsl_scan_done(scan, B_FALSE, tx);
 	dsl_scan_sync_state(scan, tx, SYNC_MANDATORY);
 
 	spa_t *spa = scan->scn_dp->dp_spa;
 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT);
 }
 
 /*
  * Cancel whatever is currently scanning the pool.  An in-progress error
  * scrub is canceled through the error scrub sync task; otherwise the regular
  * scan cancel sync task is dispatched.  Returns the sync task's result.
  */
 int
 dsl_scan_cancel(dsl_pool_t *dp)
 {
 	const char *pool_name = spa_name(dp->dp_spa);
 
 	if (!dsl_errorscrubbing(dp)) {
 		return (dsl_sync_task(pool_name, dsl_scan_cancel_check,
 		    dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
 	}
 
 	return (dsl_sync_task(pool_name,
 	    dsl_errorscrub_cancel_check, dsl_errorscrub_cancel_sync,
 	    NULL, 3, ZFS_SPACE_CHECK_RESERVED));
 }
 
 /*
  * Sync-task check for pausing or resuming a scrub.  'arg' points to the
  * requested pool_scrub_cmd_t.
  *
  * Returns 0 if the request may proceed, ENOENT when asked to pause while no
  * scrub is in progress, EBUSY when the scrub is already paused, and ENOTSUP
  * for any command other than pause or normal.
  */
 static int
 dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_pool_t *pool = dmu_tx_pool(tx);
 	pool_scrub_cmd_t *request = arg;
 
 	switch (*request) {
 	case POOL_SCRUB_PAUSE:
 		/* nothing to pause unless a scrub is running */
 		if (!dsl_scan_scrubbing(pool))
 			return (SET_ERROR(ENOENT));
 
 		/* pausing twice is refused */
 		if (dsl_scan_is_paused_scrub(pool->dp_scan))
 			return (SET_ERROR(EBUSY));
 		return (0);
 	case POOL_SCRUB_NORMAL:
 		return (0);
 	default:
 		return (SET_ERROR(ENOTSUP));
 	}
 }
 
 /*
  * Sync-task body that pauses or resumes a scrub.  'arg' points to the
  * requested pool_scrub_cmd_t; anything other than POOL_SCRUB_PAUSE is
  * asserted to be POOL_SCRUB_NORMAL and treated as a resume.  The paused flag
  * is kept in step in both scn_phys and scn_phys_cached.
  */
 static void
 dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
 {
 	pool_scrub_cmd_t *request = arg;
 	dsl_pool_t *pool = dmu_tx_pool(tx);
 	dsl_scan_t *scan = pool->dp_scan;
 	spa_t *spa = pool->dp_spa;
 
 	if (*request == POOL_SCRUB_PAUSE) {
 		/* Remember when the pause began and set the paused flag. */
 		spa->spa_scan_pass_scrub_pause = gethrestime_sec();
 		scan->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
 		scan->scn_phys_cached.scn_flags |= DSF_SCRUB_PAUSED;
 		dsl_scan_sync_state(scan, tx, SYNC_CACHED);
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED);
 		spa_notify_waiters(spa);
 		return;
 	}
 
 	ASSERT3U(*request, ==, POOL_SCRUB_NORMAL);
 
 	/* Resuming is a no-op unless the scrub is actually paused. */
 	if (!dsl_scan_is_paused_scrub(scan))
 		return;
 
 	/*
 	 * Accumulate the time spent paused in this pass so the scrub rate
 	 * reported by 'zpool status' can be adjusted for it.
 	 */
 	spa->spa_scan_pass_scrub_spent_paused +=
 	    gethrestime_sec() - spa->spa_scan_pass_scrub_pause;
 	spa->spa_scan_pass_scrub_pause = 0;
 
 	scan->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
 	scan->scn_phys_cached.scn_flags &= ~DSF_SCRUB_PAUSED;
 	dsl_scan_sync_state(scan, tx, SYNC_CACHED);
 }
 
 /*
  * Apply a pause/resume command to the pool's scrub.  When an error scrub is
  * in progress the command goes to the error scrub sync task, otherwise to
  * the regular scrub one; the matching check function decides whether the
  * request makes sense.  Returns the sync task's result.
  */
 int
 dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd)
 {
 	const char *pool_name = spa_name(dp->dp_spa);
 
 	if (!dsl_errorscrubbing(dp)) {
 		return (dsl_sync_task(pool_name,
 		    dsl_scrub_pause_resume_check,
 		    dsl_scrub_pause_resume_sync, &cmd, 3,
 		    ZFS_SPACE_CHECK_RESERVED));
 	}
 
 	return (dsl_sync_task(pool_name,
 	    dsl_errorscrub_pause_resume_check,
 	    dsl_errorscrub_pause_resume_sync, &cmd, 3,
 	    ZFS_SPACE_CHECK_RESERVED));
 }
 
 
 /*
  * Start a new scan, or restart an existing one, by recording the txg at
  * which the resilver should restart in scn_restart_txg.
  *
  * When txg is 0 a transaction is assigned (waiting if necessary) and the txg
  * it lands in is used; otherwise the caller-supplied txg is stored as is.
  */
 void
 dsl_scan_restart_resilver(dsl_pool_t *dp, uint64_t txg)
 {
 	if (txg == 0) {
 		dmu_tx_t *tx;
 		tx = dmu_tx_create_dd(dp->dp_mos_dir);
 		VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
 
 		/* Publish the restart txg before the tx is committed. */
 		txg = dmu_tx_get_txg(tx);
 		dp->dp_scan->scn_restart_txg = txg;
 		dmu_tx_commit(tx);
 	} else {
 		dp->dp_scan->scn_restart_txg = txg;
 	}
 	/* txg is unsigned, so cast to match the %llu conversion. */
 	zfs_dbgmsg("restarting resilver for %s at txg=%llu",
 	    dp->dp_spa->spa_name, (u_longlong_t)txg);
 }
 
 /*
  * Free block 'bp' in 'txg' by handing it to zio_free() on the pool's spa.
  */
 void
 dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
 {
 	spa_t *spa = dp->dp_spa;
 
 	zio_free(spa, txg, bp);
 }
 
 /*
  * Issue an asynchronous free of 'bpp' as a child of 'pio', inheriting the
  * parent's I/O flags.  Asserted to run in the pool's sync context.
  */
 void
 dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
 {
 	zio_t *free_zio;
 
 	ASSERT(dsl_pool_sync_context(dp));
 
 	free_zio = zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags);
 	zio_nowait(free_zio);
 }
 
 /*
  * AVL comparator for scan_ds_t nodes: orders them by ascending sds_dsobj and
  * returns -1, 0 or 1.
  */
 static int
 scan_ds_queue_compare(const void *a, const void *b)
 {
 	const scan_ds_t *lhs = a;
 	const scan_ds_t *rhs = b;
 
 	if (lhs->sds_dsobj == rhs->sds_dsobj)
 		return (0);
 
 	return ((lhs->sds_dsobj < rhs->sds_dsobj) ? -1 : 1);
 }
 
 /*
  * Remove and free every node of the in-memory dataset queue.
  */
 static void
 scan_ds_queue_clear(dsl_scan_t *scn)
 {
 	void *cookie = NULL;
 
 	for (;;) {
 		scan_ds_t *node = avl_destroy_nodes(&scn->scn_queue, &cookie);
 
 		if (node == NULL)
 			break;
 		kmem_free(node, sizeof (scan_ds_t));
 	}
 }
 
 /*
  * Report whether dataset object 'dsobj' is in the in-memory dataset queue.
  * When it is and 'txg' is non-NULL, the queued txg is stored through 'txg'.
  */
 static boolean_t
 scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg)
 {
 	scan_ds_t key;
 	scan_ds_t *found;
 
 	key.sds_dsobj = dsobj;
 	found = avl_find(&scn->scn_queue, &key, NULL);
 	if (found == NULL)
 		return (B_FALSE);
 
 	if (txg != NULL)
 		*txg = found->sds_txg;
 	return (B_TRUE);
 }
 
 /*
  * Add a (dsobj, txg) entry to the in-memory dataset queue.  The entry must
  * not already be queued; a duplicate trips the VERIFY below.
  */
 static void
 scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg)
 {
 	avl_index_t where;
 	scan_ds_t *sds = kmem_zalloc(sizeof (scan_ds_t), KM_SLEEP);
 
 	sds->sds_txg = txg;
 	sds->sds_dsobj = dsobj;
 
 	/* The lookup also yields the insertion point used just below. */
 	VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL);
 	avl_insert(&scn->scn_queue, sds, where);
 }
 
 /*
  * Remove and free the queue entry for dataset object 'dsobj'.  The entry
  * must exist; a missing entry trips the VERIFY below.
  */
 static void
 scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj)
 {
 	scan_ds_t key;
 	scan_ds_t *found;
 
 	key.sds_dsobj = dsobj;
 	found = avl_find(&scn->scn_queue, &key, NULL);
 
 	VERIFY(found != NULL);
 	avl_remove(&scn->scn_queue, found);
 	kmem_free(found, sizeof (scan_ds_t));
 }
 
 /*
  * Rewrite the on-disk scan queue from the in-memory dataset queue: the
  * existing queue object is freed, a new ZAP object is created in its place,
  * and one (dsobj -> txg) entry is added per in-memory node.
  */
 static void
 scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	dsl_pool_t *pool = scn->scn_dp;
 	objset_t *mos = pool->dp_meta_objset;
 	dmu_object_type_t ot;
 
 	/* Pools older than SPA_VERSION_DSL_SCRUB get a generic ZAP type. */
 	if (spa_version(pool->dp_spa) >= SPA_VERSION_DSL_SCRUB)
 		ot = DMU_OT_SCAN_QUEUE;
 	else
 		ot = DMU_OT_ZAP_OTHER;
 
 	ASSERT0(scn->scn_queues_pending);
 	ASSERT(scn->scn_phys.scn_queue_obj != 0);
 
 	VERIFY0(dmu_object_free(mos, scn->scn_phys.scn_queue_obj, tx));
 	scn->scn_phys.scn_queue_obj = zap_create(mos, ot, DMU_OT_NONE, 0, tx);
 
 	scan_ds_t *sds = avl_first(&scn->scn_queue);
 	while (sds != NULL) {
 		VERIFY0(zap_add_int_key(mos, scn->scn_phys.scn_queue_obj,
 		    sds->sds_dsobj, sds->sds_txg, tx));
 		sds = AVL_NEXT(&scn->scn_queue, sds);
 	}
 }
 
 /*
  * Computes the memory limit state that we're currently in. A sorted scan
  * needs quite a bit of memory to hold the sorting queue, so we need to
  * reasonably constrain the size so it doesn't impact overall system
  * performance. We compute two limits:
  * 1) Hard memory limit: if the amount of memory used by the sorting
  *	queues on a pool gets above this value, we stop the metadata
  *	scanning portion and start issuing the queued up and sorted
  *	I/Os to reduce memory usage.
  *	This limit is calculated as a fraction of physmem (by default 5%).
  *	We constrain the lower bound of the hard limit to an absolute
  *	minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain
  *	the upper bound to 5% of the total pool size - no chance we'll
  *	ever need that much memory, but just to keep the value in check.
  * 2) Soft memory limit: once we hit the hard memory limit, we start
  *	issuing I/O to reduce queue memory usage, but we don't want to
  *	completely empty out the queues, since we might be able to find I/Os
  *	that will fill in the gaps of our non-sequential IOs at some point
  *	in the future. So we stop the issuing of I/Os once the amount of
  *	memory used drops below the soft limit (at which point we stop issuing
  *	I/O and start scanning metadata again).
  *
  *	This limit is calculated by subtracting a fraction of the hard
  *	limit from the hard limit. By default this fraction is 5%, so
  *	the soft limit is 95% of the hard limit. We cap the size of the
  *	difference between the hard and soft limits at an absolute
  *	maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is
  *	sufficient to not cause too frequent switching between the
  *	metadata scan and I/O issue (even at 2k recordsize, 128 MiB's
  *	worth of queues is about 1.2 GiB of on-pool data, so scanning
  *	that should take at least a decent fraction of a second).
  */
 static boolean_t
 dsl_scan_should_clear(dsl_scan_t *scn)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 	vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
 	uint64_t alloc, mlim_hard, mlim_soft, mused;
 
 	/* Allocated space across the normal, special and dedup classes. */
 	alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 	alloc += metaslab_class_get_alloc(spa_special_class(spa));
 	alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
 
 	/*
 	 * Hard limit: a fraction of physical memory, no lower than
 	 * zfs_scan_mem_lim_min and no higher than 1/20th of 'alloc'.
 	 * Soft limit: the hard limit minus a capped fraction of itself.
 	 */
 	mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE,
 	    zfs_scan_mem_lim_min);
 	mlim_hard = MIN(mlim_hard, alloc / 20);
 	mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact,
 	    zfs_scan_mem_lim_soft_max);
 	mused = 0;
 	/* Add up the estimated memory of each top-level vdev's scan queue. */
 	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
 		vdev_t *tvd = rvd->vdev_child[i];
 		dsl_scan_io_queue_t *queue;
 
 		mutex_enter(&tvd->vdev_scan_io_queue_lock);
 		queue = tvd->vdev_scan_io_queue;
 		if (queue != NULL) {
 			/*
 			 * # of extents in exts_by_addr = # in exts_by_size.
 			 * B-tree efficiency is ~75%, but can be as low as 50%.
 			 */
-			mused += zfs_btree_numnodes(&queue->q_exts_by_size) *
-			    ((sizeof (range_seg_gap_t) + sizeof (uint64_t)) *
+			mused += zfs_btree_numnodes(&queue->q_exts_by_size) * ((
+			    sizeof (zfs_range_seg_gap_t) + sizeof (uint64_t)) *
 			    3 / 2) + queue->q_sio_memused;
 		}
 		mutex_exit(&tvd->vdev_scan_io_queue_lock);
 	}
 
 	dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused);
 
 	/* No queue memory in use implies no queued I/O is pending. */
 	if (mused == 0)
 		ASSERT0(scn->scn_queues_pending);
 
 	/*
 	 * If we are above our hard limit, we need to clear out memory.
 	 * If we are below our soft limit, we need to accumulate sequential IOs.
 	 * Otherwise, we should keep doing whatever we are currently doing.
 	 */
 	if (mused >= mlim_hard)
 		return (B_TRUE);
 	else if (mused < mlim_soft)
 		return (B_FALSE);
 	else
 		return (scn->scn_clearing);
 }
 
 /*
  * Decide whether the scan should suspend at the given position.
  *
  * zb is the bookmark of the position being considered, or NULL (in which
  * case only the DDT bookmark is reported in the debug message).
  *
  * Returns B_TRUE when the scan must suspend; on the path that first decides
  * to suspend, scn_suspending is set and, for a non-NULL zb, the resume
  * position is recorded in scn_phys.scn_bookmark.  Returns B_FALSE otherwise.
  */
 static boolean_t
 dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
 {
 	/* we never skip user/group accounting objects */
 	if (zb && (int64_t)zb->zb_object < 0)
 		return (B_FALSE);
 
 	if (scn->scn_suspending)
 		return (B_TRUE); /* we're already suspending */
 
 	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
 		return (B_FALSE); /* we're resuming */
 
 	/* We only know how to resume from level-0 and objset blocks. */
 	if (zb && (zb->zb_level != 0 && zb->zb_level != ZB_ROOT_LEVEL))
 		return (B_FALSE);
 
 	/*
 	 * We suspend if:
 	 *  - we have scanned for at least the minimum time (default 1 sec
 	 *    for scrub, 3 sec for resilver), and either we have sufficient
 	 *    dirty data that we are starting to write more quickly
 	 *    (default 30%), someone is explicitly waiting for this txg
 	 *    to complete, or we have used up all of the time in the txg
 	 *    timeout (default 5 sec).
 	 *  or
 	 *  - the spa is shutting down because this pool is being exported
 	 *    or the machine is rebooting.
 	 *  or
 	 *  - the scan queue has reached its memory use limit
 	 *  or
 	 *  - ddt_walk_ready() reports that the DDT walk is not ready
 	 */
 	uint64_t curr_time_ns = gethrtime();
 	uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
 	uint64_t sync_time_ns = curr_time_ns -
 	    scn->scn_dp->dp_spa->spa_sync_starttime;
 	uint64_t dirty_min_bytes = zfs_dirty_data_max *
 	    zfs_vdev_async_write_active_min_dirty_percent / 100;
 	uint_t mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
 	    zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
 
 	if ((NSEC2MSEC(scan_time_ns) > mintime &&
 	    (scn->scn_dp->dp_dirty_total >= dirty_min_bytes ||
 	    txg_sync_waiting(scn->scn_dp) ||
 	    NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
 	    spa_shutting_down(scn->scn_dp->dp_spa) ||
 	    (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn)) ||
 	    !ddt_walk_ready(scn->scn_dp->dp_spa)) {
 		if (zb && zb->zb_level == ZB_ROOT_LEVEL) {
 			/*
 			 * For an objset (root-level) block only the objset
 			 * is kept in the resume bookmark.
 			 */
 			dprintf("suspending at first available bookmark "
 			    "%llx/%llx/%llx/%llx\n",
 			    (longlong_t)zb->zb_objset,
 			    (longlong_t)zb->zb_object,
 			    (longlong_t)zb->zb_level,
 			    (longlong_t)zb->zb_blkid);
 			SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
 			    zb->zb_objset, 0, 0, 0);
 		} else if (zb != NULL) {
 			dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n",
 			    (longlong_t)zb->zb_objset,
 			    (longlong_t)zb->zb_object,
 			    (longlong_t)zb->zb_level,
 			    (longlong_t)zb->zb_blkid);
 			scn->scn_phys.scn_bookmark = *zb;
 		} else {
 #ifdef ZFS_DEBUG
 			dsl_scan_phys_t *scnp = &scn->scn_phys;
 			dprintf("suspending at DDT bookmark "
 			    "%llx/%llx/%llx/%llx\n",
 			    (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
 			    (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
 			    (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
 			    (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
 #endif
 		}
 		scn->scn_suspending = B_TRUE;
 		return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 /*
  * Decide whether the error scrub should suspend for this txg.
  *
  * zb, when non-NULL, is only used for the debug message.  Returns B_TRUE if
  * the error scrub should suspend now, B_FALSE otherwise.
  */
 static boolean_t
 dsl_error_scrub_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
 {
 	/*
 	 * We suspend if:
 	 *  - we have scrubbed for at least the minimum time (default 1 sec
 	 *    for error scrub), and either someone is explicitly waiting for
 	 *    this txg to complete, or we have used up all of the time in the
 	 *    txg timeout (default 5 sec).
 	 *  or
 	 *  - the spa is shutting down because this pool is being exported
 	 *    or the machine is rebooting.
 	 */
 	uint64_t curr_time_ns = gethrtime();
 	uint64_t error_scrub_time_ns = curr_time_ns - scn->scn_sync_start_time;
 	uint64_t sync_time_ns = curr_time_ns -
 	    scn->scn_dp->dp_spa->spa_sync_starttime;
 	/*
 	 * Unsigned, as in dsl_scan_check_suspend(), so the comparison with
 	 * the elapsed time never involves a signed conversion.
 	 */
 	uint_t mintime = zfs_scrub_min_time_ms;
 
 	if ((NSEC2MSEC(error_scrub_time_ns) > mintime &&
 	    (txg_sync_waiting(scn->scn_dp) ||
 	    NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
 	    spa_shutting_down(scn->scn_dp->dp_spa)) {
 		if (zb) {
 			dprintf("error scrub suspending at bookmark "
 			    "%llx/%llx/%llx/%llx\n",
 			    (longlong_t)zb->zb_objset,
 			    (longlong_t)zb->zb_object,
 			    (longlong_t)zb->zb_level,
 			    (longlong_t)zb->zb_blkid);
 		}
 		return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 /*
  * Argument bundle handed to the ZIL parse callbacks (dsl_scan_zil_block and
  * dsl_scan_zil_record) by dsl_scan_zil().
  */
 typedef struct zil_scan_arg {
 	dsl_pool_t	*zsa_dp;	/* pool whose scan is visiting the ZIL */
 	zil_header_t	*zsa_zh;	/* header of the ZIL being parsed */
 } zil_scan_arg_t;
 
 /*
  * zil_parse() block callback: hand one ZIL log block to the scan function
  * selected by scn_phys.scn_func.  Holes and blocks born at or before
  * scn_cur_min_txg are skipped.  Always returns 0.
  */
 static int
 dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
     uint64_t claim_txg)
 {
 	(void) zilog;
 	zil_scan_arg_t *zsa = arg;
 	dsl_pool_t *pool = zsa->zsa_dp;
 	dsl_scan_t *scan = pool->dp_scan;
 	zbookmark_phys_t zb;
 
 	ASSERT(!BP_IS_REDACTED(bp));
 
 	if (BP_IS_HOLE(bp))
 		return (0);
 	if (BP_GET_LOGICAL_BIRTH(bp) <= scan->scn_phys.scn_cur_min_txg)
 		return (0);
 
 	/*
 	 * An unclaimed log may still hold one block ("stubby") that was
 	 * allocated long ago; it is on disk, so it is visited even though
 	 * a scrub has nothing to do to it.  Unclaimed blocks born at or
 	 * after the minimum claim txg are skipped.
 	 */
 	if (claim_txg == 0 &&
 	    BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(pool->dp_spa))
 		return (0);
 
 	SET_BOOKMARK(&zb,
 	    zsa->zsa_zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
 	VERIFY(0 == scan_funcs[scan->scn_phys.scn_func](pool, bp, &zb));
 	return (0);
 }
 
 /*
  * zil_parse() record callback: for a TX_WRITE record, hand the block pointer
  * embedded in the record to the scan function selected by
  * scn_phys.scn_func.  All other record types are ignored.  Always returns 0.
  */
 static int
 dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
     uint64_t claim_txg)
 {
 	(void) zilog;
 
 	/* Only TX_WRITE records are examined. */
 	if (lrc->lrc_txtype != TX_WRITE)
 		return (0);
 
 	zil_scan_arg_t *zsa = arg;
 	dsl_pool_t *pool = zsa->zsa_dp;
 	dsl_scan_t *scan = pool->dp_scan;
 	const lr_write_t *wr = (const lr_write_t *)lrc;
 	const blkptr_t *bp = &wr->lr_blkptr;
 	zbookmark_phys_t zb;
 
 	ASSERT(!BP_IS_REDACTED(bp));
 
 	if (BP_IS_HOLE(bp))
 		return (0);
 	if (BP_GET_LOGICAL_BIRTH(bp) <= scan->scn_phys.scn_cur_min_txg)
 		return (0);
 
 	/*
 	 * The birth txg can be below claim_txg when this record's txg has
 	 * already been synced while the enclosing log block still holds
 	 * other, unsynced records.
 	 */
 	if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg)
 		return (0);
 
 	ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
 	SET_BOOKMARK(&zb,
 	    zsa->zsa_zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    wr->lr_foid, ZB_ZIL_LEVEL,
 	    wr->lr_offset / BP_GET_LSIZE(bp));
 
 	VERIFY(0 == scan_funcs[scan->scn_phys.scn_func](pool, bp, &zb));
 	return (0);
 }
 
 /*
  * Walk the intent log described by 'zh' and feed its blocks and write
  * records to the scan callbacks.  Logs with a zero claim txg are skipped.
  */
 static void
 dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
 {
 	zil_scan_arg_t zsa;
 	zilog_t *zilog;
 	uint64_t claim_txg = zh->zh_claim_txg;
 
 	zsa.zsa_dp = dp;
 	zsa.zsa_zh = zh;
 
 	ASSERT(spa_writeable(dp->dp_spa));
 
 	/*
 	 * We only want to visit blocks that have been claimed but not yet
 	 * replayed (or, in read-only mode, blocks that *would* be claimed).
 	 */
 	if (claim_txg == 0)
 		return;
 
 	zilog = zil_alloc(dp->dp_meta_objset, zh);
 
 	/* The parse result is deliberately ignored. */
 	(void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
 	    claim_txg, B_FALSE);
 
 	zil_free(zilog);
 }
 
 /*
  * AVL comparator for scan_prefetch_issue_ctx_t nodes.  Nodes are compared by
  * bookmark, using each node's own data block size and indirect block shift,
  * so that the tree is sorted in the order the blocks will be needed.
  */
 static int
 scan_prefetch_queue_compare(const void *a, const void *b)
 {
 	const scan_prefetch_issue_ctx_t *lhs = a;
 	const scan_prefetch_issue_ctx_t *rhs = b;
 	const scan_prefetch_ctx_t *lhs_ctx = lhs->spic_spc;
 	const scan_prefetch_ctx_t *rhs_ctx = rhs->spic_spc;
 
 	return (zbookmark_compare(
 	    lhs_ctx->spc_datablkszsec, lhs_ctx->spc_indblkshift,
 	    rhs_ctx->spc_datablkszsec, rhs_ctx->spc_indblkshift,
 	    &lhs->spic_zb, &rhs->spic_zb));
 }
 
 /*
  * Drop the reference held by 'tag' on a prefetch context and free the
  * context once the last reference is gone.
  */
 static void
 scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, const void *tag)
 {
 	/* Other holders remain; nothing more to do. */
 	if (zfs_refcount_remove(&spc->spc_refcnt, tag) != 0)
 		return;
 
 	zfs_refcount_destroy(&spc->spc_refcnt);
 	kmem_free(spc, sizeof (scan_prefetch_ctx_t));
 }
 
 /*
  * Allocate a prefetch context for 'scn' holding one reference for 'tag'.
  * When 'dnp' is given, its data block size and indirect block shift are
  * copied into the context; with a NULL 'dnp' both are zero and the context
  * is flagged as a root context.
  */
 static scan_prefetch_ctx_t *
 scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, const void *tag)
 {
 	scan_prefetch_ctx_t *ctx =
 	    kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP);
 
 	zfs_refcount_create(&ctx->spc_refcnt);
 	zfs_refcount_add(&ctx->spc_refcnt, tag);
 	ctx->spc_scn = scn;
 
 	if (dnp == NULL) {
 		ctx->spc_root = B_TRUE;
 		ctx->spc_datablkszsec = 0;
 		ctx->spc_indblkshift = 0;
 		return (ctx);
 	}
 
 	ctx->spc_root = B_FALSE;
 	ctx->spc_datablkszsec = dnp->dn_datablkszsec;
 	ctx->spc_indblkshift = dnp->dn_indblkshift;
 	return (ctx);
 }
 
 /*
  * Take an additional reference on a prefetch context on behalf of 'tag'.
  */
 static void
 scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, const void *tag)
 {
 	/* The value returned by zfs_refcount_add() is not needed here. */
 	(void) zfs_refcount_add(&spc->spc_refcnt, tag);
 }
 
 /*
  * Empty the prefetch queue under spa_scrub_lock, dropping the context
  * reference held for 'scn' by each queued entry and freeing the entry.
  */
 static void
 scan_ds_prefetch_queue_clear(dsl_scan_t *scn)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 	void *cookie = NULL;
 
 	mutex_enter(&spa->spa_scrub_lock);
 	for (;;) {
 		scan_prefetch_issue_ctx_t *entry =
 		    avl_destroy_nodes(&scn->scn_prefetch_queue, &cookie);
 
 		if (entry == NULL)
 			break;
 		scan_prefetch_ctx_rele(entry->spic_spc, scn);
 		kmem_free(entry, sizeof (scan_prefetch_issue_ctx_t));
 	}
 	mutex_exit(&spa->spa_scrub_lock);
 }
 
 /*
  * Compare bookmark 'zb' against the scan's prefetch bookmark.
  *
  * Returns B_TRUE when 'zb' is in a different objset than the prefetch
  * bookmark, or when zbookmark_subtree_completed() reports its subtree as
  * completed relative to it.  Returns B_FALSE for objects whose number is
  * negative when viewed as int64_t, and in all remaining cases.
  */
 static boolean_t
 dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc,
     const zbookmark_phys_t *zb)
 {
 	zbookmark_phys_t *resume_zb = &spc->spc_scn->scn_prefetch_bookmark;
 	dnode_phys_t scratch;
 	dnode_phys_t *dnp;
 
 	if (zb->zb_objset != resume_zb->zb_objset)
 		return (B_TRUE);
 	if ((int64_t)zb->zb_object < 0)
 		return (B_FALSE);
 
 	/*
 	 * Only the two geometry fields of the scratch dnode are filled in;
 	 * a root context passes no dnode at all.
 	 */
 	scratch.dn_datablkszsec = spc->spc_datablkszsec;
 	scratch.dn_indblkshift = spc->spc_indblkshift;
 	dnp = spc->spc_root ? NULL : &scratch;
 
 	return (zbookmark_subtree_completed(dnp, zb, resume_zb) ?
 	    B_TRUE : B_FALSE);
 }
 
 /*
  * Queue block 'bp' (at bookmark 'zb') for prefetch on behalf of context
  * 'spc'.  Nothing is queued when prefetch is disabled, when the block is
  * redacted, a hole, or born at or before scn_cur_min_txg, when it is a
  * level-0 block that is neither a dnode nor an objset block, or when
  * dsl_scan_check_prefetch_resume() returns B_TRUE for it.
  */
 static void
 dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb)
 {
 	dsl_scan_t *scn = spc->spc_scn;
 	spa_t *spa = scn->scn_dp->dp_spa;
 	scan_prefetch_issue_ctx_t *entry;
 	avl_index_t where;
 
 	if (zfs_no_scrub_prefetch || BP_IS_REDACTED(bp))
 		return;
 
 	if (BP_IS_HOLE(bp))
 		return;
 	if (BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
 		return;
 	if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
 	    BP_GET_TYPE(bp) != DMU_OT_OBJSET)
 		return;
 
 	if (dsl_scan_check_prefetch_resume(spc, zb))
 		return;
 
 	/* The queued entry holds its own reference on the context. */
 	scan_prefetch_ctx_add_ref(spc, scn);
 	entry = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP);
 	entry->spic_spc = spc;
 	entry->spic_bp = *bp;
 	entry->spic_zb = *zb;
 
 	/*
 	 * Add the IO to the queue of blocks to prefetch. This allows us to
 	 * prioritize blocks that we will need first for the main traversal
 	 * thread.
 	 */
 	mutex_enter(&spa->spa_scrub_lock);
 	if (avl_find(&scn->scn_prefetch_queue, entry, &where) == NULL) {
 		avl_insert(&scn->scn_prefetch_queue, entry, where);
 		cv_broadcast(&spa->spa_scrub_io_cv);
 	} else {
 		/* this block is already queued for prefetch */
 		kmem_free(entry, sizeof (scan_prefetch_issue_ctx_t));
 		scan_prefetch_ctx_rele(spc, scn);
 	}
 	mutex_exit(&spa->spa_scrub_lock);
 }
 
 /*
  * Queue prefetches for every block pointer embedded in dnp, plus its
  * spill block pointer when DNODE_FLAG_SPILL_BLKPTR is set.  The bookmarks
  * handed to dsl_scan_prefetch() are built from objset and object.
  */
 static void
 dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp,
     uint64_t objset, uint64_t object)
 {
 	scan_prefetch_ctx_t *spc;
 	zbookmark_phys_t zb;
 	int slot;
 
 	/* A dnode with no block pointers and no spill has nothing to fetch. */
 	if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
 		return;
 
 	spc = scan_prefetch_ctx_create(scn, dnp, FTAG);
 
 	for (slot = 0; slot < dnp->dn_nblkptr; slot++) {
 		blkptr_t *child = &dnp->dn_blkptr[slot];
 
 		SET_BOOKMARK(&zb, objset, object, BP_GET_LEVEL(child), slot);
 		dsl_scan_prefetch(spc, child, &zb);
 	}
 
 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 		SET_BOOKMARK(&zb, objset, object, 0, DMU_SPILL_BLKID);
 		dsl_scan_prefetch(spc, DN_SPILL_BLKPTR(dnp), &zb);
 	}
 
 	/* Pairs with the FTAG passed to scan_prefetch_ctx_create() above. */
 	scan_prefetch_ctx_rele(spc, FTAG);
 }
 
 /*
  * Completion callback for the reads issued by the prefetch thread.
  * private is the scan_prefetch_ctx_t the read was issued for.
  *
  * The block's physical size is first subtracted from spa_scrub_inflight
  * and waiters on spa_scrub_io_cv are woken.  If a buffer was delivered
  * and prefetching has not been stopped, prefetches are then queued for
  * whatever the block refers to: child block pointers for an indirect
  * block, dnodes for a dnode block, and the meta/accounting dnodes for an
  * objset block.  The buffer (if any) is destroyed and the context
  * reference is released on every path.
  */
 static void
 dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
     arc_buf_t *buf, void *private)
 {
 	(void) zio;
 	scan_prefetch_ctx_t *spc = private;
 	dsl_scan_t *scn = spc->spc_scn;
 	spa_t *spa = scn->scn_dp->dp_spa;
 
 	/* Rate limiting: this IO is no longer in flight. */
 	mutex_enter(&spa->spa_scrub_lock);
 	ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
 	spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
 	cv_broadcast(&spa->spa_scrub_io_cv);
 	mutex_exit(&spa->spa_scrub_lock);
 
 	/* On error (no buffer) or once told to stop, only clean up. */
 	if (buf != NULL && !scn->scn_prefetch_stop) {
 		if (BP_GET_LEVEL(bp) > 0) {
 			int nbps = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 			blkptr_t *child = buf->b_data;
 			int n;
 
 			for (n = 0; n < nbps; n++, child++) {
 				zbookmark_phys_t child_zb;
 
 				SET_BOOKMARK(&child_zb, zb->zb_objset,
 				    zb->zb_object, zb->zb_level - 1,
 				    zb->zb_blkid * nbps + n);
 				dsl_scan_prefetch(spc, child, &child_zb);
 			}
 		} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
 			int ndnodes = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 			dnode_phys_t *dn = buf->b_data;
 			int slot = 0;
 
 			while (slot < ndnodes) {
 				/* A dnode spans 1 + dn_extra_slots slots. */
 				int stride;
 
 				dsl_scan_prefetch_dnode(scn, dn,
 				    zb->zb_objset, zb->zb_blkid * ndnodes + slot);
 				stride = dn->dn_extra_slots + 1;
 				slot += stride;
 				dn += stride;
 			}
 		} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 			objset_phys_t *osp = buf->b_data;
 
 			dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode,
 			    zb->zb_objset, DMU_META_DNODE_OBJECT);
 
 			if (OBJSET_BUF_HAS_USERUSED(buf)) {
 				if (OBJSET_BUF_HAS_PROJECTUSED(buf)) {
 					dsl_scan_prefetch_dnode(scn,
 					    &osp->os_projectused_dnode,
 					    zb->zb_objset,
 					    DMU_PROJECTUSED_OBJECT);
 				}
 				dsl_scan_prefetch_dnode(scn,
 				    &osp->os_groupused_dnode, zb->zb_objset,
 				    DMU_GROUPUSED_OBJECT);
 				dsl_scan_prefetch_dnode(scn,
 				    &osp->os_userused_dnode, zb->zb_objset,
 				    DMU_USERUSED_OBJECT);
 			}
 		}
 	}
 
 	if (buf != NULL)
 		arc_buf_destroy(buf, private);
 	scan_prefetch_ctx_rele(spc, scn);
 }
 
 /*
  * Body of the scan prefetch thread; arg is the dsl_scan_t.
  *
  * Until scn_prefetch_stop is set, repeatedly take the first entry of
  * scn_prefetch_queue and issue an asynchronous arc_read() for it with
  * dsl_scan_prefetch_cb() as the completion callback.  The thread sleeps
  * on spa_scrub_io_cv while the queue is empty or spa_scrub_inflight has
  * reached scn_maxinflight_bytes.  Once told to stop, it frees whatever is
  * still queued.
  */
 static void
 dsl_scan_prefetch_thread(void *arg)
 {
 	dsl_scan_t *scn = arg;
 	spa_t *spa = scn->scn_dp->dp_spa;
 	scan_prefetch_issue_ctx_t *spic;
 
 	/* loop until we are told to stop */
 	while (!scn->scn_prefetch_stop) {
 		arc_flags_t flags = ARC_FLAG_NOWAIT |
 		    ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH;
 		int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
 
 		mutex_enter(&spa->spa_scrub_lock);
 
 		/*
 		 * Wait until we have an IO to issue and are not above our
 		 * maximum in flight limit.
 		 */
 		while (!scn->scn_prefetch_stop &&
 		    (avl_numnodes(&scn->scn_prefetch_queue) == 0 ||
 		    spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) {
 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 		}
 
 		/* recheck if we should stop since we waited for the cv */
 		if (scn->scn_prefetch_stop) {
 			mutex_exit(&spa->spa_scrub_lock);
 			break;
 		}
 
 		/*
 		 * remove the prefetch IO from the tree; its physical size is
 		 * charged to spa_scrub_inflight here and subtracted again in
 		 * dsl_scan_prefetch_cb()
 		 */
 		spic = avl_first(&scn->scn_prefetch_queue);
 		spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp);
 		avl_remove(&scn->scn_prefetch_queue, spic);
 
 		mutex_exit(&spa->spa_scrub_lock);
 
 		/* protected blocks are read with ZIO_FLAG_RAW */
 		if (BP_IS_PROTECTED(&spic->spic_bp)) {
 			ASSERT(BP_GET_TYPE(&spic->spic_bp) == DMU_OT_DNODE ||
 			    BP_GET_TYPE(&spic->spic_bp) == DMU_OT_OBJSET);
 			ASSERT3U(BP_GET_LEVEL(&spic->spic_bp), ==, 0);
 			zio_flags |= ZIO_FLAG_RAW;
 		}
 
 		/* We don't need data L1 buffer since we do not prefetch L0. */
 		blkptr_t *bp = &spic->spic_bp;
 		if (BP_GET_LEVEL(bp) == 1 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
 		    BP_GET_TYPE(bp) != DMU_OT_OBJSET)
 			flags |= ARC_FLAG_NO_BUF;
 
 		/* issue the prefetch asynchronously */
 		(void) arc_read(scn->scn_zio_root, spa, bp,
 		    dsl_scan_prefetch_cb, spic->spic_spc, ZIO_PRIORITY_SCRUB,
 		    zio_flags, &flags, &spic->spic_zb);
 
 		/*
 		 * spic_spc was passed to arc_read() as the callback argument;
 		 * only the queue entry itself is freed here.
 		 */
 		kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
 	}
 
 	ASSERT(scn->scn_prefetch_stop);
 
 	/*
 	 * free any prefetches we didn't get to complete; these entries were
 	 * never handed to arc_read(), so the context reference each one took
 	 * in dsl_scan_prefetch() is released here rather than in the callback
 	 */
 	mutex_enter(&spa->spa_scrub_lock);
 	while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) {
 		avl_remove(&scn->scn_prefetch_queue, spic);
 		scan_prefetch_ctx_rele(spic->spic_spc, scn);
 		kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
 	}
 	ASSERT0(avl_numnodes(&scn->scn_prefetch_queue));
 	mutex_exit(&spa->spa_scrub_lock);
 }
 
 /*
  * Resume handling for the main traversal.
  *
  * Returns B_TRUE when the block at zb, and everything below it, was
  * already completed relative to the saved scan bookmark and can be
  * skipped.  Returns B_FALSE otherwise.  As a side effect, the saved
  * bookmark is zeroed once zbookmark_subtree_tbd() reports that the
  * traversal has reached or passed the resume point.
  */
 static boolean_t
 dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
     const zbookmark_phys_t *zb)
 {
 	zbookmark_phys_t *resume_zb = &scn->scn_phys.scn_bookmark;
 
 	/*
 	 * Nothing to do without a saved bookmark.  User/group accounting
 	 * objects (obj<0) are never skipped either.
 	 */
 	if (ZB_IS_ZERO(resume_zb) || (int64_t)zb->zb_object < 0)
 		return (B_FALSE);
 
 	/*
 	 * This bp and everything below it was visited in a prior txg
 	 * sync; don't bother doing it again.
 	 */
 	if (zbookmark_subtree_completed(dnp, zb, resume_zb))
 		return (B_TRUE);
 
 	/*
 	 * We found the block we were trying to resume from, or went past
 	 * it.  Zero the bookmark to indicate that it's OK to start
 	 * checking for suspending again.
 	 */
 	if (zbookmark_subtree_tbd(dnp, zb, resume_zb)) {
 		dprintf("resuming at %llx/%llx/%llx/%llx\n",
 		    (longlong_t)zb->zb_objset,
 		    (longlong_t)zb->zb_object,
 		    (longlong_t)zb->zb_level,
 		    (longlong_t)zb->zb_blkid);
 		memset(resume_zb, 0, sizeof (*resume_zb));
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Forward declarations.  dsl_scan_recurse() below calls both of these,
  * while dsl_scan_visitbp() calls dsl_scan_recurse() and
  * dsl_scan_visitdnode() calls dsl_scan_visitbp(), so the three functions
  * are mutually recursive.
  */
 static void dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb,
     dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
     dmu_objset_type_t ostype, dmu_tx_t *tx);
 inline __attribute__((always_inline)) static void dsl_scan_visitdnode(
     dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype,
     dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
 
 /*
  * Visit everything the block at bp refers to: the child block pointers of
  * an indirect block, the dnodes of a dnode block, or the meta and
  * accounting dnodes of an objset block.  Such blocks are read
  * synchronously through arc_read().  Any other block is only sanity
  * checked with zfs_blkptr_verify().
  *
  * Returns 0 on success.  Returns nonzero on i/o error, and EINVAL when the
  * dnode's bonus length or the block pointer fails its sanity check; in
  * each of these cases scn_phys.scn_errors is incremented.
  */
 inline __attribute__((always_inline)) static int
 dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
     dnode_phys_t *dnp, const blkptr_t *bp,
     const zbookmark_phys_t *zb, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	spa_t *spa = dp->dp_spa;
 	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
 	int err;
 
 	ASSERT(!BP_IS_REDACTED(bp));
 
 	/*
 	 * There is an unlikely case of encountering dnodes with contradicting
 	 * dn_bonuslen and DNODE_FLAG_SPILL_BLKPTR flag before in files created
 	 * or modified before commit 4254acb was merged. As it is not possible
 	 * to know which of the two is correct, report an error.
 	 */
 	if (dnp != NULL &&
 	    dnp->dn_bonuslen > DN_MAX_BONUS_LEN(dnp)) {
 		scn->scn_phys.scn_errors++;
 		spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (BP_GET_LEVEL(bp) > 0) {
 		/* indirect block: visit each child block pointer */
 		arc_flags_t flags = ARC_FLAG_WAIT;
 		int i;
 		blkptr_t *cbp;
 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 		arc_buf_t *buf;
 
 		err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
 		}
 		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
 			zbookmark_phys_t czb;
 
 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 			    zb->zb_level - 1,
 			    zb->zb_blkid * epb + i);
 			dsl_scan_visitbp(cbp, &czb, dnp,
 			    ds, scn, ostype, tx);
 		}
 		arc_buf_destroy(buf, &buf);
 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
 		/* level-0 dnode block: visit each dnode stored in it */
 		arc_flags_t flags = ARC_FLAG_WAIT;
 		dnode_phys_t *cdnp;
 		int i;
 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 		arc_buf_t *buf;
 
 		if (BP_IS_PROTECTED(bp)) {
 			ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
 			zio_flags |= ZIO_FLAG_RAW;
 		}
 
 		err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
 		}
 		/* each dnode occupies 1 + dn_extra_slots slots */
 		for (i = 0, cdnp = buf->b_data; i < epb;
 		    i += cdnp->dn_extra_slots + 1,
 		    cdnp += cdnp->dn_extra_slots + 1) {
 			dsl_scan_visitdnode(scn, ds, ostype,
 			    cdnp, zb->zb_blkid * epb + i, tx);
 		}
 
 		arc_buf_destroy(buf, &buf);
 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 		/* objset block: visit the meta dnode and accounting dnodes */
 		arc_flags_t flags = ARC_FLAG_WAIT;
 		objset_phys_t *osp;
 		arc_buf_t *buf;
 
 		err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
 		}
 
 		osp = buf->b_data;
 
 		dsl_scan_visitdnode(scn, ds, osp->os_type,
 		    &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx);
 
 		if (OBJSET_BUF_HAS_USERUSED(buf)) {
 			/*
 			 * We also always visit user/group/project accounting
 			 * objects, and never skip them, even if we are
 			 * suspending. This is necessary so that the
 			 * space deltas from this txg get integrated.
 			 */
 			if (OBJSET_BUF_HAS_PROJECTUSED(buf))
 				dsl_scan_visitdnode(scn, ds, osp->os_type,
 				    &osp->os_projectused_dnode,
 				    DMU_PROJECTUSED_OBJECT, tx);
 			dsl_scan_visitdnode(scn, ds, osp->os_type,
 			    &osp->os_groupused_dnode,
 			    DMU_GROUPUSED_OBJECT, tx);
 			dsl_scan_visitdnode(scn, ds, osp->os_type,
 			    &osp->os_userused_dnode,
 			    DMU_USERUSED_OBJECT, tx);
 		}
 		arc_buf_destroy(buf, &buf);
 	} else if (!zfs_blkptr_verify(spa, bp,
 	    BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
 		/*
 		 * Sanity check the block pointer contents, this is handled
 		 * by arc_read() for the cases above.
 		 */
 		scn->scn_phys.scn_errors++;
 		spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
 		return (SET_ERROR(EINVAL));
 	}
 
 	return (0);
 }
 
 /*
  * Visit every block pointer embedded in dnp, followed by its spill block
  * pointer when DNODE_FLAG_SPILL_BLKPTR is set.  Bookmarks use the
  * dataset's object number as the objset (0 when ds is NULL) and the given
  * object number.
  */
 inline __attribute__((always_inline)) static void
 dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
     dmu_objset_type_t ostype, dnode_phys_t *dnp,
     uint64_t object, dmu_tx_t *tx)
 {
 	zbookmark_phys_t child_zb;
 	int slot;
 
 	for (slot = 0; slot < dnp->dn_nblkptr; slot++) {
 		/* Embedded block pointers sit at the dnode's top level. */
 		SET_BOOKMARK(&child_zb, ds ? ds->ds_object : 0, object,
 		    dnp->dn_nlevels - 1, slot);
 		dsl_scan_visitbp(&dnp->dn_blkptr[slot],
 		    &child_zb, dnp, ds, scn, ostype, tx);
 	}
 
 	if (!(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
 		return;
 
 	SET_BOOKMARK(&child_zb, ds ? ds->ds_object : 0, object,
 	    0, DMU_SPILL_BLKID);
 	dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp),
 	    &child_zb, dnp, ds, scn, ostype, tx);
 }
 
 /*
  * Visit a single block pointer during the scan traversal.
  *
  * The block is dropped early when the scan should suspend, when the
  * resume check says it was already handled, or when it is a hole,
  * redacted, or born at or before scn_cur_min_txg.  Otherwise its children
  * are visited through dsl_scan_recurse().  The scan function selected by
  * scn_phys.scn_func is then called on the block itself, unless
  * ddt_class_contains() reports it as already covered or it was born after
  * scn_cur_max_txg.  Per-txg statistics counters are updated along the way.
  *
  * The arguments are in this order because mdb can only print the
  * first 5; we want them to be useful.
  */
 static void
 dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb,
     dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
     dmu_objset_type_t ostype, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 
 	if (dsl_scan_check_suspend(scn, zb))
 		return;
 
 	if (dsl_scan_check_resume(scn, dnp, zb))
 		return;
 
 	/* counted as visited even if it turns out to be a hole below */
 	scn->scn_visited_this_txg++;
 
 	if (BP_IS_HOLE(bp)) {
 		scn->scn_holes_this_txg++;
 		return;
 	}
 
 	if (BP_IS_REDACTED(bp)) {
 		ASSERT(dsl_dataset_feature_is_active(ds,
 		    SPA_FEATURE_REDACTED_DATASETS));
 		return;
 	}
 
 	/*
 	 * Check if this block contradicts any filesystem flags.
 	 */
 	spa_feature_t f = SPA_FEATURE_LARGE_BLOCKS;
 	if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE)
 		ASSERT(dsl_dataset_feature_is_active(ds, f));
 
 	f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
 	if (f != SPA_FEATURE_NONE)
 		ASSERT(dsl_dataset_feature_is_active(ds, f));
 
 	f = zio_compress_to_feature(BP_GET_COMPRESS(bp));
 	if (f != SPA_FEATURE_NONE)
 		ASSERT(dsl_dataset_feature_is_active(ds, f));
 
 	if (BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) {
 		scn->scn_lt_min_this_txg++;
 		return;
 	}
 
 	/* children first; on failure the block itself is not processed */
 	if (dsl_scan_recurse(scn, ds, ostype, dnp, bp, zb, tx) != 0)
 		return;
 
 	/*
 	 * If dsl_scan_ddt() has already visited this block, it will have
 	 * already done any translations or scrubbing, so don't call the
 	 * callback again.
 	 */
 	if (ddt_class_contains(dp->dp_spa,
 	    scn->scn_phys.scn_ddt_class_max, bp)) {
 		scn->scn_ddt_contained_this_txg++;
 		return;
 	}
 
 	/*
 	 * If this block is from the future (after cur_max_txg), then we
 	 * are doing this on behalf of a deleted snapshot, and we will
 	 * revisit the future block on the next pass of this dataset.
 	 * Don't scan it now unless we need to because something
 	 * under it was modified.
 	 */
 	if (BP_GET_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
 		scn->scn_gt_max_this_txg++;
 		return;
 	}
 
 	scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
 }
 
 /*
  * Start the traversal of one objset at its root block pointer bp.  ds is
  * the dataset being scanned; when it is NULL the bookmark uses
  * DMU_META_OBJSET as the objset.
  *
  * The prefetch bookmark is seeded from the saved scan bookmark (or from
  * the start of this objset when nothing is saved), a prefetch of the root
  * block is queued, and the root block is then visited.
  */
 static void
 dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
     dmu_tx_t *tx)
 {
 	scan_prefetch_ctx_t *root_spc;
 	zbookmark_phys_t root_zb;
 
 	SET_BOOKMARK(&root_zb, ds ? ds->ds_object : DMU_META_OBJSET,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 
 	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) {
 		/* Resuming: prefetch picks up where the scan left off. */
 		scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark;
 	} else {
 		SET_BOOKMARK(&scn->scn_prefetch_bookmark,
 		    root_zb.zb_objset, 0, 0, 0);
 	}
 
 	scn->scn_objsets_visited_this_txg++;
 
 	/* NULL dnode: this is the context for the root block itself. */
 	root_spc = scan_prefetch_ctx_create(scn, NULL, FTAG);
 	dsl_scan_prefetch(root_spc, bp, &root_zb);
 	scan_prefetch_ctx_rele(root_spc, FTAG);
 
 	dsl_scan_visitbp(bp, &root_zb, NULL, ds, scn, DMU_OST_NONE, tx);
 
 	dprintf_ds(ds, "finished scan%s", "");
 }
 
 /*
  * Fix up the bookmark in scn_phys when the dataset it points at (ds) is
  * being destroyed.  Does nothing if the bookmark refers to some other
  * objset.
  *
  * For a snapshot the bookmark is redirected to the next snapshot and
  * DSF_VISIT_DS_AGAIN is set; for anything else the bookmark is reset to
  * ZB_DESTROYED_OBJSET.
  */
 static void
 ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys)
 {
 	uint64_t next_obj;
 
 	if (scn_phys->scn_bookmark.zb_objset != ds->ds_object)
 		return;
 
 	if (!ds->ds_is_snapshot) {
 		SET_BOOKMARK(&scn_phys->scn_bookmark,
 		    ZB_DESTROYED_OBJSET, 0, 0, 0);
 		zfs_dbgmsg("destroying ds %llu on %s; currently "
 		    "traversing; reset bookmark to -1,0,0,0",
 		    (u_longlong_t)ds->ds_object,
 		    ds->ds_dir->dd_pool->dp_spa->spa_name);
 		return;
 	}
 
 	/*
 	 * Note:
 	 *  - scn_cur_{min,max}_txg stays the same.
 	 *  - Setting the flag is not really necessary if
 	 *    scn_cur_max_txg == scn_max_txg, because there is nothing
 	 *    after this snapshot that we care about.  However, we set it
 	 *    anyway and then ignore it when we retraverse it in
 	 *    dsl_scan_visitds().
 	 */
 	next_obj = dsl_dataset_phys(ds)->ds_next_snap_obj;
 	scn_phys->scn_bookmark.zb_objset = next_obj;
 	zfs_dbgmsg("destroying ds %llu on %s; currently "
 	    "traversing; reset zb_objset to %llu",
 	    (u_longlong_t)ds->ds_object,
 	    ds->ds_dir->dd_pool->dp_spa->spa_name,
 	    (u_longlong_t)next_obj);
 	scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN;
 }
 
 /*
  * Called when dataset ds is destroyed.  Does nothing unless a scan is
  * running.  Otherwise:
  *
  * 1) If ds is the dataset currently being scanned, both copies of the
  *	scan state (scn_phys and scn_phys_cached) get their bookmark
  *	fixed up by ds_destroyed_scn_phys().
  * 2) ds is removed from the in-memory work queue and from the on-disk
  *	queue object, if it was present in either.
  *
  * When ds is a snapshot, the next snapshot in line takes its place in the
  * queues, keeping the same mintxg, instead of the entry simply going away.
  */
 void
 dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 	uint64_t mintxg;
 	int lookup_err;
 
 	if (!dsl_scan_is_running(scn))
 		return;
 
 	ds_destroyed_scn_phys(ds, &scn->scn_phys);
 	ds_destroyed_scn_phys(ds, &scn->scn_phys_cached);
 
 	/* In-memory work queue. */
 	if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
 		scan_ds_queue_remove(scn, ds->ds_object);
 		if (ds->ds_is_snapshot) {
 			scan_ds_queue_insert(scn,
 			    dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg);
 		}
 	}
 
 	/* On-disk work queue. */
 	lookup_err = zap_lookup_int_key(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg);
 	if (lookup_err == 0) {
 		ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
 		if (!ds->ds_is_snapshot) {
 			zfs_dbgmsg("destroying ds %llu on %s; in queue; "
 			    "removing",
 			    (u_longlong_t)ds->ds_object,
 			    dp->dp_spa->spa_name);
 		} else {
 			/*
 			 * The mintxg is carried over unchanged; it could
 			 * be > ds_creation_txg if the previous snapshot
 			 * was deleted too.
 			 */
 			VERIFY(zap_add_int_key(dp->dp_meta_objset,
 			    scn->scn_phys.scn_queue_obj,
 			    dsl_dataset_phys(ds)->ds_next_snap_obj,
 			    mintxg, tx) == 0);
 			zfs_dbgmsg("destroying ds %llu on %s; in queue; "
 			    "replacing with %llu",
 			    (u_longlong_t)ds->ds_object,
 			    dp->dp_spa->spa_name,
 			    (u_longlong_t)dsl_dataset_phys(ds)->
 			    ds_next_snap_obj);
 		}
 	}
 
 	/*
 	 * dsl_scan_sync() should run after this and sync out the changed
 	 * state, but do it here as well just to be safe.
 	 */
 	dsl_scan_sync_state(scn, tx, SYNC_CACHED);
 }
 
 /*
  * If scn_bookmark refers to ds, redirect it to ds's previous snapshot
  * object (ds_prev_snap_obj) and log the change.  Otherwise do nothing.
  */
 static void
 ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark)
 {
 	uint64_t prev_obj;
 
 	if (scn_bookmark->zb_objset != ds->ds_object)
 		return;
 
 	prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	scn_bookmark->zb_objset = prev_obj;
 	zfs_dbgmsg("snapshotting ds %llu on %s; currently traversing; "
 	    "reset zb_objset to %llu",
 	    (u_longlong_t)ds->ds_object,
 	    ds->ds_dir->dd_pool->dp_spa->spa_name,
 	    (u_longlong_t)prev_obj);
 }
 
 /*
  * Called when dataset ds is snapshotted.  Does nothing unless a scan is
  * running.
  *
  * Wherever the scan state refers to ds (the bookmark in scn_phys and
  * scn_phys_cached, the in-memory work queue, and the on-disk queue
  * object), the reference is replaced with ds's ds_prev_snap_obj, keeping
  * the queued mintxg.
  */
 void
 dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 	uint64_t mintxg;
 	uint64_t prev_obj;
 	int lookup_err;
 
 	if (!dsl_scan_is_running(scn))
 		return;
 
 	ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
 	prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 
 	ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark);
 	ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark);
 
 	/* In-memory work queue. */
 	if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
 		scan_ds_queue_remove(scn, ds->ds_object);
 		scan_ds_queue_insert(scn, prev_obj, mintxg);
 	}
 
 	/* On-disk work queue. */
 	lookup_err = zap_lookup_int_key(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg);
 	if (lookup_err == 0) {
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
 		VERIFY(zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj,
 		    prev_obj, mintxg, tx) == 0);
 		zfs_dbgmsg("snapshotting ds %llu on %s; in queue; "
 		    "replacing with %llu",
 		    (u_longlong_t)ds->ds_object,
 		    dp->dp_spa->spa_name,
 		    (u_longlong_t)prev_obj);
 	}
 
 	dsl_scan_sync_state(scn, tx, SYNC_CACHED);
 }
 
 /*
  * If scn_bookmark refers to one of the two swapped datasets, point it at
  * the other one and log the change.  A bookmark that refers to neither is
  * left alone.  ds1 is checked before ds2.
  */
 static void
 ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2,
     zbookmark_phys_t *scn_bookmark)
 {
 	dsl_dataset_t *from;
 	dsl_dataset_t *to;
 
 	if (scn_bookmark->zb_objset == ds1->ds_object) {
 		from = ds1;
 		to = ds2;
 	} else if (scn_bookmark->zb_objset == ds2->ds_object) {
 		from = ds2;
 		to = ds1;
 	} else {
 		return;
 	}
 
 	scn_bookmark->zb_objset = to->ds_object;
 	zfs_dbgmsg("clone_swap ds %llu on %s; currently traversing; "
 	    "reset zb_objset to %llu",
 	    (u_longlong_t)from->ds_object,
 	    from->ds_dir->dd_pool->dp_spa->spa_name,
 	    (u_longlong_t)to->ds_object);
 }
 
 /*
  * Called when an origin dataset and its clone are swapped.  If we were
  * currently traversing the dataset, we need to switch to traversing the
  * newly promoted clone.
  *
  * Does nothing unless a scan is running.  The bookmarks in scn_phys and
  * scn_phys_cached are swapped first.  The in-memory work queue and the
  * on-disk queue object are then updated independently of each other: in
  * each, if exactly one of ds1/ds2 is queued, its entry is replaced by an
  * entry for the other dataset with the same mintxg.  If both or neither
  * are queued, that queue is left unchanged.
  */
 void
 dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 	uint64_t mintxg1, mintxg2;
 	boolean_t ds1_queued, ds2_queued;
 
 	if (!dsl_scan_is_running(scn))
 		return;
 
 	ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark);
 	ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark);
 
 	/*
 	 * Handle the in-memory scan queue.
 	 */
 	ds1_queued = scan_ds_queue_contains(scn, ds1->ds_object, &mintxg1);
 	ds2_queued = scan_ds_queue_contains(scn, ds2->ds_object, &mintxg2);
 
 	/* Sanity checking. */
 	if (ds1_queued) {
 		ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
 		ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
 	}
 	if (ds2_queued) {
 		ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
 		ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
 	}
 
 	if (ds1_queued && ds2_queued) {
 		/*
 		 * If both are queued, we don't need to do anything.
 		 * The swapping code below would not handle this case correctly,
 		 * since we can't insert ds2 if it is already there. That's
 		 * because scan_ds_queue_insert() prohibits a duplicate insert
 		 * and panics.
 		 */
 	} else if (ds1_queued) {
 		scan_ds_queue_remove(scn, ds1->ds_object);
 		scan_ds_queue_insert(scn, ds2->ds_object, mintxg1);
 	} else if (ds2_queued) {
 		scan_ds_queue_remove(scn, ds2->ds_object);
 		scan_ds_queue_insert(scn, ds1->ds_object, mintxg2);
 	}
 
 	/*
 	 * Handle the on-disk scan queue.
 	 * The on-disk state is an out-of-date version of the in-memory state,
 	 * so the in-memory and on-disk values for ds1_queued and ds2_queued may
 	 * be different. Therefore we need to apply the swap logic to the
 	 * on-disk state independently of the in-memory state.
 	 */
 	ds1_queued = zap_lookup_int_key(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg1) == 0;
 	ds2_queued = zap_lookup_int_key(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg2) == 0;
 
 	/* Sanity checking. */
 	if (ds1_queued) {
 		ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
 		ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
 	}
 	if (ds2_queued) {
 		ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
 		ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
 	}
 
 	if (ds1_queued && ds2_queued) {
 		/*
 		 * If both are queued, we don't need to do anything.
 		 * Alternatively, we could check for EEXIST from
 		 * zap_add_int_key() and back out to the original state, but
 		 * that would be more work than checking for this case upfront.
 		 */
 	} else if (ds1_queued) {
 		VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
 		VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg1, tx));
 		zfs_dbgmsg("clone_swap ds %llu on %s; in queue; "
 		    "replacing with %llu",
 		    (u_longlong_t)ds1->ds_object,
 		    dp->dp_spa->spa_name,
 		    (u_longlong_t)ds2->ds_object);
 	} else if (ds2_queued) {
 		VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
 		VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg2, tx));
 		zfs_dbgmsg("clone_swap ds %llu on %s; in queue; "
 		    "replacing with %llu",
 		    (u_longlong_t)ds2->ds_object,
 		    dp->dp_spa->spa_name,
 		    (u_longlong_t)ds1->ds_object);
 	}
 
 	/* write out the updated scan state, as the other ds hooks do */
 	dsl_scan_sync_state(scn, tx, SYNC_CACHED);
 }
 
 /*
  * dmu_objset_find_dp() callback used to queue the clones of one origin.
  * arg points to the origin's object number.
  *
  * Head datasets whose dsl_dir does not have that origin are ignored.  For
  * a match, the snapshot chain is walked back from the head until the
  * dataset whose previous snapshot is the origin is reached, and that
  * dataset is inserted into the scan's work queue with its
  * ds_prev_snap_txg.
  *
  * Returns 0 on success, or the error from dsl_dataset_hold_obj().
  */
 static int
 enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 {
 	uint64_t originobj = *(uint64_t *)arg;
 	dsl_scan_t *scn = dp->dp_scan;
 	dsl_dataset_t *ds;
 	int err;
 
 	if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj)
 		return (0);
 
 	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
 	if (err != 0)
 		return (err);
 
 	for (;;) {
 		uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 		dsl_dataset_t *prev;
 
 		if (prev_obj == originobj)
 			break;
 
 		/* The current hold is dropped whether or not this works. */
 		err = dsl_dataset_hold_obj(dp, prev_obj, FTAG, &prev);
 		dsl_dataset_rele(ds, FTAG);
 		if (err != 0)
 			return (err);
 		ds = prev;
 	}
 
 	mutex_enter(&scn->scn_queue_lock);
 	scan_ds_queue_insert(scn, ds->ds_object,
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg);
 	mutex_exit(&scn->scn_queue_lock);
 
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 /*
  * Scan one dataset (object number dsobj) and queue up its descendants.
  *
  * The dataset is held for the duration of the call.  If scn_cur_min_txg
  * is already >= scn_max_txg, the dataset is skipped entirely.  Otherwise
  * the ZIL of a head dataset is traversed, the dataset's root block
  * pointer is visited, and, unless the scan is suspending or the dataset
  * has to be visited again (DSF_VISIT_DS_AGAIN), the next snapshot and any
  * clones are added to the work queue.
  */
 static void
 dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	dsl_dataset_t *ds;
 
 	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 
 	if (scn->scn_phys.scn_cur_min_txg >=
 	    scn->scn_phys.scn_max_txg) {
 		/*
 		 * This can happen if this snapshot was created after the
 		 * scan started, and we already completed a previous snapshot
 		 * that was created after the scan started.  This snapshot
 		 * only references blocks with:
 		 *
 		 *	birth < our ds_creation_txg
 		 *	cur_min_txg is no less than ds_creation_txg.
 		 *	We have already visited these blocks.
 		 * or
 		 *	birth > scn_max_txg
 		 *	The scan requested not to visit these blocks.
 		 *
 		 * Subsequent snapshots (and clones) can reference our
 		 * blocks, or blocks with even higher birth times.
 		 * Therefore we do not need to visit them either,
 		 * so we do not add them to the work queue.
 		 *
 		 * Note that checking for cur_min_txg >= cur_max_txg
 		 * is not sufficient, because in that case we may need to
 		 * visit subsequent snapshots.  This happens when min_txg > 0,
 		 * which raises cur_min_txg.  In this case we will visit
 		 * this dataset but skip all of its blocks, because the
 		 * rootbp's birth time is < cur_min_txg.  Then we will
 		 * add the next snapshots/clones to the work queue.
 		 */
 		char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
 		dsl_dataset_name(ds, dsname);
 		zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because "
 		    "cur_min_txg (%llu) >= max_txg (%llu)",
 		    (u_longlong_t)dsobj, dsname,
 		    (u_longlong_t)scn->scn_phys.scn_cur_min_txg,
 		    (u_longlong_t)scn->scn_phys.scn_max_txg);
 		/* The size given to kmem_free() must match the allocation. */
 		kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
 
 		goto out;
 	}
 
 	/*
 	 * Only the ZIL in the head (non-snapshot) is valid. Even though
 	 * snapshots can have ZIL block pointers (which may be the same
 	 * BP as in the head), they must be ignored. In addition, $ORIGIN
 	 * doesn't have a objset (i.e. its ds_bp is a hole) so we don't
 	 * need to look for a ZIL in it either. So we traverse the ZIL here,
 	 * rather than in scan_recurse(), because the regular snapshot
 	 * block-sharing rules don't apply to it.
 	 */
 	if (!dsl_dataset_is_snapshot(ds) &&
 	    (dp->dp_origin_snap == NULL ||
 	    ds->ds_dir != dp->dp_origin_snap->ds_dir)) {
 		objset_t *os;
 		if (dmu_objset_from_ds(ds, &os) != 0) {
 			goto out;
 		}
 		dsl_scan_zil(dp, &os->os_zil_header);
 	}
 
 	/*
 	 * Iterate over the bps in this ds.
 	 */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 	dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx);
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 
 	char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
 	dsl_dataset_name(ds, dsname);
 	zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
 	    "suspending=%u",
 	    (u_longlong_t)dsobj, dsname,
 	    (u_longlong_t)scn->scn_phys.scn_cur_min_txg,
 	    (u_longlong_t)scn->scn_phys.scn_cur_max_txg,
 	    (int)scn->scn_suspending);
 	kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
 
 	if (scn->scn_suspending)
 		goto out;
 
 	/*
 	 * We've finished this pass over this dataset.
 	 */
 
 	/*
 	 * If we did not completely visit this dataset, do another pass.
 	 */
 	if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
 		zfs_dbgmsg("incomplete pass on %s; visiting again",
 		    dp->dp_spa->spa_name);
 		scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
 		scan_ds_queue_insert(scn, ds->ds_object,
 		    scn->scn_phys.scn_cur_max_txg);
 		goto out;
 	}
 
 	/*
 	 * Add descendant datasets to work queue.
 	 */
 	if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
 		scan_ds_queue_insert(scn,
 		    dsl_dataset_phys(ds)->ds_next_snap_obj,
 		    dsl_dataset_phys(ds)->ds_creation_txg);
 	}
 	if (dsl_dataset_phys(ds)->ds_num_children > 1) {
 		boolean_t usenext = B_FALSE;
 		if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
 			uint64_t count;
 			/*
 			 * A bug in a previous version of the code could
 			 * cause upgrade_clones_cb() to not set
 			 * ds_next_snap_obj when it should, leading to a
 			 * missing entry.  Therefore we can only use the
 			 * next_clones_obj when its count is correct.
 			 */
 			int err = zap_count(dp->dp_meta_objset,
 			    dsl_dataset_phys(ds)->ds_next_clones_obj, &count);
 			if (err == 0 &&
 			    count == dsl_dataset_phys(ds)->ds_num_children - 1)
 				usenext = B_TRUE;
 		}
 
 		if (usenext) {
 			/* Queue every clone listed in ds_next_clones_obj. */
 			zap_cursor_t zc;
 			zap_attribute_t *za = zap_attribute_alloc();
 			for (zap_cursor_init(&zc, dp->dp_meta_objset,
 			    dsl_dataset_phys(ds)->ds_next_clones_obj);
 			    zap_cursor_retrieve(&zc, za) == 0;
 			    (void) zap_cursor_advance(&zc)) {
 				scan_ds_queue_insert(scn,
 				    zfs_strtonum(za->za_name, NULL),
 				    dsl_dataset_phys(ds)->ds_creation_txg);
 			}
 			zap_cursor_fini(&zc);
 			zap_attribute_free(za);
 		} else {
 			/* Fall back to searching the pool for our clones. */
 			VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 			    enqueue_clones_cb, &ds->ds_object,
 			    DS_FIND_CHILDREN));
 		}
 	}
 
 out:
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /*
  * Dataset-iteration callback (arg is unused) that queues the oldest
  * dataset in hds's snapshot chain.
  *
  * The chain is walked back through ds_prev_snap_obj until a dataset with
  * no previous snapshot is found; that dataset is inserted into the scan's
  * work queue with its ds_prev_snap_txg.  If a step of the walk shows that
  * the current dataset is a clone (the previous snapshot's
  * ds_next_snap_obj does not point back at it), nothing is queued.
  *
  * Returns 0 on success, or the error from dsl_dataset_hold_obj().
  */
 static int
 enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 {
 	(void) arg;
 	dsl_scan_t *scn = dp->dp_scan;
 	dsl_dataset_t *ds;
 	int err;
 
 	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
 	if (err != 0)
 		return (err);
 
 	for (;;) {
 		uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 		dsl_dataset_t *prev;
 		int is_clone;
 
 		if (prev_obj == 0)
 			break;
 
 		err = dsl_dataset_hold_obj(dp, prev_obj, FTAG, &prev);
 		if (err != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			return (err);
 		}
 
 		/*
 		 * If this is a clone, we don't need to worry about it for
 		 * now.
 		 */
 		is_clone = (dsl_dataset_phys(prev)->ds_next_snap_obj !=
 		    ds->ds_object);
 		dsl_dataset_rele(ds, FTAG);
 		if (is_clone) {
 			dsl_dataset_rele(prev, FTAG);
 			return (0);
 		}
 		ds = prev;
 	}
 
 	mutex_enter(&scn->scn_queue_lock);
 	scan_ds_queue_insert(scn, ds->ds_object,
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg);
 	mutex_exit(&scn->scn_queue_lock);
 
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 /*
  * Run the current scan function on each physical variant of one DDT
  * entry.  tx is unused.
  *
  * Nothing is done unless a scan is running and scn_done_txg is still 0.
  * Variants whose physical birth is 0 or after scn_max_txg are skipped.
  * For every other variant a block pointer is built with ddt_bp_create()
  * and passed, with an all-zero bookmark, to the function selected by
  * scn_phys.scn_func.
  */
 void
 dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
     ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
 {
 	(void) tx;
 	const ddt_key_t *ddk = &ddlwe->ddlwe_key;
 	zbookmark_phys_t zb = { 0 };
 	blkptr_t bp;
 
 	/*
 	 * This function is special because it is the only thing that can
 	 * add scan_io_t's to the vdev scan queues from outside
 	 * dsl_scan_sync().  For the most part this is ok as long as it is
 	 * called from within syncing context.  However, dsl_scan_sync()
 	 * expects that no new sio's will be added between when all the
 	 * work for a scan is done and the next txg when the scan is
 	 * actually marked as completed.  The scn_done_txg check ensures we
 	 * do not issue new sio's during this period.
 	 */
 	if (!dsl_scan_is_running(scn) || scn->scn_done_txg != 0)
 		return;
 
 	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
 		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
 		uint64_t birth = ddt_phys_birth(&ddlwe->ddlwe_phys, v);
 
 		if (birth != 0 && birth <= scn->scn_phys.scn_max_txg) {
 			ddt_bp_create(checksum, ddk, &ddlwe->ddlwe_phys,
 			    v, &bp);
 
 			scn->scn_visited_this_txg++;
 			scan_funcs[scn->scn_phys.scn_func](scn->scn_dp,
 			    &bp, &zb);
 		}
 	}
 }
 
 /*
  * Scrub/dedup interaction.
  *
  * If there are N references to a deduped block, we don't want to scrub it
  * N times -- ideally, we should scrub it exactly once.
  *
  * We leverage the fact that the dde's replication class (ddt_class_t)
  * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
  * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
  *
  * To prevent excess scrubbing, the scrub begins by walking the DDT
  * to find all blocks with refcnt > 1, and scrubs each of these once.
  * Since there are two replication classes which contain blocks with
  * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
  * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
  *
  * There would be nothing more to say if a block's refcnt couldn't change
  * during a scrub, but of course it can so we must account for changes
  * in a block's replication class.
  *
  * Here's an example of what can occur:
  *
  * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
  * when visited during the top-down scrub phase, it will be scrubbed twice.
  * This negates our scrub optimization, but is otherwise harmless.
  *
  * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
  * on each visit during the top-down scrub phase, it will never be scrubbed.
  * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
  * reference class transitions to a higher level (i.e. DDT_CLASS_UNIQUE to
  * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
  * while a scrub is in progress, it scrubs the block right then.
  */
 static void
 dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
 	ddt_lightweight_entry_t ddlwe = {0};
 	uint64_t visited = 0;
 	int error;
 
 	/*
 	 * Walk the DDT from the persistent bookmark until the walk fails,
 	 * the bookmark moves past the highest class we were asked to scan,
 	 * or the scan decides to suspend.
 	 */
 	for (;;) {
 		ddt_t *ddt;
 
 		error = ddt_walk(scn->scn_dp->dp_spa, ddb, &ddlwe);
 		if (error != 0)
 			break;
 
 		if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
 			break;
 
 		dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
 		    (longlong_t)ddb->ddb_class,
 		    (longlong_t)ddb->ddb_type,
 		    (longlong_t)ddb->ddb_checksum,
 		    (longlong_t)ddb->ddb_cursor);
 
 		/* There should be no pending changes to the dedup table */
 		ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
 		ASSERT(avl_first(&ddt->ddt_tree) == NULL);
 
 		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, ddt, &ddlwe, tx);
 		visited++;
 
 		if (dsl_scan_check_suspend(scn, NULL))
 			break;
 	}
 
 	if (error != EAGAIN) {
 		zfs_dbgmsg("scanned %llu ddt entries on %s with "
 		    "class_max = %u; suspending=%u", (longlong_t)visited,
 		    scn->scn_dp->dp_spa->spa_name,
 		    (int)scn->scn_phys.scn_ddt_class_max,
 		    (int)scn->scn_suspending);
 	} else {
 		/* EAGAIN from the walk is not treated as a failure. */
 		dsl_scan_check_suspend(scn, NULL);
 		error = 0;
 
 		zfs_dbgmsg("waiting for ddt to become ready for scan "
 		    "on %s with class_max = %u; suspending=%u",
 		    scn->scn_dp->dp_spa->spa_name,
 		    (int)scn->scn_phys.scn_ddt_class_max,
 		    (int)scn->scn_suspending);
 	}
 
 	ASSERT(error == 0 || error == ENOENT);
 	ASSERT(error != ENOENT ||
 	    ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
 }
 
 /*
  * Return the highest txg the scan should consider for dataset ds: the
  * scan-wide scn_max_txg, additionally capped at the dataset's creation txg
  * when ds is a snapshot.
  */
 static uint64_t
 dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
 {
 	uint64_t scan_max = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
 
 	if (!ds->ds_is_snapshot)
 		return (scan_max);
 
 	return (MIN(scan_max, dsl_dataset_phys(ds)->ds_creation_txg));
 }
 
 /*
  * Run the traversal part of a scan for the current txg. In order:
  *   1. the DDT walk, while the DDT bookmark is still within
  *      scn_ddt_class_max;
  *   2. either the meta objset (when the bookmark's objset is
  *      DMU_META_OBJSET) or the dataset named by the bookmark, unless the
  *      bookmark's objset is ZB_DESTROYED_OBJSET;
  *   3. every dataset remaining on the in-core dataset queue.
  * Returns early, leaving state as it is, whenever scn_suspending is set
  * after a step. When the queue has been drained the bookmark's objset is
  * set to ZB_DESTROYED_OBJSET.
  */
 static void
 dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	scan_ds_t *sds;
 	dsl_pool_t *dp = scn->scn_dp;
 
 	/* DDT phase: scanned over the scan's full [min, max] txg range. */
 	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
 	    scn->scn_phys.scn_ddt_class_max) {
 		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
 		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
 		dsl_scan_ddt(scn, tx);
 		if (scn->scn_suspending)
 			return;
 	}
 
 	if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
 		/* First do the MOS & ORIGIN */
 
 		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
 		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
 		dsl_scan_visit_rootbp(scn, NULL,
 		    &dp->dp_meta_rootbp, tx);
 		if (scn->scn_suspending)
 			return;
 
 		/*
 		 * Pools older than SPA_VERSION_DSL_SCRUB get every dataset
 		 * enqueued through enqueue_cb(); newer pools start from the
 		 * origin snapshot instead.
 		 */
 		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
 			VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 			    enqueue_cb, NULL, DS_FIND_CHILDREN));
 		} else {
 			dsl_scan_visitds(scn,
 			    dp->dp_origin_snap->ds_object, tx);
 		}
 		ASSERT(!scn->scn_suspending);
 	} else if (scn->scn_phys.scn_bookmark.zb_objset !=
 	    ZB_DESTROYED_OBJSET) {
 		uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset;
 		/*
 		 * If we were suspended, continue from here. Note if the
 		 * ds we were suspended on was deleted, the zb_objset may
 		 * be -1, so we will skip this and find a new objset
 		 * below.
 		 */
 		dsl_scan_visitds(scn, dsobj, tx);
 		if (scn->scn_suspending)
 			return;
 	}
 
 	/*
 	 * In case we suspended right at the end of the ds, zero the
 	 * bookmark so we don't think that we're still trying to resume.
 	 */
 	memset(&scn->scn_phys.scn_bookmark, 0, sizeof (zbookmark_phys_t));
 
 	/*
 	 * Keep pulling things out of the dataset avl queue. Updates to the
 	 * persistent zap-object-as-queue happen only at checkpoints.
 	 */
 	while ((sds = avl_first(&scn->scn_queue)) != NULL) {
 		dsl_dataset_t *ds;
 		/* Copy the fields out before the entry is removed below. */
 		uint64_t dsobj = sds->sds_dsobj;
 		uint64_t txg = sds->sds_txg;
 
 		/* dequeue and free the ds from the queue */
 		scan_ds_queue_remove(scn, dsobj);
 		sds = NULL;
 
 		/* set up min / max txg */
 		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 		/*
 		 * A non-zero queued txg is used as the lower bound; otherwise
 		 * fall back to the dataset's ds_prev_snap_txg. Either way the
 		 * result is never below the scan's scn_min_txg.
 		 */
 		if (txg != 0) {
 			scn->scn_phys.scn_cur_min_txg =
 			    MAX(scn->scn_phys.scn_min_txg, txg);
 		} else {
 			scn->scn_phys.scn_cur_min_txg =
 			    MAX(scn->scn_phys.scn_min_txg,
 			    dsl_dataset_phys(ds)->ds_prev_snap_txg);
 		}
 		scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
 		dsl_dataset_rele(ds, FTAG);
 
 		dsl_scan_visitds(scn, dsobj, tx);
 		if (scn->scn_suspending)
 			return;
 	}
 
 	/* No more objsets to fetch, we're done */
 	scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET;
 	ASSERT0(scn->scn_suspending);
 }
 
 /*
  * Sum, over every top-level vdev that is not a log, spare or l2cache
  * device, the number of disks minus the number of parity disks as reported
  * by vdev_get_ndisks() and vdev_get_nparity().
  */
 static uint64_t
 dsl_scan_count_data_disks(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	uint64_t data_disks = 0;
 
 	for (uint64_t c = 0; c < root->vdev_children; c++) {
 		vdev_t *tvd = root->vdev_child[c];
 
 		if (!(tvd->vdev_islog || tvd->vdev_isspare ||
 		    tvd->vdev_isl2cache)) {
 			data_disks += vdev_get_ndisks(tvd) -
 			    vdev_get_nparity(tvd);
 		}
 	}
 
 	return (data_disks);
 }
 
 /*
  * Account one issued block pointer in the queue's per-txg statistics: bump
  * the zio count and add the allocated sizes of all of the bp's DVAs to the
  * running byte total.
  */
 static void
 scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp)
 {
 	uint64_t asize_sum = 0;
 	int d;
 
 	for (d = 0; d < BP_GET_NDVAS(bp); d++)
 		asize_sum += DVA_GET_ASIZE(&bp->blk_dva[d]);
 
 	q->q_zios_this_txg++;
 	q->q_total_zio_size_this_txg += asize_sum;
 }
 
 /*
  * Account one processed segment [start, end) in the queue's per-txg
  * statistics: bump the segment count and add its length to the byte total.
  */
 static void
 scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start,
     uint64_t end)
 {
 	uint64_t seg_len = end - start;
 
 	q->q_segs_this_txg++;
 	q->q_total_seg_size_this_txg += seg_len;
 }
 
 /*
  * Decide whether queue issuing should stop for this txg. Returns B_TRUE
  * when the pool is shutting down, or when the scan has already run longer
  * than its minimum time (resilver or scrub tunable, depending on the scan
  * function) and any of the following holds: dirty data has reached the
  * async-write minimum threshold, a txg sync is waiting, or the current
  * sync has lasted at least zfs_txg_timeout seconds.
  */
 static boolean_t
 scan_io_queue_check_suspend(dsl_scan_t *scn)
 {
 	/* See comment in dsl_scan_check_suspend() */
 	uint64_t now_ns = gethrtime();
 	uint64_t scan_ns = now_ns - scn->scn_sync_start_time;
 	uint64_t sync_ns = now_ns - scn->scn_dp->dp_spa->spa_sync_starttime;
 	uint64_t dirty_floor = zfs_dirty_data_max *
 	    zfs_vdev_async_write_active_min_dirty_percent / 100;
 	uint_t min_ms;
 
 	if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER)
 		min_ms = zfs_resilver_min_time_ms;
 	else
 		min_ms = zfs_scrub_min_time_ms;
 
 	if (NSEC2MSEC(scan_ns) > min_ms) {
 		if (scn->scn_dp->dp_dirty_total >= dirty_floor)
 			return (B_TRUE);
 		if (txg_sync_waiting(scn->scn_dp))
 			return (B_TRUE);
 		if (NSEC2SEC(sync_ns) >= zfs_txg_timeout)
 			return (B_TRUE);
 	}
 
 	return (spa_shutting_down(scn->scn_dp->dp_spa) ? B_TRUE : B_FALSE);
 }
 
 /*
  * Issue the scan_io_t's on io_list to disk, front to back. Each sio that
  * is issued is removed from the list and freed. Before every sio the
  * suspend condition is re-checked; if it fires we stop and return B_TRUE,
  * leaving the unissued sios on io_list for the caller. Returns B_FALSE
  * when the whole list was consumed.
  */
 static boolean_t
 scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
 {
 	dsl_scan_t *scn = queue->q_scn;
 	scan_io_t *sio;
 
 	for (sio = list_head(io_list); sio != NULL;
 	    sio = list_head(io_list)) {
 		blkptr_t bp;
 
 		if (scan_io_queue_check_suspend(scn))
 			return (B_TRUE);
 
 		/* Rebuild the block pointer and send the I/O on its way. */
 		sio2bp(sio, &bp);
 		scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
 		    &sio->sio_zb, queue);
 
 		(void) list_remove_head(io_list);
 		scan_io_queues_update_zio_stats(queue, &bp);
 		sio_free(sio);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * This function removes sios from an IO queue which reside within a given
  * zfs_range_seg_t and inserts them (in offset order) into a list. Note that
  * we only ever return a maximum of 32 sios at once. If there are more sios
  * to process within this segment that did not make it onto the list we
  * return B_TRUE and otherwise B_FALSE.
  *
  * On a B_TRUE return the segment is shrunk so that it starts at the first
  * sio left behind and q_last_ext_addr records that offset. On a B_FALSE
  * return the segment is removed from the range tree and q_last_ext_addr is
  * set to -1. The caller must hold the vdev's scan io queue lock.
  */
 static boolean_t
 scan_io_queue_gather(dsl_scan_io_queue_t *queue, zfs_range_seg_t *rs,
     list_t *list)
 {
 	/* Upper bound on the number of sios moved onto the list per call. */
 	const uint_t max_sios = 32;
 	scan_io_t *srch_sio, *sio, *next_sio;
 	avl_index_t idx;
 	uint_t num_sios = 0;
 	int64_t bytes_issued = 0;
 
 	ASSERT(rs != NULL);
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 
 	/* Temporary search key positioned at the start of the segment. */
 	srch_sio = sio_alloc(1);
 	srch_sio->sio_nr_dvas = 1;
 	SIO_SET_OFFSET(srch_sio, zfs_rs_get_start(rs, queue->q_exts_by_addr));
 
 	/*
 	 * The exact start of the extent might not contain any matching zios,
 	 * so if that's the case, examine the next one in the tree.
 	 */
 	sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
 	sio_free(srch_sio);
 
 	if (sio == NULL)
 		sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER);
 
 	/*
 	 * Stop at the documented batch limit: the previous "<= 32" test
 	 * let a 33rd sio through.
 	 */
 	while (sio != NULL && SIO_GET_OFFSET(sio) < zfs_rs_get_end(rs,
 	    queue->q_exts_by_addr) && num_sios < max_sios) {
 		ASSERT3U(SIO_GET_OFFSET(sio), >=, zfs_rs_get_start(rs,
 		    queue->q_exts_by_addr));
 		ASSERT3U(SIO_GET_END_OFFSET(sio), <=, zfs_rs_get_end(rs,
 		    queue->q_exts_by_addr));
 
 		/* Fetch the successor before unlinking the current node. */
 		next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
 		avl_remove(&queue->q_sios_by_addr, sio);
 		/* The queue just became empty: one fewer pending queue. */
 		if (avl_is_empty(&queue->q_sios_by_addr))
 			atomic_add_64(&queue->q_scn->scn_queues_pending, -1);
 		queue->q_sio_memused -= SIO_GET_MUSED(sio);
 
 		bytes_issued += SIO_GET_ASIZE(sio);
 		num_sios++;
 		list_insert_tail(list, sio);
 		sio = next_sio;
 	}
 
 	/*
 	 * We limit the number of sios we process at once to 32 to avoid
 	 * biting off more than we can chew. If we didn't take everything
 	 * in the segment we update it to reflect the work we were able to
 	 * complete. Otherwise, we remove it from the range tree entirely.
 	 */
 	if (sio != NULL && SIO_GET_OFFSET(sio) < zfs_rs_get_end(rs,
 	    queue->q_exts_by_addr)) {
 		zfs_range_tree_adjust_fill(queue->q_exts_by_addr, rs,
 		    -bytes_issued);
 		zfs_range_tree_resize_segment(queue->q_exts_by_addr, rs,
 		    SIO_GET_OFFSET(sio), zfs_rs_get_end(rs,
 		    queue->q_exts_by_addr) - SIO_GET_OFFSET(sio));
 		queue->q_last_ext_addr = SIO_GET_OFFSET(sio);
 		return (B_TRUE);
 	} else {
 		uint64_t rstart = zfs_rs_get_start(rs, queue->q_exts_by_addr);
 		uint64_t rend = zfs_rs_get_end(rs, queue->q_exts_by_addr);
 		zfs_range_tree_remove(queue->q_exts_by_addr, rstart, rend -
 		    rstart);
 		queue->q_last_ext_addr = -1;
 		return (B_FALSE);
 	}
 }
 
 /*
  * Called from the queue emptying thread to choose the next extent to
  * issue I/Os from. What is returned depends on the scan state and on the
  * zfs_scan_issue_strategy tunable:
  * 1) NULL when the scan is neither checkpointing nor clearing.
  * 2) The first extent in LBA order when the strategy is 1, or when the
  *    strategy is below 1 and the scan is checkpointing.
  * 3) Otherwise the extent we were last working on, if part of it is
  *    still in the range tree, else the first extent of the by-size tree
  *    (NULL if that tree is empty).
  */
 static zfs_range_seg_t *
 scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
 {
 	dsl_scan_t *scn = queue->q_scn;
 	zfs_range_tree_t *rt = queue->q_exts_by_addr;
 	zfs_range_seg_t *found;
 	uint64_t probe_len, addr;
 	uint64_t *best;
 
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 	ASSERT(scn->scn_is_sorted);
 
 	if (!(scn->scn_checkpointing || scn->scn_clearing))
 		return (NULL);
 
 	/*
 	 * During normal clearing, we want to issue our largest segments
 	 * first, keeping IO as sequential as possible, and leaving the
 	 * smaller extents for later with the hope that they might eventually
 	 * grow to larger sequential segments. However, when the scan is
 	 * checkpointing, no new extents will be added to the sorting queue,
 	 * so the way we are sorted now is as good as it will ever get.
 	 * In this case, we instead switch to issuing extents in LBA order.
 	 */
 	if ((zfs_scan_issue_strategy < 1 && scn->scn_checkpointing) ||
 	    zfs_scan_issue_strategy == 1)
 		return (zfs_range_tree_first(rt));
 
 	/* Smallest lookup length the range tree can represent. */
 	probe_len = 1ULL << rt->rt_shift;
 
 	/*
 	 * Try to continue previous extent if it is not completed yet.  After
 	 * shrink in scan_io_queue_gather() it may no longer be the best, but
 	 * otherwise we leave shorter remnant every txg.
 	 */
 	if (queue->q_last_ext_addr != -1) {
 		found = zfs_range_tree_find(rt, queue->q_last_ext_addr,
 		    probe_len);
 		if (found != NULL)
 			return (found);
 	}
 
 	/*
 	 * Nothing to continue, so find new best extent.
 	 */
 	best = zfs_btree_first(&queue->q_exts_by_size, NULL);
 	if (best == NULL)
 		return (NULL);
 
 	addr = *best << rt->rt_shift;
 	queue->q_last_ext_addr = addr;
 
 	/*
 	 * We need to get the original entry in the by_addr tree so we can
 	 * modify it.
 	 */
 	found = zfs_range_tree_find(rt, addr, probe_len);
 	ASSERT3P(found, !=, NULL);
 	ASSERT3U(zfs_rs_get_start(found, rt), ==, addr);
 	ASSERT3U(zfs_rs_get_end(found, rt), >, addr);
 	return (found);
 }
 
 /*
  * Taskq callback that empties one vdev's scan I/O queue (arg is the
  * dsl_scan_io_queue_t). It creates a null zio parented to the scan's root
  * zio, publishes it in q_zio for the duration of the run, and then
  * repeatedly picks an extent, gathers its sios in batches and issues them.
  * The queue lock is held throughout except while a batch is being issued.
  * If issuing is suspended, sios still on the local list are put back on
  * the queue before returning.
  */
 static void
 scan_io_queues_run_one(void *arg)
 {
 	dsl_scan_io_queue_t *queue = arg;
 	kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
 	boolean_t suspended = B_FALSE;
 	zfs_range_seg_t *rs;
 	scan_io_t *sio;
 	zio_t *zio;
 	list_t sio_list;
 
 	ASSERT(queue->q_scn->scn_is_sorted);
 
 	list_create(&sio_list, sizeof (scan_io_t),
 	    offsetof(scan_io_t, sio_nodes.sio_list_node));
 	zio = zio_null(queue->q_scn->scn_zio_root, queue->q_scn->scn_dp->dp_spa,
 	    NULL, NULL, NULL, ZIO_FLAG_CANFAIL);
 	mutex_enter(q_lock);
 	queue->q_zio = zio;
 
 	/* Calculate maximum in-flight bytes for this vdev. */
 	queue->q_maxinflight_bytes = MAX(1, zfs_scan_vdev_limit *
 	    (vdev_get_ndisks(queue->q_vd) - vdev_get_nparity(queue->q_vd)));
 
 	/* reset per-queue scan statistics for this txg */
 	queue->q_total_seg_size_this_txg = 0;
 	queue->q_segs_this_txg = 0;
 	queue->q_total_zio_size_this_txg = 0;
 	queue->q_zios_this_txg = 0;
 
 	/* loop until we run out of time or sios */
 	while ((rs = scan_io_queue_fetch_ext(queue)) != NULL) {
 		uint64_t seg_start = 0, seg_end = 0;
 		boolean_t more_left;
 
 		ASSERT(list_is_empty(&sio_list));
 
 		/* loop while we still have sios left to process in this rs */
 		do {
 			scan_io_t *first_sio, *last_sio;
 
 			/*
 			 * We have selected which extent needs to be
 			 * processed next. Gather up the corresponding sios.
 			 */
 			more_left = scan_io_queue_gather(queue, rs, &sio_list);
 			ASSERT(!list_is_empty(&sio_list));
 			first_sio = list_head(&sio_list);
 			last_sio = list_tail(&sio_list);
 
 			/*
 			 * seg_end tracks the latest batch; seg_start is only
 			 * taken from the first batch of this extent.
 			 * NOTE(review): 0 doubles as the "not yet set" value,
 			 * so if the first sio sits at offset 0 a later batch
 			 * overwrites seg_start. Only the debug statistics
 			 * below consume these values.
 			 */
 			seg_end = SIO_GET_END_OFFSET(last_sio);
 			if (seg_start == 0)
 				seg_start = SIO_GET_OFFSET(first_sio);
 
 			/*
 			 * Issuing sios can take a long time so drop the
 			 * queue lock. The sio queue won't be updated by
 			 * other threads since we're in syncing context so
 			 * we can be sure that our trees will remain exactly
 			 * as we left them.
 			 */
 			mutex_exit(q_lock);
 			suspended = scan_io_queue_issue(queue, &sio_list);
 			mutex_enter(q_lock);
 
 			if (suspended)
 				break;
 		} while (more_left);
 
 		/* update statistics for debugging purposes */
 		scan_io_queues_update_seg_stats(queue, seg_start, seg_end);
 
 		if (suspended)
 			break;
 	}
 
 	/*
 	 * If we were suspended in the middle of processing,
 	 * requeue any unfinished sios and exit.
 	 */
 	while ((sio = list_remove_head(&sio_list)) != NULL)
 		scan_io_queue_insert_impl(queue, sio);
 
 	/* Unpublish the per-run zio before releasing the lock. */
 	queue->q_zio = NULL;
 	mutex_exit(q_lock);
 	zio_nowait(zio);
 	list_destroy(&sio_list);
 }
 
 /*
  * Run one emptying pass over every scan queue in the pool by dispatching
  * scan_io_queues_run_one() to a taskq once per top-level vdev that has a
  * scan queue. Each task handles only its own vdev's queue, which is what
  * makes running them in parallel safe.
  *
  * Returns immediately when no queue has pending work. Otherwise it blocks
  * until all dispatched tasks have finished, and must therefore be called
  * from dsl_scan_sync (or in general, syncing context).
  */
 static void
 scan_io_queues_run(dsl_scan_t *scn)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 	vdev_t *rvd;
 
 	ASSERT(scn->scn_is_sorted);
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 
 	if (scn->scn_queues_pending == 0)
 		return;
 
 	rvd = spa->spa_root_vdev;
 
 	/* The taskq is created lazily, on the first pass that needs it. */
 	if (scn->scn_taskq == NULL) {
 		int nthreads = rvd->vdev_children;
 
 		/*
 		 * We need to make this taskq *always* execute as many
 		 * threads in parallel as we have top-level vdevs and no
 		 * less, otherwise strange serialization of the calls to
 		 * scan_io_queues_run_one can occur during spa_sync runs
 		 * and that significantly impacts performance.
 		 */
 		scn->scn_taskq = taskq_create("dsl_scan_iss", nthreads,
 		    minclsyspri, nthreads, nthreads, TASKQ_PREPOPULATE);
 	}
 
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 
 		mutex_enter(&tvd->vdev_scan_io_queue_lock);
 		if (tvd->vdev_scan_io_queue != NULL) {
 			VERIFY(taskq_dispatch(scn->scn_taskq,
 			    scan_io_queues_run_one, tvd->vdev_scan_io_queue,
 			    TQ_SLEEP) != TASKQID_INVALID);
 		}
 		mutex_exit(&tvd->vdev_scan_io_queue_lock);
 	}
 
 	/*
 	 * Wait for the queues to finish issuing their IOs for this run
 	 * before we return. There may still be IOs in flight at this
 	 * point.
 	 */
 	taskq_wait(scn->scn_taskq);
 }
 
 /*
  * Decide whether async block processing (frees / obsolete marking) should
  * stop for this txg. Never pauses when zfs_recover is set. Otherwise
  * pauses when a configured per-txg block or dedup-free limit has been
  * reached, when more than zfs_txg_timeout seconds have elapsed since the
  * scan's sync start time, when the minimum time has elapsed and a txg
  * sync is waiting, or when the pool is shutting down.
  */
 static boolean_t
 dsl_scan_async_block_should_pause(dsl_scan_t *scn)
 {
 	uint64_t elapsed_ns;
 
 	if (zfs_recover)
 		return (B_FALSE);
 
 	/* A limit of zero means "no limit". */
 	if (zfs_async_block_max_blocks != 0 &&
 	    scn->scn_visited_this_txg >= zfs_async_block_max_blocks)
 		return (B_TRUE);
 
 	if (zfs_max_async_dedup_frees != 0 &&
 	    scn->scn_dedup_frees_this_txg >= zfs_max_async_dedup_frees)
 		return (B_TRUE);
 
 	elapsed_ns = gethrtime() - scn->scn_sync_start_time;
 
 	if (elapsed_ns / NANOSEC > zfs_txg_timeout)
 		return (B_TRUE);
 
 	if (NSEC2MSEC(elapsed_ns) > scn->scn_async_block_min_time_ms &&
 	    txg_sync_waiting(scn->scn_dp))
 		return (B_TRUE);
 
 	return (spa_shutting_down(scn->scn_dp->dp_spa) ? B_TRUE : B_FALSE);
 }
 
 /*
  * Per-block callback used while draining the free bpobj and the bptree
  * (arg is the dsl_scan_t). Frees bp in the tx's txg under the scan's root
  * zio, removes its space from the pool's free dir accounting, and bumps
  * the per-txg visited / dedup-free counters. Returns ERESTART without
  * freeing when it is time to pause; when iterating a bptree the pause
  * check is made only for level-0 blocks that are not objset blocks.
  */
 static int
 dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = arg;
 	int pause_allowed;
 
 	pause_allowed = !scn->scn_is_bptree ||
 	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET);
 	if (pause_allowed && dsl_scan_async_block_should_pause(scn))
 		return (SET_ERROR(ERESTART));
 
 	zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
 	    dmu_tx_get_txg(tx), bp, 0));
 
 	/* Give the freed space back in the free dir's accounting. */
 	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
 	    -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
 	    -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
 
 	scn->scn_visited_this_txg++;
 	if (BP_GET_DEDUP(bp))
 		scn->scn_dedup_frees_this_txg++;
 
 	return (0);
 }
 
 /*
  * Fold the per-queue, per-txg segment and zio statistics of every
  * top-level vdev into the scan-wide fields: total counts plus average
  * segment and zio sizes. If either total count is zero, all four
  * scan-wide fields are reset to zero.
  */
 static void
 dsl_scan_update_stats(dsl_scan_t *scn)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t seg_bytes = 0, seg_cnt = 0;
 	uint64_t zio_bytes = 0, zio_cnt = 0;
 
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		dsl_scan_io_queue_t *q = tvd->vdev_scan_io_queue;
 
 		if (q != NULL) {
 			seg_bytes += q->q_total_seg_size_this_txg;
 			zio_bytes += q->q_total_zio_size_this_txg;
 			seg_cnt += q->q_segs_this_txg;
 			zio_cnt += q->q_zios_this_txg;
 		}
 	}
 
 	if (seg_cnt != 0 && zio_cnt != 0) {
 		scn->scn_avg_seg_size_this_txg = seg_bytes / seg_cnt;
 		scn->scn_avg_zio_size_this_txg = zio_bytes / zio_cnt;
 		scn->scn_segs_this_txg = seg_cnt;
 		scn->scn_zios_this_txg = zio_cnt;
 		return;
 	}
 
 	/* Nothing usable this txg; avoid dividing by zero. */
 	scn->scn_avg_seg_size_this_txg = 0;
 	scn->scn_avg_zio_size_this_txg = 0;
 	scn->scn_segs_this_txg = 0;
 	scn->scn_zios_this_txg = 0;
 }
 
 /*
  * Adapter that lets dsl_scan_free_block_cb() be used as a bpobj_iterate()
  * callback, whose signature carries an extra bp_freed flag. The flag is
  * asserted to be false and otherwise ignored; the return value is passed
  * through unchanged.
  */
 static int
 bpobj_dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	ASSERT(!bp_freed);
 	return (dsl_scan_free_block_cb(arg, bp, tx));
 }
 
 /*
  * bpobj_iterate() callback for the obsolete bpobj (arg is the dsl_scan_t).
  * Marks the range described by the bp's first DVA as obsolete on its
  * indirect vdev and counts the block as visited. Returns ERESTART, doing
  * nothing else, when it is time to pause. bp_freed is expected to be
  * false.
  */
 static int
 dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = arg;
 	const dva_t *dva;
 
 	ASSERT(!bp_freed);
 
 	if (dsl_scan_async_block_should_pause(scn))
 		return (SET_ERROR(ERESTART));
 
 	dva = &bp->blk_dva[0];
 	spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa,
 	    DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva),
 	    DVA_GET_ASIZE(dva), tx);
 
 	scn->scn_visited_this_txg++;
 	return (0);
 }
 
 /*
  * Report whether the scan code has work to do. Always B_FALSE while the
  * pool is loading or shutting down. B_TRUE when a scan is running and is
  * not a paused scrub, or when an async destroy is in progress and not
  * stalled. Otherwise B_TRUE only if the free bpobj still holds space
  * (checked on pools with deadlist support) or livelist deletions remain.
  */
 boolean_t
 dsl_scan_active(dsl_scan_t *scn)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 	uint64_t used = 0, comp, uncomp;
 	boolean_t clones_left;
 
 	if (spa->spa_load_state != SPA_LOAD_NONE || spa_shutting_down(spa))
 		return (B_FALSE);
 
 	if (dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn))
 		return (B_TRUE);
 	if (scn->scn_async_destroying && !scn->scn_async_stalled)
 		return (B_TRUE);
 
 	/* "used" stays 0 when the pool predates deadlists. */
 	if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 		(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
 		    &used, &comp, &uncomp);
 	}
 
 	clones_left = spa_livelist_delete_check(spa);
 	return (used != 0 || clones_left);
 }
 
 /*
  * Report whether an error scrub is in progress. Always B_FALSE while the
  * pool is loading or shutting down; otherwise reflects
  * dsl_errorscrubbing() for the scan's pool.
  */
 boolean_t
 dsl_errorscrub_active(dsl_scan_t *scn)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 
 	if (spa->spa_load_state != SPA_LOAD_NONE || spa_shutting_down(spa))
 		return (B_FALSE);
 
 	return (dsl_errorscrubbing(scn->scn_dp) ? B_TRUE : B_FALSE);
 }
 
 /*
  * Recursively check the vdev tree rooted at vd. Returns B_TRUE if any
  * concrete, non-aux leaf vdev in it does not have vdev_resilver_deferred
  * set, and B_FALSE otherwise.
  */
 static boolean_t
 dsl_scan_check_deferred(vdev_t *vd)
 {
 	boolean_t found = B_FALSE;
 
 	/* Visit every child; no early exit. */
 	for (int i = 0; i < vd->vdev_children; i++) {
 		if (dsl_scan_check_deferred(vd->vdev_child[i]))
 			found = B_TRUE;
 	}
 
 	/* Only concrete, non-aux leaves contribute their own state. */
 	if (vdev_is_concrete(vd) && !vd->vdev_aux &&
 	    vd->vdev_ops->vdev_op_leaf && !vd->vdev_resilver_deferred)
 		found = B_TRUE;
 
 	return (found);
 }
 
 /*
  * Decide whether a resilver I/O must be issued for the given DVA.
  * Indirect vdevs and gang blocks always answer B_TRUE. Otherwise the
  * answer is B_TRUE only when the top-level vdev's DTL says this range
  * needs resilvering and the vdev has a leaf whose resilver is not
  * deferred.
  */
 static boolean_t
 dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
     uint64_t phys_birth)
 {
 	vdev_t *tvd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
 
 	/*
 	 * The indirect vdev can point to multiple vdevs.  For
 	 * simplicity, always create the resilver zio_t.
 	 * zio_vdev_io_start() will bypass the child resilver i/o's
 	 * if they are on vdevs that don't have DTL's.
 	 */
 	if (tvd->vdev_ops == &vdev_indirect_ops)
 		return (B_TRUE);
 
 	/*
 	 * Gang members may be spread across multiple vdevs, so the
 	 * best estimate we have is the scrub range, which has already
 	 * been checked.
 	 * XXX -- it would be better to change our allocation policy
 	 * to ensure that all gang members reside on the same vdev.
 	 */
 	if (DVA_GET_GANG(dva))
 		return (B_TRUE);
 
 	/*
 	 * First check whether the top-level vdev must resilver this
 	 * offset: when the offset does not intersect with a dirty leaf
 	 * DTL it may be possible to skip the resilver IO.  The psize is
 	 * provided instead of asize to simplify the check for RAIDZ.
 	 * Then check that the top-level vdev has a device under it which
 	 * is resilvering and is not deferred.
 	 */
 	return (vdev_dtl_need_resilver(tvd, dva, psize, phys_birth) &&
 	    dsl_scan_check_deferred(tvd));
 }
 
 /*
  * Perform this txg's share of background freeing for the pool:
  *   1. drain the pool's free bpobj (when enabled and the pool version
  *      supports deadlists);
  *   2. continue async destroys from the bptree while the ASYNC_DESTROY
  *      feature is active, retiring the bptree once it is empty;
  *   3. if anything was freed, log it and sync the DDT and BRT;
  *   4. optionally move space still charged to the free dir over to the
  *      leak dir once destroying has finished;
  *   5. process the obsolete bpobj, destroying it once empty.
  * Does nothing and returns 0 while async destroy is suspended for the spa.
  * Returns the error (e.g. ERESTART) from steps 1-2 if there was one;
  * errors from step 5 are not propagated and 0 is returned.
  */
 static int
 dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = dp->dp_scan;
 	spa_t *spa = dp->dp_spa;
 	int err = 0;
 
 	if (spa_suspend_async_destroy(spa))
 		return (0);
 
 	/* Step 1: free blocks queued on the pool-wide free bpobj. */
 	if (zfs_free_bpobj_enabled &&
 	    spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 		scn->scn_is_bptree = B_FALSE;
 		scn->scn_async_block_min_time_ms = zfs_free_min_time_ms;
 		scn->scn_zio_root = zio_root(spa, NULL,
 		    NULL, ZIO_FLAG_MUSTSUCCEED);
 		err = bpobj_iterate(&dp->dp_free_bpobj,
 		    bpobj_dsl_scan_free_block_cb, scn, tx);
 		VERIFY0(zio_wait(scn->scn_zio_root));
 		scn->scn_zio_root = NULL;
 
 		if (err != 0 && err != ERESTART)
 			zfs_panic_recover("error %u from bpobj_iterate()", err);
 	}
 
 	/* Step 2: only reached when step 1 did not pause or fail. */
 	if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
 		ASSERT(scn->scn_async_destroying);
 		scn->scn_is_bptree = B_TRUE;
 		scn->scn_zio_root = zio_root(spa, NULL,
 		    NULL, ZIO_FLAG_MUSTSUCCEED);
 		err = bptree_iterate(dp->dp_meta_objset,
 		    dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
 		VERIFY0(zio_wait(scn->scn_zio_root));
 		scn->scn_zio_root = NULL;
 
 		/*
 		 * EIO and ECKSUM from the bptree walk are swallowed here.
 		 * NOTE(review): the panic text below names
 		 * traverse_dataset_destroyed(), but the call above is
 		 * bptree_iterate() -- confirm whether the message is stale.
 		 */
 		if (err == EIO || err == ECKSUM) {
 			err = 0;
 		} else if (err != 0 && err != ERESTART) {
 			zfs_panic_recover("error %u from "
 			    "traverse_dataset_destroyed()", err);
 		}
 
 		if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
 			/* finished; deactivate async destroy feature */
 			spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
 			ASSERT(!spa_feature_is_active(spa,
 			    SPA_FEATURE_ASYNC_DESTROY));
 			VERIFY0(zap_remove(dp->dp_meta_objset,
 			    DMU_POOL_DIRECTORY_OBJECT,
 			    DMU_POOL_BPTREE_OBJ, tx));
 			VERIFY0(bptree_free(dp->dp_meta_objset,
 			    dp->dp_bptree_obj, tx));
 			dp->dp_bptree_obj = 0;
 			scn->scn_async_destroying = B_FALSE;
 			scn->scn_async_stalled = B_FALSE;
 		} else {
 			/*
 			 * If we didn't make progress, mark the async
 			 * destroy as stalled, so that we will not initiate
 			 * a spa_sync() on its behalf.  Note that we only
 			 * check this if we are not finished, because if the
 			 * bptree had no blocks for us to visit, we can
 			 * finish without "making progress".
 			 */
 			scn->scn_async_stalled =
 			    (scn->scn_visited_this_txg == 0);
 		}
 	}
 	/* Step 3: report what was freed and reset the per-txg counters. */
 	if (scn->scn_visited_this_txg) {
 		zfs_dbgmsg("freed %llu blocks in %llums from "
 		    "free_bpobj/bptree on %s in txg %llu; err=%u",
 		    (longlong_t)scn->scn_visited_this_txg,
 		    (longlong_t)
 		    NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
 		    spa->spa_name, (longlong_t)tx->tx_txg, err);
 		scn->scn_visited_this_txg = 0;
 		scn->scn_dedup_frees_this_txg = 0;
 
 		/*
 		 * Write out changes to the DDT and the BRT that may be required
 		 * as a result of the blocks freed.  This ensures that the DDT
 		 * and the BRT are clean when a scrub/resilver runs.
 		 */
 		ddt_sync(spa, tx->tx_txg);
 		brt_sync(spa, tx->tx_txg);
 	}
 	/* A pause (ERESTART) or failure above ends this txg's work here. */
 	if (err != 0)
 		return (err);
 	/* Step 4: only when zfs_free_leak_on_eio is set. */
 	if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
 	    zfs_free_leak_on_eio &&
 	    (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
 	    dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 ||
 	    dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) {
 		/*
 		 * We have finished background destroying, but there is still
 		 * some space left in the dp_free_dir. Transfer this leaked
 		 * space to the dp_leak_dir.
 		 */
 		if (dp->dp_leak_dir == NULL) {
 			/* Create and open the leak dir on first use. */
 			rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 			(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
 			    LEAK_DIR_NAME, tx);
 			VERIFY0(dsl_pool_open_special_dir(dp,
 			    LEAK_DIR_NAME, &dp->dp_leak_dir));
 			rrw_exit(&dp->dp_config_rwlock, FTAG);
 		}
 		/* Charge the leak dir, then remove the same from the free dir. */
 		dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
 		    dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
 		    dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
 		    dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
 		dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
 		    -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
 		    -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
 		    -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
 	}
 
 	if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
 	    !spa_livelist_delete_check(spa)) {
 		/* finished; verify that space accounting went to zero */
 		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
 		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
 		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
 	}
 
 	spa_notify_waiters(spa);
 
 	/* The obsolete bpobj is open exactly when its directory entry exists. */
 	EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj),
 	    0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_OBSOLETE_BPOBJ));
 	/*
 	 * Step 5. err is necessarily 0 here because of the early return
 	 * above.
 	 */
 	if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) {
 		ASSERT(spa_feature_is_active(dp->dp_spa,
 		    SPA_FEATURE_OBSOLETE_COUNTS));
 
 		scn->scn_is_bptree = B_FALSE;
 		scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms;
 		err = bpobj_iterate(&dp->dp_obsolete_bpobj,
 		    dsl_scan_obsolete_block_cb, scn, tx);
 		if (err != 0 && err != ERESTART)
 			zfs_panic_recover("error %u from bpobj_iterate()", err);
 
 		if (bpobj_is_empty(&dp->dp_obsolete_bpobj))
 			dsl_pool_destroy_obsolete_bpobj(dp, tx);
 	}
 	/* The result of the obsolete pass is intentionally not returned. */
 	return (0);
 }
 
 /*
  * Parse a bookmark name of the form "objset:object:level:blkid" into zb,
  * converting each field with zfs_strtonum(). The separators and the
  * terminating NUL are checked with ASSERT only.
  */
 static void
 name_to_bookmark(char *buf, zbookmark_phys_t *zb)
 {
 	char *cursor = buf;
 
 	zb->zb_objset = zfs_strtonum(cursor, &cursor);
 	ASSERT(*cursor == ':');
 	cursor++;
 
 	zb->zb_object = zfs_strtonum(cursor, &cursor);
 	ASSERT(*cursor == ':');
 	cursor++;
 
 	zb->zb_level = (int)zfs_strtonum(cursor, &cursor);
 	ASSERT(*cursor == ':');
 	cursor++;
 
 	zb->zb_blkid = zfs_strtonum(cursor, &cursor);
 	ASSERT(*cursor == '\0');
 }
 
 /*
  * Parse buf as a single number with zfs_strtonum() and store it in *obj.
  * The whole string is expected to be consumed (checked with ASSERT only).
  */
 static void
 name_to_object(char *buf, uint64_t *obj)
 {
 	char *end = buf;
 
 	*obj = zfs_strtonum(buf, &end);
 	ASSERT(*end == '\0');
 }
 
 /*
  * Look up the block pointer identified by bookmark zb and, if it exists
  * and is not a hole, issue a scrub read for it via scan_exec_io(). Any
  * failure along the way (dataset hold, objset lookup, unavailable
  * encryption key, dnode hold, block pointer lookup) makes the function
  * return without issuing I/O.
  */
 static void
 read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	dsl_dataset_t *ds;
 	objset_t *os;
 	dnode_t *dn;
 	blkptr_t bp;
 
 	if (dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds) != 0)
 		return;
 
 	if (dmu_objset_from_ds(ds, &os) != 0)
 		goto out_ds;
 
 	/*
 	 * If the key is not loaded dbuf_dnode_findbp() will error out with
 	 * EACCES. However in that case dnode_hold() will eventually call
 	 * dbuf_read()->zio_wait() which may call spa_log_error(). This will
 	 * lead to a deadlock due to us holding the mutex spa_errlist_lock.
 	 * Avoid this by checking here if the keys are loaded, if not return.
 	 * If the keys are not loaded the head_errlog feature is meaningless
 	 * as we cannot figure out the birth txg of the block pointer.
 	 */
 	if (dsl_dataset_get_keystatus(ds->ds_dir) ==
 	    ZFS_KEYSTATUS_UNAVAILABLE)
 		goto out_ds;
 
 	if (dnode_hold(os, zb.zb_object, FTAG, &dn) != 0)
 		goto out_ds;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dbuf_dnode_findbp(dn, zb.zb_level, zb.zb_blkid, &bp, NULL,
 	    NULL) == 0 && !BP_IS_HOLE(&bp)) {
 		int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW |
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB;
 
 		/* If it's an intent log block, failure is expected. */
 		if (zb.zb_level == ZB_ZIL_LEVEL)
 			zio_flags |= ZIO_FLAG_SPECULATIVE;
 
 		ASSERT(!BP_IS_EMBEDDED(&bp));
 		scan_exec_io(dp, &bp, zio_flags, &zb, NULL);
 	}
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 
 out_ds:
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /*
  * Scrub the error block described by "zep" in filesystem "fs" and in the
  * snapshots created from it that are still candidates for referencing the
  * block.  Every block scrubbed is counted in "*count", which is used when
  * deciding whether we exceeded zfs_scrub_error_blocks_per_txg.  This
  * function is modelled after check_filesystem().
  *
  * Returns 0 on success, EFAULT when the per-txg limit was reached or
  * dsl_error_scrub_check_suspend() asked us to stop, or the error returned
  * by dsl_dataset_hold_obj() / zap_count().
  */
 static int
 scrub_filesystem(spa_t *spa, uint64_t fs, zbookmark_err_phys_t *zep,
     int *count)
 {
 	dsl_dataset_t *ds;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 
 	int error = dsl_dataset_hold_obj(dp, fs, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	uint64_t latest_txg;
 	uint64_t txg_to_consider = spa->spa_syncing_txg;
 	boolean_t check_snapshot = B_TRUE;
 
 	error = find_birth_txg(ds, zep, &latest_txg);
 
 	/*
 	 * If find_birth_txg() errors out, then err on the side of caution and
 	 * proceed. In worst case scenario scrub all objects. If zep->zb_birth
 	 * is 0 (e.g. in case of encryption with unloaded keys) also proceed to
 	 * scrub all objects.
 	 */
 	if (error == 0 && zep->zb_birth == latest_txg) {
 		/* Block neither freed nor rewritten. */
 		zbookmark_phys_t zb;
 		zep_to_zb(fs, zep, &zb);
 		scn->scn_zio_root = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);
 		/* We have already acquired the config lock for spa */
 		read_by_block_level(scn, zb);
 
 		(void) zio_wait(scn->scn_zio_root);
 		scn->scn_zio_root = NULL;
 
 		scn->errorscrub_phys.dep_examined++;
 		scn->errorscrub_phys.dep_to_examine--;
 		(*count)++;
 		if ((*count) == zfs_scrub_error_blocks_per_txg ||
 		    dsl_error_scrub_check_suspend(scn, &zb)) {
 			dsl_dataset_rele(ds, FTAG);
 			return (SET_ERROR(EFAULT));
 		}
 
 		/*
 		 * The head still has the block with the logged birth txg, so
 		 * the snapshots below are scrubbed without re-checking it.
 		 */
 		check_snapshot = B_FALSE;
 	} else if (error == 0) {
 		txg_to_consider = latest_txg;
 	}
 
 	/*
 	 * Retrieve the number of snapshots if the dataset is not a snapshot.
 	 */
 	uint64_t snap_count = 0;
 	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
 
 		error = zap_count(spa->spa_meta_objset,
 		    dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
 
 		if (error != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			return (error);
 		}
 	}
 
 	if (snap_count == 0) {
 		/* Filesystem without snapshots. */
 		dsl_dataset_rele(ds, FTAG);
 		return (0);
 	}
 
 	uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 
 	dsl_dataset_rele(ds, FTAG);
 
 	/* Check only snapshots created from this file system. */
 	while (snap_obj != 0 && zep->zb_birth < snap_obj_txg &&
 	    snap_obj_txg <= txg_to_consider) {
 
 		error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds);
 		if (error != 0)
 			return (error);
 
 		/* Skip snapshots whose head dataset is not "fs". */
 		if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != fs) {
 			snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 			snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 			dsl_dataset_rele(ds, FTAG);
 			continue;
 		}
 
 		boolean_t affected = B_TRUE;
 		if (check_snapshot) {
 			uint64_t blk_txg;
 			error = find_birth_txg(ds, zep, &blk_txg);
 
 			/*
 			 * Scrub the snapshot also when zb_birth == 0 or when
 			 * find_birth_txg() returns an error.
 			 */
 			affected = (error == 0 && zep->zb_birth == blk_txg) ||
 			    (error != 0) || (zep->zb_birth == 0);
 		}
 
 		/* Scrub snapshots. */
 		if (affected) {
 			zbookmark_phys_t zb;
 			zep_to_zb(snap_obj, zep, &zb);
 			scn->scn_zio_root = zio_root(spa, NULL, NULL,
 			    ZIO_FLAG_CANFAIL);
 			/* We have already acquired the config lock for spa */
 			read_by_block_level(scn, zb);
 
 			(void) zio_wait(scn->scn_zio_root);
 			scn->scn_zio_root = NULL;
 
 			scn->errorscrub_phys.dep_examined++;
 			scn->errorscrub_phys.dep_to_examine--;
 			(*count)++;
 			if ((*count) == zfs_scrub_error_blocks_per_txg ||
 			    dsl_error_scrub_check_suspend(scn, &zb)) {
 				dsl_dataset_rele(ds, FTAG);
 				/* Same convention as the head case above. */
 				return (SET_ERROR(EFAULT));
 			}
 		}
 		snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 		snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 		dsl_dataset_rele(ds, FTAG);
 	}
 	return (0);
 }
 
 /*
  * Error-scrub work done from sync pass 1: walk the ZAP entries reachable
  * from scn->errorscrub_cursor and re-read the blocks they describe.
  *
  * Without SPA_FEATURE_HEAD_ERRLOG each entry name is a bookmark that is
  * read directly, and the walk stops once zfs_scrub_error_blocks_per_txg
  * blocks have been examined or dsl_error_scrub_check_suspend() requests it.
  * With the feature enabled each entry names a head dataset whose own error
  * ZAP is walked, and the limit is applied inside scrub_filesystem(), which
  * reports it by returning EFAULT.
  *
  * If the limit was never hit the error scrub is marked done; in either case
  * the error scrub state is synced through dsl_errorscrub_sync_state().
  */
 void
 dsl_errorscrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	spa_t *spa = dp->dp_spa;
 	dsl_scan_t *scn = dp->dp_scan;
 
 	/*
 	 * Only process scans in sync pass 1.
 	 */
 
 	if (spa_sync_pass(spa) > 1)
 		return;
 
 	/*
 	 * If the spa is shutting down, then stop scanning. This will
 	 * ensure that the scan does not dirty any new data during the
 	 * shutdown phase.
 	 */
 	if (spa_shutting_down(spa))
 		return;
 
 	if (!dsl_errorscrub_active(scn) || dsl_errorscrub_is_paused(scn)) {
 		return;
 	}
 
 	if (dsl_scan_resilvering(scn->scn_dp)) {
 		/* cancel the error scrub if resilver started */
 		dsl_scan_cancel(scn->scn_dp);
 		return;
 	}
 
 	spa->spa_scrub_active = B_TRUE;
 	scn->scn_sync_start_time = gethrtime();
 
 	/*
 	 * zfs_scan_suspend_progress can be set to disable scrub progress.
 	 * See more detailed comment in dsl_scan_sync().
 	 */
 	if (zfs_scan_suspend_progress) {
 		uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time;
 		/* Unsigned, matching the equivalent code in dsl_scan_sync(). */
 		uint_t mintime = zfs_scrub_min_time_ms;
 
 		while (zfs_scan_suspend_progress &&
 		    !txg_sync_waiting(scn->scn_dp) &&
 		    !spa_shutting_down(scn->scn_dp->dp_spa) &&
 		    NSEC2MSEC(scan_time_ns) < mintime) {
 			delay(hz);
 			scan_time_ns = gethrtime() - scn->scn_sync_start_time;
 		}
 		return;
 	}
 
 	/* Number of error blocks examined so far in this call. */
 	int i = 0;
 	zap_attribute_t *za;
 	zbookmark_phys_t *zb;
 	boolean_t limit_exceeded = B_FALSE;
 
 	za = zap_attribute_alloc();
 	zb = kmem_zalloc(sizeof (zbookmark_phys_t), KM_SLEEP);
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 		for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0;
 		    zap_cursor_advance(&scn->errorscrub_cursor)) {
 			name_to_bookmark(za->za_name, zb);
 
 			scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 			    NULL, ZIO_FLAG_CANFAIL);
 			dsl_pool_config_enter(dp, FTAG);
 			read_by_block_level(scn, *zb);
 			dsl_pool_config_exit(dp, FTAG);
 
 			(void) zio_wait(scn->scn_zio_root);
 			scn->scn_zio_root = NULL;
 
 			scn->errorscrub_phys.dep_examined += 1;
 			scn->errorscrub_phys.dep_to_examine -= 1;
 			i++;
 			if (i == zfs_scrub_error_blocks_per_txg ||
 			    dsl_error_scrub_check_suspend(scn, zb)) {
 				limit_exceeded = B_TRUE;
 				break;
 			}
 		}
 
 		if (!limit_exceeded)
 			dsl_errorscrub_done(scn, B_TRUE, tx);
 
 		dsl_errorscrub_sync_state(scn, tx);
 		zap_attribute_free(za);
 		kmem_free(zb, sizeof (*zb));
 		return;
 	}
 
 	int error = 0;
 	for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0;
 	    zap_cursor_advance(&scn->errorscrub_cursor)) {
 
 		zap_cursor_t *head_ds_cursor;
 		zap_attribute_t *head_ds_attr;
 		zbookmark_err_phys_t head_ds_block;
 
 		head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP);
 		head_ds_attr = zap_attribute_alloc();
 
 		/*
 		 * The entry's name is the head dataset object and its first
 		 * integer is the object of that dataset's error ZAP.
 		 */
 		uint64_t head_ds_err_obj = za->za_first_integer;
 		uint64_t head_ds;
 		name_to_object(za->za_name, &head_ds);
 		boolean_t config_held = B_FALSE;
 		uint64_t top_affected_fs;
 
 		for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset,
 		    head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor,
 		    head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) {
 
 			name_to_errphys(head_ds_attr->za_name, &head_ds_block);
 
 			/*
 			 * In case we are called from spa_sync the pool
 			 * config is already held.
 			 */
 			if (!dsl_pool_config_held(dp)) {
 				dsl_pool_config_enter(dp, FTAG);
 				config_held = B_TRUE;
 			}
 
 			error = find_top_affected_fs(spa,
 			    head_ds, &head_ds_block, &top_affected_fs);
 			if (error)
 				break;
 
 			error = scrub_filesystem(spa, top_affected_fs,
 			    &head_ds_block, &i);
 
 			/*
 			 * Compare against the plain errno value; SET_ERROR()
 			 * is meant for the place where an error originates.
 			 *
 			 * NOTE(review): this only leaves the inner loop; the
 			 * outer loop keeps iterating over the remaining head
 			 * datasets -- confirm that is intended.
 			 */
 			if (error == EFAULT) {
 				limit_exceeded = B_TRUE;
 				break;
 			}
 		}
 
 		zap_cursor_fini(head_ds_cursor);
 		kmem_free(head_ds_cursor, sizeof (*head_ds_cursor));
 		zap_attribute_free(head_ds_attr);
 
 		if (config_held)
 			dsl_pool_config_exit(dp, FTAG);
 	}
 
 	zap_attribute_free(za);
 	kmem_free(zb, sizeof (*zb));
 	if (!limit_exceeded)
 		dsl_errorscrub_done(scn, B_TRUE, tx);
 
 	dsl_errorscrub_sync_state(scn, tx);
 }
 
 /*
  * This is the primary entry point for scans that is called from syncing
  * context. Scans must happen entirely during syncing context so that we
  * can guarantee that blocks we are currently scanning will not change out
  * from under us. While a scan is active, this function controls how quickly
  * transaction groups proceed, instead of the normal handling provided by
  * txg_sync_thread().
  */
 void
 dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	int err = 0;
 	dsl_scan_t *scn = dp->dp_scan;
 	spa_t *spa = dp->dp_spa;
 	/* Upgraded to SYNC_MANDATORY only when the scan is marked done. */
 	state_sync_type_t sync_type = SYNC_OPTIONAL;
 	int restart_early = 0;
 
 	if (spa->spa_resilver_deferred) {
 		uint64_t to_issue, issued;
 
 		if (!spa_feature_is_active(dp->dp_spa,
 		    SPA_FEATURE_RESILVER_DEFER))
 			spa_feature_incr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
 
 		/*
 		 * See print_scan_scrub_resilver_status() issued/total_i
 		 * @ cmd/zpool/zpool_main.c
 		 */
 		to_issue =
 		    scn->scn_phys.scn_to_examine - scn->scn_phys.scn_skipped;
 		issued =
 		    scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
 		/*
 		 * Restart early when deferral is disabled, or when less than
 		 * zfs_resilver_defer_percent percent of to_issue was issued.
 		 */
 		restart_early =
 		    zfs_resilver_disable_defer ||
 		    (issued < (to_issue * zfs_resilver_defer_percent / 100));
 	}
 
 	/*
 	 * Only process scans in sync pass 1.
 	 */
 	if (spa_sync_pass(spa) > 1)
 		return;
 
 
 	/*
 	 * Check for scn_restart_txg before checking spa_load_state, so
 	 * that we can restart an old-style scan while the pool is being
 	 * imported (see dsl_scan_init). We also restart scans if there
 	 * is a deferred resilver and the user has manually disabled
 	 * deferred resilvers via zfs_resilver_disable_defer, or if the
 	 * current scan progress is below zfs_resilver_defer_percent.
 	 */
 	if (dsl_scan_restarting(scn, tx) || restart_early) {
 		setup_sync_arg_t setup_sync_arg = {
 			.func = POOL_SCAN_SCRUB,
 			.txgstart = 0,
 			.txgend = 0,
 		};
 		dsl_scan_done(scn, B_FALSE, tx);
 		/* Restart as a resilver instead of a scrub if one is needed. */
 		if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
 			setup_sync_arg.func = POOL_SCAN_RESILVER;
 		zfs_dbgmsg("restarting scan func=%u on %s txg=%llu early=%d",
 		    setup_sync_arg.func, dp->dp_spa->spa_name,
 		    (longlong_t)tx->tx_txg, restart_early);
 		dsl_scan_setup_sync(&setup_sync_arg, tx);
 	}
 
 	/*
 	 * If the spa is shutting down, then stop scanning. This will
 	 * ensure that the scan does not dirty any new data during the
 	 * shutdown phase.
 	 */
 	if (spa_shutting_down(spa))
 		return;
 
 	/*
 	 * If the scan is inactive due to a stalled async destroy, try again.
 	 */
 	if (!scn->scn_async_stalled && !dsl_scan_active(scn))
 		return;
 
 	/* reset scan statistics */
 	scn->scn_visited_this_txg = 0;
 	scn->scn_dedup_frees_this_txg = 0;
 	scn->scn_holes_this_txg = 0;
 	scn->scn_lt_min_this_txg = 0;
 	scn->scn_gt_max_this_txg = 0;
 	scn->scn_ddt_contained_this_txg = 0;
 	scn->scn_objsets_visited_this_txg = 0;
 	scn->scn_avg_seg_size_this_txg = 0;
 	scn->scn_segs_this_txg = 0;
 	scn->scn_avg_zio_size_this_txg = 0;
 	scn->scn_zios_this_txg = 0;
 	scn->scn_suspending = B_FALSE;
 	scn->scn_sync_start_time = gethrtime();
 	spa->spa_scrub_active = B_TRUE;
 
 	/*
 	 * First process the async destroys.  If we suspend, don't do
 	 * any scrubbing or resilvering.  This ensures that there are no
 	 * async destroys while we are scanning, so the scan code doesn't
 	 * have to worry about traversing it.  It is also faster to free the
 	 * blocks than to scrub them.
 	 */
 	err = dsl_process_async_destroys(dp, tx);
 	if (err != 0)
 		return;
 
 	if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn))
 		return;
 
 	/*
 	 * Wait a few txgs after importing to begin scanning so that
 	 * we can get the pool imported quickly.
 	 */
 	if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS)
 		return;
 
 	/*
 	 * zfs_scan_suspend_progress can be set to disable scan progress.
 	 * We don't want to spin the txg_sync thread, so we add a delay
 	 * here to simulate the time spent doing a scan. This is mostly
 	 * useful for testing and debugging.
 	 */
 	if (zfs_scan_suspend_progress) {
 		uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time;
 		uint_t mintime = (scn->scn_phys.scn_func ==
 		    POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms :
 		    zfs_scrub_min_time_ms;
 
 		while (zfs_scan_suspend_progress &&
 		    !txg_sync_waiting(scn->scn_dp) &&
 		    !spa_shutting_down(scn->scn_dp->dp_spa) &&
 		    NSEC2MSEC(scan_time_ns) < mintime) {
 			delay(hz);
 			scan_time_ns = gethrtime() - scn->scn_sync_start_time;
 		}
 		return;
 	}
 
 	/*
 	 * Disabled by default, set zfs_scan_report_txgs to report
 	 * average performance over the last zfs_scan_report_txgs TXGs.
 	 */
 	if (zfs_scan_report_txgs != 0 &&
 	    tx->tx_txg % zfs_scan_report_txgs == 0) {
 		scn->scn_issued_before_pass += spa->spa_scan_pass_issued;
 		spa_scan_stat_init(spa);
 	}
 
 	/*
 	 * It is possible to switch from unsorted to sorted at any time,
 	 * but afterwards the scan will remain sorted unless reloaded from
 	 * a checkpoint after a reboot.
 	 */
 	if (!zfs_scan_legacy) {
 		scn->scn_is_sorted = B_TRUE;
 		if (scn->scn_last_checkpoint == 0)
 			scn->scn_last_checkpoint = ddi_get_lbolt();
 	}
 
 	/*
 	 * For sorted scans, determine what kind of work we will be doing
 	 * this txg based on our memory limitations and whether or not we
 	 * need to perform a checkpoint.
 	 */
 	if (scn->scn_is_sorted) {
 		/*
 		 * If we are over our checkpoint interval, set scn_clearing
 		 * so that we can begin checkpointing immediately. The
 		 * checkpoint allows us to save a consistent bookmark
 		 * representing how much data we have scrubbed so far.
 		 * Otherwise, use the memory limit to determine if we should
 		 * scan for metadata or start issue scrub IOs. We accumulate
 		 * metadata until we hit our hard memory limit at which point
 		 * we issue scrub IOs until we are at our soft memory limit.
 		 */
 		if (scn->scn_checkpointing ||
 		    ddi_get_lbolt() - scn->scn_last_checkpoint >
 		    SEC_TO_TICK(zfs_scan_checkpoint_intval)) {
 			if (!scn->scn_checkpointing)
 				zfs_dbgmsg("begin scan checkpoint for %s",
 				    spa->spa_name);
 
 			scn->scn_checkpointing = B_TRUE;
 			scn->scn_clearing = B_TRUE;
 		} else {
 			boolean_t should_clear = dsl_scan_should_clear(scn);
 			if (should_clear && !scn->scn_clearing) {
 				zfs_dbgmsg("begin scan clearing for %s",
 				    spa->spa_name);
 				scn->scn_clearing = B_TRUE;
 			} else if (!should_clear && scn->scn_clearing) {
 				zfs_dbgmsg("finish scan clearing for %s",
 				    spa->spa_name);
 				scn->scn_clearing = B_FALSE;
 			}
 		}
 	} else {
 		ASSERT0(scn->scn_checkpointing);
 		ASSERT0(scn->scn_clearing);
 	}
 
 	/*
 	 * Exactly one of three phases runs per call: metadata traversal,
 	 * issuing queued scrub I/O, or marking the scan complete.
 	 */
 	if (!scn->scn_clearing && scn->scn_done_txg == 0) {
 		/* Need to scan metadata for more blocks to scrub */
 		dsl_scan_phys_t *scnp = &scn->scn_phys;
 		taskqid_t prefetch_tqid;
 
 		/*
 		 * Calculate the max number of in-flight bytes for pool-wide
 		 * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max).
 		 * Limits for the issuing phase are done per top-level vdev and
 		 * are handled separately.
 		 */
 		scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20,
 		    zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));
 
 		if (scnp->scn_ddt_bookmark.ddb_class <=
 		    scnp->scn_ddt_class_max) {
 			ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark));
 			zfs_dbgmsg("doing scan sync for %s txg %llu; "
 			    "ddt bm=%llu/%llu/%llu/%llx",
 			    spa->spa_name,
 			    (longlong_t)tx->tx_txg,
 			    (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
 			    (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
 			    (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
 			    (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
 		} else {
 			zfs_dbgmsg("doing scan sync for %s txg %llu; "
 			    "bm=%llu/%llu/%llu/%llu",
 			    spa->spa_name,
 			    (longlong_t)tx->tx_txg,
 			    (longlong_t)scnp->scn_bookmark.zb_objset,
 			    (longlong_t)scnp->scn_bookmark.zb_object,
 			    (longlong_t)scnp->scn_bookmark.zb_level,
 			    (longlong_t)scnp->scn_bookmark.zb_blkid);
 		}
 
 		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 		    NULL, ZIO_FLAG_CANFAIL);
 
 		/* Run the prefetch thread alongside the traversal below. */
 		scn->scn_prefetch_stop = B_FALSE;
 		prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq,
 		    dsl_scan_prefetch_thread, scn, TQ_SLEEP);
 		ASSERT(prefetch_tqid != TASKQID_INVALID);
 
 		dsl_pool_config_enter(dp, FTAG);
 		dsl_scan_visit(scn, tx);
 		dsl_pool_config_exit(dp, FTAG);
 
 		/*
 		 * Set the stop flag under spa_scrub_lock and broadcast on
 		 * spa_scrub_io_cv, then wait for the dispatched prefetch
 		 * task and for the I/O hanging off scn_zio_root.
 		 */
 		mutex_enter(&dp->dp_spa->spa_scrub_lock);
 		scn->scn_prefetch_stop = B_TRUE;
 		cv_broadcast(&spa->spa_scrub_io_cv);
 		mutex_exit(&dp->dp_spa->spa_scrub_lock);
 
 		taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid);
 		(void) zio_wait(scn->scn_zio_root);
 		scn->scn_zio_root = NULL;
 
 		zfs_dbgmsg("scan visited %llu blocks of %s in %llums "
 		    "(%llu os's, %llu holes, %llu < mintxg, "
 		    "%llu in ddt, %llu > maxtxg)",
 		    (longlong_t)scn->scn_visited_this_txg,
 		    spa->spa_name,
 		    (longlong_t)NSEC2MSEC(gethrtime() -
 		    scn->scn_sync_start_time),
 		    (longlong_t)scn->scn_objsets_visited_this_txg,
 		    (longlong_t)scn->scn_holes_this_txg,
 		    (longlong_t)scn->scn_lt_min_this_txg,
 		    (longlong_t)scn->scn_ddt_contained_this_txg,
 		    (longlong_t)scn->scn_gt_max_this_txg);
 
 		/*
 		 * The traversal finished without suspending: record the txg
 		 * from which the scan may be marked done.
 		 */
 		if (!scn->scn_suspending) {
 			ASSERT0(avl_numnodes(&scn->scn_queue));
 			scn->scn_done_txg = tx->tx_txg + 1;
 			if (scn->scn_is_sorted) {
 				scn->scn_checkpointing = B_TRUE;
 				scn->scn_clearing = B_TRUE;
 				scn->scn_issued_before_pass +=
 				    spa->spa_scan_pass_issued;
 				spa_scan_stat_init(spa);
 			}
 			zfs_dbgmsg("scan complete for %s txg %llu",
 			    spa->spa_name,
 			    (longlong_t)tx->tx_txg);
 		}
 	} else if (scn->scn_is_sorted && scn->scn_queues_pending != 0) {
 		ASSERT(scn->scn_clearing);
 
 		/* need to issue scrubbing IOs from per-vdev queues */
 		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 		    NULL, ZIO_FLAG_CANFAIL);
 		scan_io_queues_run(scn);
 		(void) zio_wait(scn->scn_zio_root);
 		scn->scn_zio_root = NULL;
 
 		/* calculate and dprintf the current memory usage */
 		(void) dsl_scan_should_clear(scn);
 		dsl_scan_update_stats(scn);
 
 		zfs_dbgmsg("scan issued %llu blocks for %s (%llu segs) "
 		    "in %llums (avg_block_size = %llu, avg_seg_size = %llu)",
 		    (longlong_t)scn->scn_zios_this_txg,
 		    spa->spa_name,
 		    (longlong_t)scn->scn_segs_this_txg,
 		    (longlong_t)NSEC2MSEC(gethrtime() -
 		    scn->scn_sync_start_time),
 		    (longlong_t)scn->scn_avg_zio_size_this_txg,
 		    (longlong_t)scn->scn_avg_seg_size_this_txg);
 	} else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) {
 		/* Finished with everything. Mark the scrub as complete */
 		zfs_dbgmsg("scan issuing complete txg %llu for %s",
 		    (longlong_t)tx->tx_txg,
 		    spa->spa_name);
 		ASSERT3U(scn->scn_done_txg, !=, 0);
 		ASSERT0(spa->spa_scrub_inflight);
 		ASSERT0(scn->scn_queues_pending);
 		dsl_scan_done(scn, B_TRUE, tx);
 		sync_type = SYNC_MANDATORY;
 	}
 
 	dsl_scan_sync_state(scn, tx, sync_type);
 }
 
 /*
  * Add the allocated size of "bp" to spa->spa_scan_pass_issued.  With "all"
  * set every DVA is counted, otherwise only the first one.
  */
 static void
 count_block_issued(spa_t *spa, const blkptr_t *bp, boolean_t all)
 {
 	uint64_t asize;
 
 	/*
 	 * Embedded bp's were already handled when the block containing
 	 * them was scanned, so they are not counted again.
 	 */
 	if (BP_IS_EMBEDDED(bp))
 		return;
 
 	/*
 	 * Sequential scrubs create one zio per DVA of the bp.  Each of them
 	 * carries all DVAs for repair purposes, but the zio code only tries
 	 * the first one unless there is an issue, so those IOs should only
 	 * be charged for the first DVA.
 	 */
 	if (all)
 		asize = BP_GET_ASIZE(bp);
 	else
 		asize = DVA_GET_ASIZE(&bp->blk_dva[0]);
 
 	atomic_add_64(&spa->spa_scan_pass_issued, asize);
 }
 
 /*
  * Add the allocated size of a skipped "bp" to scn_phys.scn_skipped.  With
  * "all" set every DVA is counted, otherwise only the first one.  Embedded
  * bp's are ignored.
  */
 static void
 count_block_skipped(dsl_scan_t *scn, const blkptr_t *bp, boolean_t all)
 {
 	uint64_t asize;
 
 	if (BP_IS_EMBEDDED(bp))
 		return;
 
 	if (all)
 		asize = BP_GET_ASIZE(bp);
 	else
 		asize = DVA_GET_ASIZE(&bp->blk_dva[0]);
 
 	atomic_add_64(&scn->scn_phys.scn_skipped, asize);
 }
 
 /*
  * Fold "bp" into the block statistics table "zab".  The block is recorded
  * in four cells: its own level or the DN_MAX_LEVELS row, combined with its
  * own type or the DMU_OT_TOTAL column.
  */
 static void
 count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
 {
 	/*
 	 * zab is NULL when we resume after a reboot; incomplete stats are
 	 * not recorded in that case.
 	 */
 	if (zab == NULL)
 		return;
 
 	for (int pass = 0; pass < 4; pass++) {
 		int level, type;
 
 		if (pass < 2)
 			level = BP_GET_LEVEL(bp);
 		else
 			level = DN_MAX_LEVELS;
 
 		if (pass & 1)
 			type = BP_GET_TYPE(bp);
 		else
 			type = DMU_OT_TOTAL;
 
 		/* All new-style types share the DMU_OT_OTHER column. */
 		if (type & DMU_OT_NEWTYPE)
 			type = DMU_OT_OTHER;
 
 		zfs_blkstat_t *stat = &zab->zab_type[level][type];
 
 		stat->zb_count++;
 		stat->zb_asize += BP_GET_ASIZE(bp);
 		stat->zb_lsize += BP_GET_LSIZE(bp);
 		stat->zb_psize += BP_GET_PSIZE(bp);
 		stat->zb_gangs += BP_COUNT_GANG(bp);
 
 		/* Track ditto copies that landed on the same vdev. */
 		int ndvas = BP_GET_NDVAS(bp);
 		if (ndvas == 2) {
 			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[1]))
 				stat->zb_ditto_2_of_2_samevdev++;
 		} else if (ndvas == 3) {
 			int same = 0;
 
 			/* Count the matching pairs among the three DVAs. */
 			same += (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[1]));
 			same += (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[2]));
 			same += (DVA_GET_VDEV(&bp->blk_dva[1]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[2]));
 
 			if (same == 1)
 				stat->zb_ditto_2_of_3_samevdev++;
 			else if (same == 3)
 				stat->zb_ditto_3_of_3_samevdev++;
 		}
 	}
 }
 
 /*
  * Insert "sio" into the queue's address-sorted AVL tree and extent range
  * tree.  If an equal entry is already present, "sio" is freed instead.
  * The caller must hold the vdev's vdev_scan_io_queue_lock.
  */
 static void
 scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
 {
 	dsl_scan_t *scn = queue->q_scn;
 	avl_index_t where;
 
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 
 	/* A queue that was empty until now starts counting as pending. */
 	if (unlikely(avl_is_empty(&queue->q_sios_by_addr)))
 		atomic_add_64(&scn->scn_queues_pending, 1);
 
 	if (avl_find(&queue->q_sios_by_addr, sio, &where) == NULL) {
 		avl_insert(&queue->q_sios_by_addr, sio, where);
 		queue->q_sio_memused += SIO_GET_MUSED(sio);
 		zfs_range_tree_add(queue->q_exts_by_addr,
 		    SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio));
 		return;
 	}
 
 	/* block is already scheduled for reading */
 	sio_free(sio);
 }
 
 /*
  * Given all the info we got from our metadata scanning process, we
  * construct a scan_io_t and insert it into the scan sorting queue. The
  * I/O must already be suitable for us to process. This is controlled
  * by dsl_scan_enqueue().
  */
 static void
 scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
     int zio_flags, const zbookmark_phys_t *zb)
 {
 	scan_io_t *sio;
 
 	sio = sio_alloc(BP_GET_NDVAS(bp));
 
 	ASSERT0(BP_IS_GANG(bp));
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 
 	/* Fill the scan_io_t from the bp, then add flags and bookmark. */
 	bp2sio(bp, sio, dva_i);
 	sio->sio_zb = *zb;
 	sio->sio_flags = zio_flags;
 
 	/* Reset the remembered extent address before inserting. */
 	queue->q_last_ext_addr = -1;
 	scan_io_queue_insert_impl(queue, sio);
 }
 
 /*
  * Given a set of I/O parameters as discovered by the metadata traversal
  * process, attempts to place the I/O into the sorted queues (if allowed),
  * or immediately executes the I/O.
  */
 static void
 dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
     const zbookmark_phys_t *zb)
 {
 	spa_t *spa = dp->dp_spa;
 
 	ASSERT(!BP_IS_EMBEDDED(bp));
 
 	/*
 	 * Unsorted scans issue everything right away.  Gang blocks are hard
 	 * to issue sequentially, so they are issued immediately as well
 	 * rather than being queued.
 	 */
 	if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) {
 		scan_exec_io(dp, bp, zio_flags, zb, NULL);
 		return;
 	}
 
 	/* Queue one scan I/O per DVA on that DVA's top-level vdev. */
 	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
 		dva_t dva = bp->blk_dva[d];
 		vdev_t *tvd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva));
 
 		ASSERT(tvd != NULL);
 
 		mutex_enter(&tvd->vdev_scan_io_queue_lock);
 		/* Queues are created lazily on first use. */
 		if (tvd->vdev_scan_io_queue == NULL)
 			tvd->vdev_scan_io_queue = scan_io_queue_create(tvd);
 		ASSERT(dp->dp_scan != NULL);
 		scan_io_queue_insert(tvd->vdev_scan_io_queue, bp, d,
 		    zio_flags, zb);
 		mutex_exit(&tvd->vdev_scan_io_queue_lock);
 	}
 }
 
 /*
  * Per-block callback for scrub and resilver scans: account for the block,
  * decide whether it has to be read, and either enqueue the I/O or count
  * the block as skipped.  Always returns 0.
  */
 static int
 dsl_scan_scrub_cb(dsl_pool_t *dp,
     const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	dsl_scan_t *scn = dp->dp_scan;
 	spa_t *spa = dp->dp_spa;
 	uint64_t birth = BP_GET_BIRTH(bp);
 	size_t psize = BP_GET_PSIZE(bp);
 	int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
 	boolean_t needs_io;
 
 	count_block(dp->dp_blkstats, bp);
 
 	/* Only blocks born strictly between min and max txg are scanned. */
 	if (birth <= scn->scn_phys.scn_min_txg ||
 	    birth >= scn->scn_phys.scn_max_txg) {
 		count_block_skipped(scn, bp, B_TRUE);
 		return (0);
 	}
 
 	/* Embedded BP's have phys_birth==0, so we reject them above. */
 	ASSERT(!BP_IS_EMBEDDED(bp));
 
 	ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
 
 	/* A scrub reads every block; a resilver decides per DVA below. */
 	needs_io = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB);
 	if (needs_io) {
 		zio_flags |= ZIO_FLAG_SCRUB;
 	} else {
 		ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
 		zio_flags |= ZIO_FLAG_RESILVER;
 	}
 
 	/* If it's an intent log block, failure is expected. */
 	if (zb->zb_level == ZB_ZIL_LEVEL)
 		zio_flags |= ZIO_FLAG_SPECULATIVE;
 
 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
 		const dva_t *dva = &bp->blk_dva[i];
 		uint64_t asize = DVA_GET_ASIZE(dva);
 
 		/*
 		 * Account for the data examined so that zpool(8) status
 		 * can make useful progress reports.
 		 */
 		scn->scn_phys.scn_examined += asize;
 		spa->spa_scan_pass_exam += asize;
 
 		/* if it's a resilver, this may not be in the target range */
 		if (needs_io)
 			continue;
 		needs_io = dsl_scan_need_resilver(spa, dva, psize, birth);
 	}
 
 	if (!needs_io || zfs_no_scrub_io)
 		count_block_skipped(scn, bp, B_TRUE);
 	else
 		dsl_scan_enqueue(dp, bp, zio_flags, zb);
 
 	/* do not relocate this block */
 	return (0);
 }
 
 /*
  * Completion callback for scan reads issued by scan_exec_io(): frees the
  * data buffer, gives back the in-flight byte budget (global or per-queue,
  * depending on io_private) and counts the I/O error, if any.
  */
 static void
 dsl_scan_scrub_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	dsl_scan_io_queue_t *queue = zio->io_private;
 
 	abd_free(zio->io_abd);
 
 	if (queue != NULL) {
 		/* Issued through a per-vdev queue. */
 		mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock);
 		ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp));
 		queue->q_inflight_bytes -= BP_GET_PSIZE(bp);
 		cv_broadcast(&queue->q_zio_cv);
 		mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock);
 	} else {
 		/* Issued against the global in-flight limit. */
 		mutex_enter(&spa->spa_scrub_lock);
 		ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
 		spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
 		cv_broadcast(&spa->spa_scrub_io_cv);
 		mutex_exit(&spa->spa_scrub_lock);
 	}
 
 	if (zio->io_error == 0)
 		return;
 
 	/* Checksum errors on speculative reads are not counted. */
 	if (zio->io_error == ECKSUM &&
 	    (zio->io_flags & ZIO_FLAG_SPECULATIVE))
 		return;
 
 	/* Charge the error to the error scrub if one is running unpaused. */
 	if (dsl_errorscrubbing(spa->spa_dsl_pool) &&
 	    !dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) {
 		atomic_inc_64(&spa->spa_dsl_pool->dp_scan
 		    ->errorscrub_phys.dep_errors);
 		return;
 	}
 
 	atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys
 	    .scn_errors);
 }
 
 /*
  * Given a scanning zio's information, executes the zio. The zio need
  * not necessarily be only sortable, this function simply executes the
  * zio, no matter what it is. The optional queue argument allows the
  * caller to specify that they want per top level vdev IO rate limiting
  * instead of the legacy global limiting.
  */
 static void
 scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
     const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue)
 {
 	spa_t *spa = dp->dp_spa;
 	dsl_scan_t *scn = dp->dp_scan;
 	size_t size = BP_GET_PSIZE(bp);
 	abd_t *data = abd_alloc_for_io(size, B_FALSE);
 	zio_t *pio;
 
 	/*
 	 * Block until there is room under the applicable in-flight limit,
 	 * then charge this block against it and pick the parent zio.
 	 */
 	if (queue != NULL) {
 		/* Per top-level vdev limit. */
 		kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
 
 		ASSERT3U(queue->q_maxinflight_bytes, >, 0);
 		mutex_enter(q_lock);
 		while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes)
 			cv_wait(&queue->q_zio_cv, q_lock);
 		queue->q_inflight_bytes += BP_GET_PSIZE(bp);
 		pio = queue->q_zio;
 		mutex_exit(q_lock);
 	} else {
 		/* Legacy global limit. */
 		ASSERT3U(scn->scn_maxinflight_bytes, >, 0);
 		mutex_enter(&spa->spa_scrub_lock);
 		while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)
 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 		spa->spa_scrub_inflight += BP_GET_PSIZE(bp);
 		mutex_exit(&spa->spa_scrub_lock);
 		pio = scn->scn_zio_root;
 	}
 
 	ASSERT(pio != NULL);
 	count_block_issued(spa, bp, queue == NULL);
 
 	/* dsl_scan_scrub_done() gives the in-flight bytes back. */
 	zio_nowait(zio_read(pio, spa, bp, data, size, dsl_scan_scrub_done,
 	    queue, ZIO_PRIORITY_SCRUB, zio_flags, zb));
 }
 
 /*
  * This is the primary extent sorting algorithm. We balance two parameters:
  * 1) how many bytes of I/O are in an extent
  * 2) how well the extent is filled with I/O (as a fraction of its total size)
  * Since we allow extents to have gaps between their constituent I/Os, it's
  * possible to have a fairly large extent that contains the same amount of
  * I/O bytes as a much smaller extent, which just packs the I/O more tightly.
  * The algorithm sorts based on a score calculated from the extent's size,
  * the relative fill volume (in %) and a "fill weight" parameter that controls
  * the split between whether we prefer larger extents or more well populated
  * extents:
  *
  * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT)
  *
  * Example:
  * 1) assume extsz = 64 MiB
  * 2) assume fill = 32 MiB (extent is half full)
  * 3) assume fill_weight = 3
  * 4)	SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100
  *	SCORE = 32M + (50 * 3 * 32M) / 100
  *	SCORE = 32M + (4800M / 100)
  *	SCORE = 32M + 48M
  *	         ^     ^
  *	         |     +--- final total relative fill-based score
  *	         +--------- final total fill-based score
  *	SCORE = 80M
  *
  * As can be seen, at fill_weight=3, the algorithm is slightly biased towards
  * extents that are more completely filled (in a 3:2 ratio) vs just larger.
  * Note that as an optimization, we replace multiplication and division by
  * 100 with bitshifting by 7 (which effectively multiplies and divides by 128).
  *
  * Since we do not care if one extent is only a few percent better than
  * another, compress the score into 6 bits via binary logarithm AKA
  * highbit64() and put it into the high bits of the offset, which are
  * otherwise unused due to ashift.  This allows us to reduce q_exts_by_size
  * B-tree elements to only 64 bits and compare them with a single operation.
  * Plus it makes scrubs more sequential and reduces the chance that a minor
  * extent change moves it within the B-tree.
  */
 __attribute__((always_inline)) inline
 static int
 ext_size_compare(const void *x, const void *y)
 {
 	/* Elements are the packed 64-bit keys built by ext_size_value(). */
 	const uint64_t lhs = *(const uint64_t *)x;
 	const uint64_t rhs = *(const uint64_t *)y;
 
 	return (TREE_CMP(lhs, rhs));
 }
 
 /*
  * Generate ext_size_find_in_buf() for uint64_t elements compared with
  * ext_size_compare(); it is handed to zfs_btree_create() in
  * ext_size_create().
  */
 ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf, uint64_t,
     ext_size_compare)
 
 /*
  * rtop_create callback: initialize the B-tree passed as "arg" to hold
  * uint64_t keys ordered by ext_size_compare().  "rt" is unused.
  */
 static void
 ext_size_create(zfs_range_tree_t *rt, void *arg)
 {
 	zfs_btree_t *by_size = arg;
 
 	(void) rt;
 	zfs_btree_create(by_size, ext_size_compare, ext_size_find_in_buf,
 	    sizeof (uint64_t));
 }
 
 /*
  * rtop_destroy callback: tear down the B-tree passed as "arg", which is
  * asserted to be empty.  "rt" is unused.
  */
 static void
 ext_size_destroy(zfs_range_tree_t *rt, void *arg)
 {
 	zfs_btree_t *by_size = arg;
 
 	(void) rt;
 	ASSERT0(zfs_btree_numnodes(by_size));
 	zfs_btree_destroy(by_size);
 }
 
 /*
  * Build the 64-bit key stored in the size-sorted B-tree for gap segment
  * "rsg": (64 - highbit64(score)) shifted left by 56, OR'ed with the
  * segment's start offset.  "score" is rs_fill plus a term scaled by
  * fill_weight and by the ratio of rs_fill to the segment's size.
  */
 static uint64_t
-ext_size_value(zfs_range_tree_t *rt, range_seg_gap_t *rsg)
+ext_size_value(zfs_range_tree_t *rt, zfs_range_seg_gap_t *rsg)
 {
 	(void) rt;
 	uint64_t size = rsg->rs_end - rsg->rs_start;
 	/* The << 7 and >> 7 pair stands in for scaling by a percentage. */
 	uint64_t score = rsg->rs_fill + ((((rsg->rs_fill << 7) / size) *
 	    fill_weight * rsg->rs_fill) >> 7);
 	/*
 	 * NOTE(review): presumably rt_shift >= 8 is what keeps the top byte
 	 * of rs_start free for the shifted score -- confirm.
 	 */
 	ASSERT3U(rt->rt_shift, >=, 8);
 	return (((uint64_t)(64 - highbit64(score)) << 56) | rsg->rs_start);
 }
 
 /*
  * rtop_add callback: add the ext_size_value() key of segment "rs" to the
  * B-tree passed as "arg".  The range tree must be of type ZFS_RANGE_SEG_GAP.
  */
 static void
 ext_size_add(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg)
 {
 	zfs_btree_t *size_tree = arg;
 	ASSERT3U(rt->rt_type, ==, ZFS_RANGE_SEG_GAP);
-	uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs);
+	uint64_t v = ext_size_value(rt, (zfs_range_seg_gap_t *)rs);
 	zfs_btree_add(size_tree, &v);
 }
 
 /*
  * rtop_remove callback: remove the ext_size_value() key of segment "rs"
  * from the B-tree passed as "arg".  The range tree must be of type
  * ZFS_RANGE_SEG_GAP.
  */
 static void
 ext_size_remove(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg)
 {
 	zfs_btree_t *size_tree = arg;
 	ASSERT3U(rt->rt_type, ==, ZFS_RANGE_SEG_GAP);
-	uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs);
+	uint64_t v = ext_size_value(rt, (zfs_range_seg_gap_t *)rs);
 	zfs_btree_remove(size_tree, &v);
 }
 
 /*
  * rtop_vacate callback: empty and destroy the B-tree passed as "arg", then
  * re-create it through ext_size_create().
  */
 static void
 ext_size_vacate(zfs_range_tree_t *rt, void *arg)
 {
 	zfs_btree_t *by_size = arg;
 
 	zfs_btree_clear(by_size);
 	zfs_btree_destroy(by_size);
 
 	/* Start over with a freshly initialized tree. */
 	ext_size_create(rt, by_size);
 }
 
 /*
  * Range-tree callback table wiring the ext_size_* helpers above into a
  * zfs_range_tree_t.
  */
 static const zfs_range_tree_ops_t ext_size_ops = {
 	.rtop_create	= ext_size_create,
 	.rtop_destroy	= ext_size_destroy,
 	.rtop_add	= ext_size_add,
 	.rtop_remove	= ext_size_remove,
 	.rtop_vacate	= ext_size_vacate,
 };
 
 /*
  * Comparator for the q_sios_by_addr tree. Sorting is simply performed
  * based on LBA-order (from lowest to highest).
  */
 static int
 sio_addr_compare(const void *x, const void *y)
 {
 	const scan_io_t *lhs = x;
 	const scan_io_t *rhs = y;
 
 	/* Order purely by the offset recorded in each scan_io_t. */
 	return (TREE_CMP(SIO_GET_OFFSET(lhs), SIO_GET_OFFSET(rhs)));
 }
 
 /* IO queues are created on demand when they are needed. */
 static dsl_scan_io_queue_t *
 scan_io_queue_create(vdev_t *vd)
 {
 	dsl_scan_t *scan = vd->vdev_spa->spa_dsl_pool->dp_scan;
 	dsl_scan_io_queue_t *queue = kmem_zalloc(sizeof (*queue), KM_SLEEP);
 
 	/* Back-pointers and bookkeeping. */
 	queue->q_scn = scan;
 	queue->q_vd = vd;
 	queue->q_sio_memused = 0;
 	queue->q_last_ext_addr = -1;
 	cv_init(&queue->q_zio_cv, NULL, CV_DEFAULT, NULL);
 
 	/*
 	 * Extents live in a gap-type range tree; ext_size_ops keeps
 	 * q_exts_by_size in step with it.
 	 */
 	queue->q_exts_by_addr = zfs_range_tree_create_gap(&ext_size_ops,
 	    ZFS_RANGE_SEG_GAP, &queue->q_exts_by_size, 0, vd->vdev_ashift,
 	    zfs_scan_max_ext_gap);
 
 	/* Individual scan_io_t's are indexed by address in an AVL tree. */
 	avl_create(&queue->q_sios_by_addr, sio_addr_compare,
 	    sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
 
 	return (queue);
 }
 
 /*
  * Destroys a scan queue and all segments and scan_io_t's contained in it.
  * No further execution of I/O occurs, anything pending in the queue is
  * simply freed without being executed.
  */
 void
 dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
 {
 	dsl_scan_t *scn = queue->q_scn;
 	void *walk = NULL;
 	scan_io_t *victim;
 
 	/* The caller must hold the owning vdev's queue lock. */
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 
 	/* Drop this queue from scn_queues_pending if it still holds sios. */
 	if (!avl_is_empty(&queue->q_sios_by_addr))
 		atomic_add_64(&scn->scn_queues_pending, -1);
 
 	/* Free every queued sio without issuing it. */
 	victim = avl_destroy_nodes(&queue->q_sios_by_addr, &walk);
 	while (victim != NULL) {
 		ASSERT(zfs_range_tree_contains(queue->q_exts_by_addr,
 		    SIO_GET_OFFSET(victim), SIO_GET_ASIZE(victim)));
 		queue->q_sio_memused -= SIO_GET_MUSED(victim);
 		sio_free(victim);
 		victim = avl_destroy_nodes(&queue->q_sios_by_addr, &walk);
 	}
 
 	/* All per-sio memory accounting must have been returned. */
 	ASSERT0(queue->q_sio_memused);
 
 	zfs_range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
 	zfs_range_tree_destroy(queue->q_exts_by_addr);
 	avl_destroy(&queue->q_sios_by_addr);
 	cv_destroy(&queue->q_zio_cv);
 
 	kmem_free(queue, sizeof (*queue));
 }
 
 /*
  * Properly transfers a dsl_scan_io_queue_t from `svd' to `tvd'. This is
  * called on behalf of vdev_top_transfer when creating or destroying
  * a mirror vdev due to zpool attach/detach.
  */
 void
 dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd)
 {
 	dsl_scan_io_queue_t *moved;
 
 	/* Source lock first, then target; released in reverse order. */
 	mutex_enter(&svd->vdev_scan_io_queue_lock);
 	mutex_enter(&tvd->vdev_scan_io_queue_lock);
 
 	/* The target must not already own a queue. */
 	VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL);
 
 	moved = svd->vdev_scan_io_queue;
 	tvd->vdev_scan_io_queue = moved;
 	svd->vdev_scan_io_queue = NULL;
 
 	/* Re-point the queue at its new owner, if there was one to move. */
 	if (moved != NULL)
 		moved->q_vd = tvd;
 
 	mutex_exit(&tvd->vdev_scan_io_queue_lock);
 	mutex_exit(&svd->vdev_scan_io_queue_lock);
 }
 
 /*
  * Destroy the scan I/O queue (if any) of every child of the root vdev,
  * clearing each vdev's queue pointer under its queue lock.
  */
 static void
 scan_io_queues_destroy(dsl_scan_t *scn)
 {
 	vdev_t *root = scn->scn_dp->dp_spa->spa_root_vdev;
 
 	for (uint64_t c = 0; c < root->vdev_children; c++) {
 		vdev_t *top = root->vdev_child[c];
 		kmutex_t *lock = &top->vdev_scan_io_queue_lock;
 
 		mutex_enter(lock);
 		if (top->vdev_scan_io_queue != NULL)
 			dsl_scan_io_queue_destroy(top->vdev_scan_io_queue);
 		top->vdev_scan_io_queue = NULL;
 		mutex_exit(lock);
 	}
 }
 
 /*
  * Drop the queued scan I/O (if any) for DVA `dva_i' of a block that is
  * being freed, and account for it as a skipped block.
  *
  * The top-level vdev's queue pointer is sampled only after
  * vdev_scan_io_queue_lock has been taken: scan_io_queues_destroy() and
  * dsl_scan_io_queue_vdev_xfer() clear or move that pointer under the same
  * lock, so a value read beforehand could refer to a queue that has since
  * been destroyed or handed to another vdev.
  */
 static void
 dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 	vdev_t *vdev;
 	kmutex_t *q_lock;
 	dsl_scan_io_queue_t *queue;
 	scan_io_t *srch_sio, *sio;
 	avl_index_t idx;
 	uint64_t start, size;
 
 	vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i]));
 	ASSERT(vdev != NULL);
 	q_lock = &vdev->vdev_scan_io_queue_lock;
 
 	mutex_enter(q_lock);
 	/* Read the queue pointer under the lock that protects it. */
 	queue = vdev->vdev_scan_io_queue;
 	if (queue == NULL) {
 		mutex_exit(q_lock);
 		return;
 	}
 
 	/* Build a temporary search key describing this DVA. */
 	srch_sio = sio_alloc(BP_GET_NDVAS(bp));
 	bp2sio(bp, srch_sio, dva_i);
 	start = SIO_GET_OFFSET(srch_sio);
 	size = SIO_GET_ASIZE(srch_sio);
 
 	/*
 	 * We can find the zio in two states:
 	 * 1) Cold, just sitting in the queue of zio's to be issued at
 	 *	some point in the future. In this case, all we do is
 	 *	remove the zio from the q_sios_by_addr tree, decrement
 	 *	its data volume from the containing zfs_range_seg_t and
 	 *	resort the q_exts_by_size tree to reflect that the
 	 *	zfs_range_seg_t has lost some of its 'fill'. We don't shorten
 	 *	the zfs_range_seg_t - this is usually rare enough not to be
 	 *	worth the extra hassle of trying to keep track of precise
 	 *	extent boundaries.
 	 * 2) Hot, where the zio is currently in-flight in
 	 *	dsl_scan_issue_ios. In this case, we can't simply
 	 *	reach in and stop the in-flight zio's, so we instead
 	 *	block the caller. Eventually, dsl_scan_issue_ios will
 	 *	be done with issuing the zio's it gathered and will
 	 *	signal us.
 	 *
 	 * NOTE(review): nothing in this function waits; when the sio is
 	 * not found in q_sios_by_addr we simply drop the lock and return.
 	 */
 	sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
 	sio_free(srch_sio);
 
 	if (sio != NULL) {
 		blkptr_t tmpbp;
 
 		/* Got it while it was cold in the queue */
 		ASSERT3U(start, ==, SIO_GET_OFFSET(sio));
 		ASSERT3U(size, ==, SIO_GET_ASIZE(sio));
 		avl_remove(&queue->q_sios_by_addr, sio);
 		/* Removing the last sio takes the queue off the pending count. */
 		if (avl_is_empty(&queue->q_sios_by_addr))
 			atomic_add_64(&scn->scn_queues_pending, -1);
 		queue->q_sio_memused -= SIO_GET_MUSED(sio);
 
 		ASSERT(zfs_range_tree_contains(queue->q_exts_by_addr, start,
 		    size));
 		zfs_range_tree_remove_fill(queue->q_exts_by_addr, start, size);
 
 		/* count the block as though we skipped it */
 		sio2bp(sio, &tmpbp);
 		count_block_skipped(scn, &tmpbp, B_FALSE);
 
 		sio_free(sio);
 	}
 	mutex_exit(q_lock);
 }
 
 /*
  * Callback invoked when a zio_free() zio is executing. This needs to be
  * intercepted to prevent the zio from deallocating a particular portion
  * of disk space and it then getting reallocated and written to, while we
  * still have it queued up for processing.
  */
 void
 dsl_scan_freed(spa_t *spa, const blkptr_t *bp)
 {
 	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 
 	ASSERT(!BP_IS_EMBEDDED(bp));
 	ASSERT(scn != NULL);
 
 	/* Only do the per-DVA work while a scan is running. */
 	if (dsl_scan_is_running(scn)) {
 		for (int dva_i = 0; dva_i < BP_GET_NDVAS(bp); dva_i++)
 			dsl_scan_freed_dva(spa, bp, dva_i);
 	}
 }
 
 /*
  * Check if a vdev needs resilvering (non-empty DTL), if so, and resilver has
  * not started, start it. Otherwise, only restart if max txg in DTL range is
  * greater than the max txg in the current scan. If the DTL max is less than
  * the scan max, then the vdev has not missed any new data since the resilver
  * started, so a restart is not needed.
  */
 void
 dsl_scan_assess_vdev(dsl_pool_t *dp, vdev_t *vd)
 {
 	uint64_t dtl_min, dtl_max;
 
 	/* Nothing to do unless the vdev reports that it needs a resilver. */
 	if (!vdev_resilver_needed(vd, &dtl_min, &dtl_max))
 		return;
 
 	/* No resilver in progress yet: ask for one and we are done. */
 	if (!dsl_scan_resilvering(dp)) {
 		spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER);
 		return;
 	}
 
 	/*
 	 * A resilver is already running.  It only has to be restarted (or
 	 * the restart deferred, when the feature allows it) if the DTL
 	 * reaches past the max txg of the current scan.
 	 */
 	if (dtl_max > dp->dp_scan->scn_phys.scn_max_txg) {
 		if (spa_feature_is_enabled(dp->dp_spa,
 		    SPA_FEATURE_RESILVER_DEFER)) {
 			vdev_defer_resilver(vd);
 		} else {
 			spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER);
 		}
 	}
 }
 
 /*
  * Tunable registrations for the scan code.  Each ZFS_MODULE_PARAM()
  * invocation names a variable (the "zfs_" prefix plus the given suffix),
  * a type tag, the ZMOD_RW permission and a one-line description string.
  * (Presumably this is what exposes the variable as a runtime-adjustable
  * module parameter -- confirm against the macro definition.)
  */
 ZFS_MODULE_PARAM(zfs, zfs_, scan_vdev_limit, U64, ZMOD_RW,
 	"Max bytes in flight per leaf vdev for scrubs and resilvers");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scrub_min_time_ms, UINT, ZMOD_RW,
 	"Min millisecs to scrub per txg");
 
 ZFS_MODULE_PARAM(zfs, zfs_, obsolete_min_time_ms, UINT, ZMOD_RW,
 	"Min millisecs to obsolete per txg");
 
 ZFS_MODULE_PARAM(zfs, zfs_, free_min_time_ms, UINT, ZMOD_RW,
 	"Min millisecs to free per txg");
 
 ZFS_MODULE_PARAM(zfs, zfs_, resilver_min_time_ms, UINT, ZMOD_RW,
 	"Min millisecs to resilver per txg");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_suspend_progress, INT, ZMOD_RW,
 	"Set to prevent scans from progressing");
 
 ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_io, INT, ZMOD_RW,
 	"Set to disable scrub I/O");
 
 ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_prefetch, INT, ZMOD_RW,
 	"Set to disable scrub prefetching");
 
 ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, U64, ZMOD_RW,
 	"Max number of blocks freed in one txg");
 
 ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, U64, ZMOD_RW,
 	"Max number of dedup blocks freed in one txg");
 
 ZFS_MODULE_PARAM(zfs, zfs_, free_bpobj_enabled, INT, ZMOD_RW,
 	"Enable processing of the free_bpobj");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_blkstats, INT, ZMOD_RW,
 	"Enable block statistics calculation during scrub");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_fact, UINT, ZMOD_RW,
 	"Fraction of RAM for scan hard limit");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_issue_strategy, UINT, ZMOD_RW,
 	"IO issuing strategy during scrubbing. 0 = default, 1 = LBA, 2 = size");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_legacy, INT, ZMOD_RW,
 	"Scrub using legacy non-sequential method");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_checkpoint_intval, UINT, ZMOD_RW,
 	"Scan progress on-disk checkpointing interval");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_max_ext_gap, U64, ZMOD_RW,
 	"Max gap in bytes between sequential scrub / resilver I/Os");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_soft_fact, UINT, ZMOD_RW,
 	"Fraction of hard limit used as soft limit");
 
 /*
  * NOTE(review): this description does not obviously match the parameter
  * name (scan_strict_mem_lim) -- confirm the wording is intended.
  */
 ZFS_MODULE_PARAM(zfs, zfs_, scan_strict_mem_lim, INT, ZMOD_RW,
 	"Tunable to attempt to reduce lock contention");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, UINT, ZMOD_RW,
 	"Tunable to adjust bias towards more filled segments during scans");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW,
 	"Tunable to report resilver performance over the last N txgs");
 
 ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW,
 	"Process all resilvers immediately");
 
 ZFS_MODULE_PARAM(zfs, zfs_, resilver_defer_percent, UINT, ZMOD_RW,
 	"Issued IO percent complete after which resilvers are deferred");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scrub_error_blocks_per_txg, UINT, ZMOD_RW,
 	"Error blocks to be scrubbed in one txg");
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 10546798824a..e3c9afbd6e41 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -1,6302 +1,6302 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/space_map.h>
 #include <sys/metaslab_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_draid.h>
 #include <sys/zio.h>
 #include <sys/spa_impl.h>
 #include <sys/zfeature.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/zap.h>
 #include <sys/btree.h>
 
 /*
  * Non-zero when `flags' has METASLAB_GANG_CHILD and/or METASLAB_GANG_HEADER
  * set.
  */
 #define	GANG_ALLOCATION(flags) \
 	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
 
 /*
  * Metaslab granularity, in bytes. This is roughly similar to what would be
  * referred to as the "stripe size" in traditional RAID arrays. In normal
  * operation, we will try to write this amount of data to each disk before
  * moving on to the next top-level vdev.
  */
 static uint64_t metaslab_aliquot = 1024 * 1024;
 
 /*
  * For testing, make some blocks above a certain size be gang blocks.
  */
 uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
 
 /*
  * Of blocks of size >= metaslab_force_ganging, actually gang them this often.
  */
 uint_t metaslab_force_ganging_pct = 3;
 
 /*
  * In pools where the log space map feature is not enabled we touch
  * multiple metaslabs (and their respective space maps) with each
  * transaction group. Thus, we benefit from having a small space map
  * block size since it allows us to issue more I/O operations scattered
  * around the disk. So a sane default for the space map block size
  * is 8~16K.
  */
 int zfs_metaslab_sm_blksz_no_log = (1 << 14);
 
 /*
  * When the log space map feature is enabled, we accumulate a lot of
  * changes per metaslab that are flushed once in a while so we benefit
  * from a bigger block size like 128K for the metaslab space maps.
  */
 int zfs_metaslab_sm_blksz_with_log = (1 << 17);
 
 /*
  * The in-core space map representation is more compact than its on-disk form.
  * The zfs_condense_pct determines how much more compact the in-core
  * space map representation must be before we compact it on-disk.
  * Values should be greater than or equal to 100.
  */
 uint_t zfs_condense_pct = 200;
 
 /*
  * Condensing a metaslab is not guaranteed to actually reduce the amount of
  * space used on disk. In particular, a space map uses data in increments of
  * MAX(1 << ashift, space_map_blksz), so a metaslab might use the
  * same number of blocks after condensing. Since the goal of condensing is to
  * reduce the number of IOPs required to read the space map, we only want to
  * condense when we can be sure we will reduce the number of blocks used by the
  * space map. Unfortunately, we cannot precisely compute whether or not this is
  * the case in metaslab_should_condense since we are holding ms_lock. Instead,
  * we apply the following heuristic: do not condense a spacemap unless the
  * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
  * blocks.
  */
 static const int zfs_metaslab_condense_block_threshold = 4;
 
 /*
  * The zfs_mg_noalloc_threshold defines which metaslab groups should
  * be eligible for allocation. The value is defined as a percentage of
  * free space. Metaslab groups that have more free space than
  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
  * a metaslab group's free space is less than or equal to the
  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
  * groups are allowed to accept allocations. Gang blocks are always
  * eligible to allocate on any metaslab group. The default value of 0 means
  * no metaslab group will be excluded based on this criterion.
  */
 static uint_t zfs_mg_noalloc_threshold = 0;
 
 /*
  * Metaslab groups are considered eligible for allocations if their
  * fragmentation metric (measured as a percentage) is less than or
  * equal to zfs_mg_fragmentation_threshold. If a metaslab group
  * exceeds this threshold then it will be skipped unless all metaslab
  * groups within the metaslab class have also crossed this threshold.
  *
  * This tunable was introduced to avoid edge cases where we continue
  * allocating from very fragmented disks in our pool while other, less
  * fragmented disks exist. On the other hand, if all disks in the
  * pool are uniformly approaching the threshold, the threshold can
  * be a speed bump in performance, where we keep switching the disks
  * that we allocate from (e.g. we allocate some segments from disk A,
  * pushing it past the threshold, while freeing segments from disk
  * B, bringing its fragmentation below the threshold).
  *
  * Empirically, we've seen that our vdev selection for allocations is
  * good enough that fragmentation increases uniformly across all vdevs
  * the majority of the time. Thus we set the threshold percentage high
  * enough to avoid hitting the speed bump on pools that are being pushed
  * to the edge.
  */
 static uint_t zfs_mg_fragmentation_threshold = 95;
 
 /*
  * Allow metaslabs to keep their active state as long as their fragmentation
  * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
  * active metaslab that exceeds this threshold will no longer keep its active
  * status allowing better metaslabs to be selected.
  */
 static uint_t zfs_metaslab_fragmentation_threshold = 77;
 
 /*
  * When set will load all metaslabs when pool is first opened.
  */
 int metaslab_debug_load = B_FALSE;
 
 /*
  * When set will prevent metaslabs from being unloaded.
  */
 static int metaslab_debug_unload = B_FALSE;
 
 /*
  * Minimum size which forces the dynamic allocator to change
  * its allocation strategy.  Once the space map cannot satisfy
  * an allocation of this size then it switches to using a more
  * aggressive strategy (i.e. search by size rather than offset).
  */
 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
 
 /*
  * The minimum free space, in percent, which must be available
  * in a space map to continue allocations in a first-fit fashion.
  * Once the space map's free space drops below this level we dynamically
  * switch to using best-fit allocations.
  */
 uint_t metaslab_df_free_pct = 4;
 
 /*
  * Maximum distance to search forward from the last offset. Without this
  * limit, fragmented pools can see >100,000 iterations and
  * metaslab_block_picker() becomes the performance limiting factor on
  * high-performance storage.
  *
  * With the default setting of 16MB, we typically see less than 500
  * iterations, even with very fragmented, ashift=9 pools. The maximum number
  * of iterations possible is:
  *     metaslab_df_max_search / (2 * (1<<ashift))
  * With the default setting of 16MB this is 16*1024 (with ashift=9) or
  * 2048 (with ashift=12).
  */
 static uint_t metaslab_df_max_search = 16 * 1024 * 1024;
 
 /*
  * Forces the metaslab_block_picker function to search for at least this many
  * segments forwards until giving up on finding a segment that the allocation
  * will fit into.
  */
 static const uint32_t metaslab_min_search_count = 100;
 
 /*
  * If we are not searching forward (due to metaslab_df_max_search,
  * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable
  * controls what segment is used.  If it is set, we will use the largest free
  * segment.  If it is not set, we will use a segment of exactly the requested
  * size (or larger).
  */
 static int metaslab_df_use_largest_segment = B_FALSE;
 
 /*
  * These tunables control how long a metaslab will remain loaded after the
  * last allocation from it.  A metaslab can't be unloaded until at least
  * metaslab_unload_delay TXG's and metaslab_unload_delay_ms milliseconds
  * have elapsed.  However, zfs_metaslab_mem_limit may cause it to be
  * unloaded sooner.  These settings are intended to be generous -- to keep
  * metaslabs loaded for a long time, reducing the rate of metaslab loading.
  */
 static uint_t metaslab_unload_delay = 32;
 static uint_t metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */
 
 /*
  * Max number of metaslabs per group to preload.
  */
 uint_t metaslab_preload_limit = 10;
 
 /*
  * Enable/disable preloading of metaslab.
  */
 static int metaslab_preload_enabled = B_TRUE;
 
 /*
  * Enable/disable fragmentation weighting on metaslabs.
  */
 static int metaslab_fragmentation_factor_enabled = B_TRUE;
 
 /*
  * Enable/disable lba weighting (i.e. outer tracks are given preference).
  */
 static int metaslab_lba_weighting_enabled = B_TRUE;
 
 /*
  * Enable/disable metaslab group biasing.
  */
 static int metaslab_bias_enabled = B_TRUE;
 
 /*
  * Enable/disable remapping of indirect DVAs to their concrete vdevs.
  */
 static const boolean_t zfs_remap_blkptr_enable = B_TRUE;
 
 /*
  * Enable/disable segment-based metaslab selection.
  */
 static int zfs_metaslab_segment_weight_enabled = B_TRUE;
 
 /*
  * When using segment-based metaslab selection, we will continue
  * allocating from the active metaslab until we have exhausted
  * zfs_metaslab_switch_threshold of its buckets.
  */
 static int zfs_metaslab_switch_threshold = 2;
 
 /*
  * Internal switch to enable/disable the metaslab allocation tracing
  * facility.
  */
 static const boolean_t metaslab_trace_enabled = B_FALSE;
 
 /*
  * Maximum entries that the metaslab allocation tracing facility will keep
  * in a given list when running in non-debug mode. We limit the number
  * of entries in non-debug mode to prevent us from using up too much memory.
  * The limit should be sufficiently large that we don't expect any allocation
  * to ever exceed this value. In debug mode, the system will panic if this
  * limit is ever reached allowing for further investigation.
  */
 static const uint64_t metaslab_trace_max_entries = 5000;
 
 /*
  * Maximum number of metaslabs per group that can be disabled
  * simultaneously.
  */
 static const int max_disabled_ms = 3;
 
 /*
  * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
  * To avoid 64-bit overflow, don't set above UINT32_MAX.
  */
 static uint64_t zfs_metaslab_max_size_cache_sec = 1 * 60 * 60; /* 1 hour */
 
 /*
  * Maximum percentage of memory to use on storing loaded metaslabs. If loading
  * a metaslab would take it over this percentage, the oldest selected metaslab
  * is automatically unloaded.
  */
 static uint_t zfs_metaslab_mem_limit = 25;
 
 /*
  * Force the per-metaslab range trees to use 64-bit integers to store
  * segments. Used for debugging purposes.
  */
 static const boolean_t zfs_metaslab_force_large_segs = B_FALSE;
 
 /*
  * By default we only store segments over a certain size in the size-sorted
  * metaslab trees (ms_allocatable_by_size and
  * ms_unflushed_frees_by_size). This dramatically reduces memory usage and
  * improves load and unload times at the cost of causing us to use slightly
  * larger segments than we would otherwise in some cases.
  */
 static const uint32_t metaslab_by_size_min_shift = 14;
 
 /*
  * If not set, we will first try normal allocation.  If that fails then
  * we will do a gang allocation.  If that fails then we will do a "try hard"
  * gang allocation.  If that fails then we will have a multi-layer gang
  * block.
  *
  * If set, we will first try normal allocation.  If that fails then
  * we will do a "try hard" allocation.  If that fails we will do a gang
  * allocation.  If that fails we will do a "try hard" gang allocation.  If
  * that fails then we will have a multi-layer gang block.
  */
 static int zfs_metaslab_try_hard_before_gang = B_FALSE;
 
 /*
  * When not trying hard, we only consider the best zfs_metaslab_find_max_tries
  * metaslabs.  This improves performance, especially when there are many
  * metaslabs per vdev and the allocation can't actually be satisfied (so we
  * would otherwise iterate all the metaslabs).  If there is a metaslab with a
  * worse weight but it can actually satisfy the allocation, we won't find it
  * until trying hard.  This may happen if the worse metaslab is not loaded
  * (and the true weight is better than we have calculated), or due to weight
  * bucketization.  E.g. we are looking for a 60K segment, and the best
  * metaslabs all have free segments in the 32-63K bucket, but the best
  * zfs_metaslab_find_max_tries metaslabs have ms_max_size <60KB, and a
  * subsequent metaslab has ms_max_size >60KB (but fewer segments in this
  * bucket, and therefore a lower weight).
  */
 static uint_t zfs_metaslab_find_max_tries = 100;
 
 /* Prototypes for static helpers that are used before their definitions. */
 static uint64_t metaslab_weight(metaslab_t *, boolean_t);
 static void metaslab_set_fragmentation(metaslab_t *, boolean_t);
 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
 
 static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
 static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
 static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
 static unsigned int metaslab_idx_func(multilist_t *, void *);
 static void metaslab_evict(metaslab_t *, uint64_t);
 static void metaslab_rt_add(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
     void *arg);
 /*
  * kmem cache sized for metaslab_alloc_trace_t; created in
  * metaslab_stat_init() and destroyed in metaslab_stat_fini().
  */
 kmem_cache_t *metaslab_alloc_trace_cache;
 
 /*
  * Named kstat counters for the metaslab code; incremented through
  * METASLABSTAT_BUMP() and published by metaslab_stat_init().
  */
 typedef struct metaslab_stats {
 	kstat_named_t metaslabstat_trace_over_limit;
 	kstat_named_t metaslabstat_reload_tree;
 	kstat_named_t metaslabstat_too_many_tries;
 	kstat_named_t metaslabstat_try_hard;
 } metaslab_stats_t;
 
 /*
  * The entries are positional, so they must stay in the same order as the
  * fields of metaslab_stats_t.  Every counter is a KSTAT_DATA_UINT64.
  */
 static metaslab_stats_t metaslab_stats = {
 	{ "trace_over_limit",		KSTAT_DATA_UINT64 },
 	{ "reload_tree",		KSTAT_DATA_UINT64 },
 	{ "too_many_tries",		KSTAT_DATA_UINT64 },
 	{ "try_hard",			KSTAT_DATA_UINT64 },
 };
 
 /*
  * Atomically increment the named counter in metaslab_stats.
  *
  * NOTE(review): the expansion already ends in ';', so the usual
  * `METASLABSTAT_BUMP(x);' expands to two statements -- keep it out of
  * unbraced if/else bodies.
  */
 #define	METASLABSTAT_BUMP(stat) \
 	atomic_inc_64(&metaslab_stats.stat.value.ui64);
 
 
 /* kstat handle set by metaslab_stat_init(); NULL when not installed. */
 static kstat_t *metaslab_ksp;
 
 void
 metaslab_stat_init(void)
 {
 	ASSERT(metaslab_alloc_trace_cache == NULL);
 	metaslab_alloc_trace_cache = kmem_cache_create(
 	    "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
 	    0, NULL, NULL, NULL, NULL, NULL, 0);
 	metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats",
 	    "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) /
 	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 	if (metaslab_ksp != NULL) {
 		metaslab_ksp->ks_data = &metaslab_stats;
 		kstat_install(metaslab_ksp);
 	}
 }
 
 void
 metaslab_stat_fini(void)
 {
 	if (metaslab_ksp != NULL) {
 		kstat_delete(metaslab_ksp);
 		metaslab_ksp = NULL;
 	}
 
 	kmem_cache_destroy(metaslab_alloc_trace_cache);
 	metaslab_alloc_trace_cache = NULL;
 }
 
 /*
  * ==========================================================================
  * Metaslab classes
  * ==========================================================================
  */
 metaslab_class_t *
 metaslab_class_create(spa_t *spa, const metaslab_ops_t *ops)
 {
 	/* The class ends in one mc_allocator[] slot per spa allocator. */
 	metaslab_class_t *mc = kmem_zalloc(offsetof(metaslab_class_t,
 	    mc_allocator[spa->spa_alloc_count]), KM_SLEEP);
 
 	mc->mc_spa = spa;
 	mc->mc_ops = ops;
 	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
 	multilist_create(&mc->mc_metaslab_txg_list, sizeof (metaslab_t),
 	    offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
 
 	/* Each allocator starts with no rotor and a fresh slot refcount. */
 	for (int a = 0; a < spa->spa_alloc_count; a++) {
 		metaslab_class_allocator_t *slot = &mc->mc_allocator[a];
 
 		slot->mca_rotor = NULL;
 		zfs_refcount_create_tracked(&slot->mca_alloc_slots);
 	}
 
 	return (mc);
 }
 
 /*
  * Tear down a class built by metaslab_class_create().  All space
  * accounting must already be zero and every allocator rotor cleared.
  */
 void
 metaslab_class_destroy(metaslab_class_t *mc)
 {
 	spa_t *spa = mc->mc_spa;
 
 	ASSERT(mc->mc_alloc == 0);
 	ASSERT(mc->mc_deferred == 0);
 	ASSERT(mc->mc_space == 0);
 	ASSERT(mc->mc_dspace == 0);
 
 	for (int a = 0; a < spa->spa_alloc_count; a++) {
 		metaslab_class_allocator_t *slot = &mc->mc_allocator[a];
 
 		ASSERT(slot->mca_rotor == NULL);
 		zfs_refcount_destroy(&slot->mca_alloc_slots);
 	}
 
 	mutex_destroy(&mc->mc_lock);
 	multilist_destroy(&mc->mc_metaslab_txg_list);
 
 	/* Free with the same variable-length size used at creation. */
 	kmem_free(mc, offsetof(metaslab_class_t,
 	    mc_allocator[spa->spa_alloc_count]));
 }
 
 /*
  * Walk the group ring hanging off allocator 0's rotor and assert that
  * every group is consistent with its vdev and with this class.  Always
  * returns 0; the checks are assertions only.
  */
 int
 metaslab_class_validate(metaslab_class_t *mc)
 {
 	metaslab_group_t *mg;
 	vdev_t *vd;
 
 	/*
 	 * Must hold one of the spa_config locks.
 	 */
 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
 	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
 
 	mg = mc->mc_allocator[0].mca_rotor;
 	if (mg == NULL)
 		return (0);
 
 	/* The ring is circular: stop once we are back at the rotor. */
 	do {
 		vd = mg->mg_vd;
 		ASSERT(vd->vdev_mg != NULL);
 		ASSERT3P(vd->vdev_top, ==, vd);
 		ASSERT3P(mg->mg_class, ==, mc);
 		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
 		mg = mg->mg_next;
 	} while (mg != mc->mc_allocator[0].mca_rotor);
 
 	return (0);
 }
 
 /*
  * Apply signed deltas to the class's four space counters (mc_alloc,
  * mc_deferred, mc_space, mc_dspace).  Each counter is updated with its own
  * atomic add; the four updates are not atomic as a group.
  */
 static void
 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
 {
 	atomic_add_64(&mc->mc_alloc, alloc_delta);
 	atomic_add_64(&mc->mc_deferred, defer_delta);
 	atomic_add_64(&mc->mc_space, space_delta);
 	atomic_add_64(&mc->mc_dspace, dspace_delta);
 }
 
 /* Return the class's current mc_alloc counter (plain, unlocked read). */
 uint64_t
 metaslab_class_get_alloc(metaslab_class_t *mc)
 {
 	return (mc->mc_alloc);
 }
 
 /* Return the class's current mc_deferred counter (plain, unlocked read). */
 uint64_t
 metaslab_class_get_deferred(metaslab_class_t *mc)
 {
 	return (mc->mc_deferred);
 }
 
 /* Return the class's current mc_space counter (plain, unlocked read). */
 uint64_t
 metaslab_class_get_space(metaslab_class_t *mc)
 {
 	return (mc->mc_space);
 }
 
 /*
  * Return mc_dspace when spa_deflate() is set for the pool, otherwise
  * fall back to mc_space.
  */
 uint64_t
 metaslab_class_get_dspace(metaslab_class_t *mc)
 {
 	if (spa_deflate(mc->mc_spa))
 		return (mc->mc_dspace);
 
 	return (mc->mc_space);
 }
 
 /*
  * Debug check: when ZFS_DEBUG_HISTOGRAM_VERIFY is set in zfs_flags, sum the
  * mg_histogram of every metaslab group that belongs to this class and
  * VERIFY that each bucket equals the class's own mc_histogram.
  */
 void
 metaslab_class_histogram_verify(metaslab_class_t *mc)
 {
 	spa_t *spa = mc->mc_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t *mc_hist;
 	int i;
 
 	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 		return;
 
 	/* Zero-filled scratch histogram, one slot per bucket. */
-	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
+	mc_hist = kmem_zalloc(sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE,
 	    KM_SLEEP);
 
 	/* mc_lock is held across both the accumulation and the comparison. */
 	mutex_enter(&mc->mc_lock);
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		metaslab_group_t *mg = vdev_get_mg(tvd, mc);
 
 		/*
 		 * Skip any holes, uninitialized top-levels, or
 		 * vdevs that are not in this metaslab class.
 		 */
 		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 		    mg->mg_class != mc) {
 			continue;
 		}
 
 		/* A vdev's log group may only appear in the embedded log class. */
 		IMPLY(mg == mg->mg_vd->vdev_log_mg,
 		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
 
-		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+		for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++)
 			mc_hist[i] += mg->mg_histogram[i];
 	}
 
-	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+	for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) {
 		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
 	}
 
 	mutex_exit(&mc->mc_lock);
-	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
+	kmem_free(mc_hist, sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE);
 }
 
 /*
  * Calculate the metaslab class's fragmentation metric. The metric
  * is weighted based on the space contribution of each metaslab group.
  * The return value will be a number between 0 and 100 (inclusive), or
  * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
  * zfs_frag_table for more information about the metric.
  */
 uint64_t
 metaslab_class_fragmentation(metaslab_class_t *mc)
 {
 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 	uint64_t fragmentation = 0;
 	uint64_t class_space;
 
 	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		metaslab_group_t *mg = tvd->vdev_mg;
 
 		/*
 		 * Skip any holes, uninitialized top-levels,
 		 * or vdevs that are not in this metaslab class.
 		 */
 		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 		    mg->mg_class != mc) {
 			continue;
 		}
 
 		/*
 		 * If a metaslab group does not contain a fragmentation
 		 * metric then just bail out.
 		 */
 		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
 			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 			return (ZFS_FRAG_INVALID);
 		}
 
 		/*
 		 * Determine how much this metaslab_group is contributing
 		 * to the overall pool fragmentation metric.
 		 */
 		fragmentation += mg->mg_fragmentation *
 		    metaslab_group_get_space(mg);
 	}
 
 	/*
 	 * A class with no space has nothing to weight the metric by;
 	 * report it as unset rather than dividing by zero.
 	 */
 	class_space = metaslab_class_get_space(mc);
 	if (class_space == 0) {
 		spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 		return (ZFS_FRAG_INVALID);
 	}
 	fragmentation /= class_space;
 
 	ASSERT3U(fragmentation, <=, 100);
 	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 	return (fragmentation);
 }
 
 /*
  * Calculate the amount of expandable space that is available in
  * this metaslab class. If a device is expanded then its expandable
  * space will be the amount of allocatable space that is currently not
  * part of this metaslab class.
  */
 uint64_t
 metaslab_class_expandable_space(metaslab_class_t *mc)
 {
 	vdev_t *root = mc->mc_spa->spa_root_vdev;
 	uint64_t total = 0;
 
 	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 	for (int c = 0; c < root->vdev_children; c++) {
 		vdev_t *tvd = root->vdev_child[c];
 		metaslab_group_t *mg = tvd->vdev_mg;
 
 		/*
 		 * Only concrete, initialized top-levels that belong to this
 		 * class contribute.  The not-yet-used part of each vdev is
 		 * rounded down to a whole number of metaslabs, since that
 		 * is the unit of expansion.
 		 */
 		if (vdev_is_concrete(tvd) && tvd->vdev_ms_shift != 0 &&
 		    mg->mg_class == mc) {
 			total += P2ALIGN_TYPED(
 			    tvd->vdev_max_asize - tvd->vdev_asize,
 			    1ULL << tvd->vdev_ms_shift, uint64_t);
 		}
 	}
 	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 
 	return (total);
 }
 
 /*
  * Walk every sublist of the class's mc_metaslab_txg_list and evict the
  * metaslabs that have not been selected recently. A metaslab is evicted
  * when all of the following hold:
  *   - txg is past ms_selected_txg + metaslab_unload_delay,
  *   - the current time is past ms_selected_time +
  *     metaslab_unload_delay_ms, and
  *   - it has no allocator (ms_allocator == -1) or preloading is disabled.
  * The walk of a sublist stops at the first metaslab that fails this test.
  */
 void
 metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
 {
 	multilist_t *ml = &mc->mc_metaslab_txg_list;
 	/* Sample the clock once so every metaslab is judged against it. */
 	hrtime_t now = gethrtime();
 	for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
 		/*
 		 * Fetch the head under the sublist lock, then release that
 		 * lock before ms_lock is taken below.
 		 */
 		multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
 		metaslab_t *msp = multilist_sublist_head(mls);
 		multilist_sublist_unlock(mls);
 		while (msp != NULL) {
 			mutex_enter(&msp->ms_lock);
 
 			/*
 			 * If the metaslab has been removed from the list
 			 * (which could happen if we were at the memory limit
 			 * and it was evicted during this loop), then we can't
 			 * proceed and we should restart the sublist.
 			 */
 			if (!multilist_link_active(&msp->ms_class_txg_node)) {
 				mutex_exit(&msp->ms_lock);
 				/* Undo the for-loop's i++ to revisit list i. */
 				i--;
 				break;
 			}
 			/*
 			 * Look up the successor now, while msp is known to
 			 * still be linked; the sublist lock is only held for
 			 * the duration of the lookup.
 			 */
 			mls = multilist_sublist_lock_idx(ml, i);
 			metaslab_t *next_msp = multilist_sublist_next(mls, msp);
 			multilist_sublist_unlock(mls);
 			if (txg >
 			    msp->ms_selected_txg + metaslab_unload_delay &&
 			    now > msp->ms_selected_time +
 			    MSEC2NSEC(metaslab_unload_delay_ms) &&
 			    (msp->ms_allocator == -1 ||
 			    !metaslab_preload_enabled)) {
 				metaslab_evict(msp, txg);
 			} else {
 				/*
 				 * Once we've hit a metaslab selected too
 				 * recently to evict, we're done evicting for
 				 * now.
 				 */
 				mutex_exit(&msp->ms_lock);
 				break;
 			}
 			mutex_exit(&msp->ms_lock);
 			msp = next_msp;
 		}
 	}
 }
 
 /*
  * AVL comparator for a metaslab group's mg_metaslab_tree. Orders by
  * activation rank, then by descending weight, then by starting offset.
  */
 static int
 metaslab_compare(const void *x1, const void *x2)
 {
 	const metaslab_t *m1 = x1;
 	const metaslab_t *m2 = x2;
 
 	/*
 	 * Rank 0 is an inactive metaslab (no allocator), rank 1 an active
 	 * primary and rank 2 an active secondary. Inactive metaslabs sort
 	 * first because that is where an allocator searches once its own
 	 * primary and secondary cannot satisfy a request; only if that
 	 * search fails does it steal an active metaslab from another
 	 * allocator.
 	 */
 	int rank1 = 0;
 	int rank2 = 0;
 
 	if (m1->ms_allocator != -1)
 		rank1 = m1->ms_primary ? 1 : 2;
 	if (m2->ms_allocator != -1)
 		rank2 = m2->ms_primary ? 1 : 2;
 
 	if (rank1 != rank2)
 		return (rank1 < rank2 ? -1 : 1);
 
 	/* Arguments are swapped so that heavier metaslabs sort earlier. */
 	int weight_cmp = TREE_CMP(m2->ms_weight, m1->ms_weight);
 	if (likely(weight_cmp))
 		return (weight_cmp);
 
 	/* Two distinct metaslabs are not expected to share ms_start. */
 	IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
 
 	return (TREE_CMP(m1->ms_start, m2->ms_start));
 }
 
 /*
  * ==========================================================================
  * Metaslab groups
  * ==========================================================================
  */
 /*
  * Update the allocatable flag and the metaslab group's free capacity.
  * The allocatable flag is set to true if the group is activated, its
  * free capacity is above zfs_mg_noalloc_threshold, and its fragmentation
  * is either unknown (ZFS_FRAG_INVALID) or no greater than
  * zfs_mg_fragmentation_threshold. If a metaslab group transitions from
  * allocatable to non-allocatable or vice versa then the metaslab group's
  * class is updated to reflect the transition.
  */
 static void
 metaslab_group_alloc_update(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
 	metaslab_class_t *mc = mg->mg_class;
 	vdev_stat_t *vs = &vd->vdev_stat;
 	boolean_t was_allocatable;
 	boolean_t was_initialized;
 
 	ASSERT(vd == vd->vdev_top);
 	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
 	    SCL_ALLOC);
 
 	mutex_enter(&mg->mg_lock);
 	/* Remember the previous state so transitions can be detected. */
 	was_allocatable = mg->mg_allocatable;
 	was_initialized = mg->mg_initialized;
 
 	/*
 	 * Free capacity as a percentage of the vdev's space. The "+ 1" in
 	 * the divisor keeps it non-zero when vs_space is 0.
 	 */
 	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
 	    (vs->vs_space + 1);
 
 	/* mc_lock nests inside mg_lock; it covers the class counters. */
 	mutex_enter(&mc->mc_lock);
 
 	/*
 	 * If the metaslab group was just added then it won't
 	 * have any space until we finish syncing out this txg.
 	 * At that point we will consider it initialized and available
 	 * for allocations.  We also don't consider non-activated
 	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
 	 * to be initialized, because they can't be used for allocation.
 	 */
 	mg->mg_initialized = metaslab_group_initialized(mg);
 	if (!was_initialized && mg->mg_initialized) {
 		mc->mc_groups++;
 	} else if (was_initialized && !mg->mg_initialized) {
 		ASSERT3U(mc->mc_groups, >, 0);
 		mc->mc_groups--;
 	}
 	/* An initialized group gets a fresh chance at allocations. */
 	if (mg->mg_initialized)
 		mg->mg_no_free_space = B_FALSE;
 
 	/*
 	 * A metaslab group is considered allocatable if it has plenty
 	 * of free space or is not heavily fragmented. We only take
 	 * fragmentation into account if the metaslab group has a valid
 	 * fragmentation metric (i.e. a value between 0 and 100).
 	 */
 	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
 	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
 	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
 	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
 
 	/*
 	 * The mc_alloc_groups maintains a count of the number of
 	 * groups in this metaslab class that are still above the
 	 * zfs_mg_noalloc_threshold. This is used by the allocating
 	 * threads to determine if they should avoid allocations to
 	 * a given group. The allocator will avoid allocations to a group
 	 * if that group has reached or is below the zfs_mg_noalloc_threshold
 	 * and there are still other groups that are above the threshold.
 	 * When a group transitions from allocatable to non-allocatable or
 	 * vice versa we update the metaslab class to reflect that change.
 	 * When the mc_alloc_groups value drops to 0 that means that all
 	 * groups have reached the zfs_mg_noalloc_threshold making all groups
 	 * eligible for allocations. This effectively means that all devices
 	 * are balanced again.
 	 */
 	if (was_allocatable && !mg->mg_allocatable)
 		mc->mc_alloc_groups--;
 	else if (!was_allocatable && mg->mg_allocatable)
 		mc->mc_alloc_groups++;
 	mutex_exit(&mc->mc_lock);
 
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Comparator that orders metaslabs by ms_unflushed_txg, breaking ties
  * first by the id of the owning vdev and then by the metaslab id.
  */
 int
 metaslab_sort_by_flushed(const void *va, const void *vb)
 {
 	const metaslab_t *lhs = va;
 	const metaslab_t *rhs = vb;
 
 	int order = TREE_CMP(lhs->ms_unflushed_txg, rhs->ms_unflushed_txg);
 	if (likely(order))
 		return (order);
 
 	/* Same unflushed txg: fall back to the owning vdev's id. */
 	order = TREE_CMP(lhs->ms_group->mg_vd->vdev_id,
 	    rhs->ms_group->mg_vd->vdev_id);
 	if (order != 0)
 		return (order);
 
 	/* Same vdev as well: the metaslab id decides. */
 	return (TREE_CMP(lhs->ms_id, rhs->ms_id));
 }
 
 /*
  * Allocate and initialize a metaslab group for vdev vd in class mc, with
  * room for "allocators" per-allocator slots. The group starts out
  * inactive, uninitialized and flagged as having no free space.
  */
 metaslab_group_t *
 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
 {
 	/*
 	 * The per-allocator state is a trailing array, so the allocation
 	 * is sized to end right after the last requested element.
 	 */
 	metaslab_group_t *group = kmem_zalloc(offsetof(metaslab_group_t,
 	    mg_allocator[allocators]), KM_SLEEP);
 
 	group->mg_class = mc;
 	group->mg_vd = vd;
 	group->mg_allocators = allocators;
 	group->mg_activation_count = 0;
 	group->mg_initialized = B_FALSE;
 	group->mg_no_free_space = B_TRUE;
 
 	mutex_init(&group->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&group->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&group->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
 	avl_create(&group->mg_metaslab_tree, metaslab_compare,
 	    sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node));
 
 	/* Every allocator slot gets its own tracked queue-depth refcount. */
 	for (int a = 0; a < allocators; a++) {
 		zfs_refcount_create_tracked(
 		    &group->mg_allocator[a].mga_alloc_queue_depth);
 	}
 
 	return (group);
 }
 
 /*
  * Tear down a metaslab group created by metaslab_group_create(). The
  * group must already be unlinked from the rotor and not be active.
  */
 void
 metaslab_group_destroy(metaslab_group_t *mg)
 {
 	ASSERT(mg->mg_prev == NULL);
 	ASSERT(mg->mg_next == NULL);
 	/*
 	 * The activation count may be negative here: either the group was
 	 * never activated, or it has been passivated for good (possibly
 	 * as part of removing the vdev).
 	 */
 	ASSERT(mg->mg_activation_count <= 0);
 
 	avl_destroy(&mg->mg_metaslab_tree);
 	mutex_destroy(&mg->mg_lock);
 	mutex_destroy(&mg->mg_ms_disabled_lock);
 	cv_destroy(&mg->mg_ms_disabled_cv);
 
 	for (int a = 0; a < mg->mg_allocators; a++)
 		zfs_refcount_destroy(&mg->mg_allocator[a].mga_alloc_queue_depth);
 
 	/* The size must match what metaslab_group_create() allocated. */
 	kmem_free(mg, offsetof(metaslab_group_t,
 	    mg_allocator[mg->mg_allocators]));
 }
 
 /*
  * Bump the group's activation count and, once it becomes positive, link
  * the group into its class's circular rotor list and point the
  * allocator rotors at it. The caller must hold SCL_ALLOC as writer.
  */
 void
 metaslab_group_activate(metaslab_group_t *mg)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	spa_t *spa = mc->mc_spa;
 
 	ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0);
 
 	ASSERT(mg->mg_prev == NULL);
 	ASSERT(mg->mg_next == NULL);
 	ASSERT(mg->mg_activation_count <= 0);
 
 	/* Still not positive after the bump: the group stays inactive. */
 	mg->mg_activation_count++;
 	if (mg->mg_activation_count <= 0)
 		return;
 
 	/* Scale the aliquot by the data-disk count (at least 1). */
 	mg->mg_aliquot = metaslab_aliquot * MAX(1,
 	    vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd));
 	metaslab_group_alloc_update(mg);
 
 	metaslab_group_t *before = mc->mc_allocator[0].mca_rotor;
 	if (before != NULL) {
 		/* Splice the new group in right after allocator 0's rotor. */
 		metaslab_group_t *after = before->mg_next;
 
 		mg->mg_prev = before;
 		mg->mg_next = after;
 		before->mg_next = mg;
 		after->mg_prev = mg;
 	} else {
 		/* First group in the class: a one-element ring. */
 		mg->mg_prev = mg;
 		mg->mg_next = mg;
 	}
 
 	/*
 	 * Hand each allocator a rotor, starting at the new group and
 	 * stepping one group further around the ring per allocator.
 	 */
 	metaslab_group_t *cursor = mg;
 	for (int a = 0; a < spa->spa_alloc_count; a++) {
 		mc->mc_allocator[a].mca_rotor = cursor;
 		cursor = cursor->mg_next;
 	}
 }
 
 /*
  * Passivate a metaslab group and remove it from the allocation rotor.
  * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
  * a metaslab group. This function will momentarily drop spa_config_locks
  * that are lower than the SCL_ALLOC lock (see comment below).
  */
 void
 metaslab_group_passivate(metaslab_group_t *mg)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	spa_t *spa = mc->mc_spa;
 	metaslab_group_t *mgprev, *mgnext;
 	/* Record which config locks we hold as writer right now. */
 	int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
 
 	ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
 	    (SCL_ALLOC | SCL_ZIO));
 
 	/*
 	 * Only the transition to exactly zero deactivates the group. Any
 	 * other result must be negative, i.e. the group was not on the
 	 * rotor to begin with, so there is nothing to unlink.
 	 */
 	if (--mg->mg_activation_count != 0) {
 		for (int i = 0; i < spa->spa_alloc_count; i++)
 			ASSERT(mc->mc_allocator[i].mca_rotor != mg);
 		ASSERT(mg->mg_prev == NULL);
 		ASSERT(mg->mg_next == NULL);
 		ASSERT(mg->mg_activation_count < 0);
 		return;
 	}
 
 	/*
 	 * The spa_config_lock is an array of rwlocks, ordered as
 	 * follows (from highest to lowest):
 	 *	SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
 	 *	SCL_ZIO > SCL_FREE > SCL_VDEV
 	 * (For more information about the spa_config_lock see spa_misc.c)
 	 * The higher the lock, the broader its coverage. When we passivate
 	 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
 	 * config locks. However, the metaslab group's taskq might be trying
 	 * to preload metaslabs so we must drop the SCL_ZIO lock and any
 	 * lower locks to allow the I/O to complete. At a minimum,
 	 * we continue to hold the SCL_ALLOC lock, which prevents any future
 	 * allocations from taking place and any changes to the vdev tree.
 	 */
 	spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
 	taskq_wait_outstanding(spa->spa_metaslab_taskq, 0);
 	/* Retake exactly the set of locks that was dropped above. */
 	spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
 	metaslab_group_alloc_update(mg);
 	/*
 	 * Passivate whichever primary and secondary metaslabs each
 	 * allocator still has active in this group.
 	 */
 	for (int i = 0; i < mg->mg_allocators; i++) {
 		metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
 		metaslab_t *msp = mga->mga_primary;
 		if (msp != NULL) {
 			mutex_enter(&msp->ms_lock);
 			metaslab_passivate(msp,
 			    metaslab_weight_from_range_tree(msp));
 			mutex_exit(&msp->ms_lock);
 		}
 		msp = mga->mga_secondary;
 		if (msp != NULL) {
 			mutex_enter(&msp->ms_lock);
 			metaslab_passivate(msp,
 			    metaslab_weight_from_range_tree(msp));
 			mutex_exit(&msp->ms_lock);
 		}
 	}
 
 	mgprev = mg->mg_prev;
 	mgnext = mg->mg_next;
 
 	/*
 	 * Unlink the group from the circular rotor list. If it was the
 	 * only member the ring becomes empty, so rotors are reset to NULL.
 	 */
 	if (mg == mgnext) {
 		mgnext = NULL;
 	} else {
 		mgprev->mg_next = mgnext;
 		mgnext->mg_prev = mgprev;
 	}
 	/* Move any allocator rotor that pointed at this group. */
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
 		if (mc->mc_allocator[i].mca_rotor == mg)
 			mc->mc_allocator[i].mca_rotor = mgnext;
 	}
 
 	mg->mg_prev = NULL;
 	mg->mg_next = NULL;
 }
 
 /*
  * A metaslab group counts as initialized once its vdev reports a
  * non-zero amount of space and the group has a positive activation
  * count.
  */
 boolean_t
 metaslab_group_initialized(metaslab_group_t *mg)
 {
 	vdev_stat_t *stats = &mg->mg_vd->vdev_stat;
 
 	return (mg->mg_activation_count > 0 && stats->vs_space != 0);
 }
 
 /*
  * Return the space covered by this group: the number of metaslabs in
  * mg_metaslab_tree multiplied by the vdev's metaslab size.
  */
 uint64_t
 metaslab_group_get_space(metaslab_group_t *mg)
 {
 	uint64_t count;
 
 	/*
 	 * The tree is counted instead of using vdev_ms_count because the
 	 * embedded log metaslab may make the tree one node smaller.
 	 */
 	mutex_enter(&mg->mg_lock);
 	count = avl_numnodes(&mg->mg_metaslab_tree);
 	mutex_exit(&mg->mg_lock);
 
 	uint64_t ms_size = 1ULL << mg->mg_vd->vdev_ms_shift;
 
 	return (ms_size * count);
 }
 
 /*
  * Debug check: rebuild the group's histogram from the space map
  * histograms of its metaslabs and VERIFY that it matches mg_histogram
  * bucket for bucket. Does nothing unless ZFS_DEBUG_HISTOGRAM_VERIFY is
  * set in zfs_flags.
  */
 void
 metaslab_group_histogram_verify(metaslab_group_t *mg)
 {
 	uint64_t *mg_hist;
 	avl_tree_t *t = &mg->mg_metaslab_tree;
 	uint64_t ashift = mg->mg_vd->vdev_ashift;
 
 	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 		return;
 
-	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
+	mg_hist = kmem_zalloc(sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE,
 	    KM_SLEEP);
 
 	/* Buckets are written at index i + ashift below; they must fit. */
-	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
+	ASSERT3U(ZFS_RANGE_TREE_HISTOGRAM_SIZE, >=,
 	    SPACE_MAP_HISTOGRAM_SIZE + ashift);
 
 	mutex_enter(&mg->mg_lock);
 	for (metaslab_t *msp = avl_first(t);
 	    msp != NULL; msp = AVL_NEXT(t, msp)) {
 		VERIFY3P(msp->ms_group, ==, mg);
 		/* skip if not active */
 		if (msp->ms_sm == NULL)
 			continue;
 
 		/* Space map bucket i lands in group bucket i + ashift. */
 		for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 			mg_hist[i + ashift] +=
 			    msp->ms_sm->sm_phys->smp_histogram[i];
 		}
 	}
 
-	for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
+	for (int i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i ++)
 		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
 
 	mutex_exit(&mg->mg_lock);
 
-	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
+	kmem_free(mg_hist, sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE);
 }
 
 /*
  * Add the metaslab's space map histogram to the histograms of its group
  * and of the group's class. The caller must hold ms_lock. A metaslab
  * without a space map contributes nothing.
  */
 static void
 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	uint64_t ashift = mg->mg_vd->vdev_ashift;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	if (msp->ms_sm == NULL)
 		return;
 
 	mutex_enter(&mg->mg_lock);
 	mutex_enter(&mc->mc_lock);
 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 		/* Space map bucket i maps to bucket i + ashift. */
 		uint64_t bucket = i + ashift;
 		uint64_t count = msp->ms_sm->sm_phys->smp_histogram[i];
 
 		IMPLY(mg == mg->mg_vd->vdev_log_mg,
 		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
 		mg->mg_histogram[bucket] += count;
 		mc->mc_histogram[bucket] += count;
 	}
 	mutex_exit(&mc->mc_lock);
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Subtract the metaslab's space map histogram from the histograms of
  * its group and of the group's class. The caller must hold ms_lock. A
  * metaslab without a space map has nothing to subtract.
  */
 void
 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	uint64_t ashift = mg->mg_vd->vdev_ashift;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	if (msp->ms_sm == NULL)
 		return;
 
 	mutex_enter(&mg->mg_lock);
 	mutex_enter(&mc->mc_lock);
 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 		/* Space map bucket i maps to bucket i + ashift. */
 		uint64_t bucket = i + ashift;
 		uint64_t count = msp->ms_sm->sm_phys->smp_histogram[i];
 
 		/* Neither aggregate may underflow. */
 		ASSERT3U(mg->mg_histogram[bucket], >=, count);
 		ASSERT3U(mc->mc_histogram[bucket], >=, count);
 		IMPLY(mg == mg->mg_vd->vdev_log_mg,
 		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
 
 		mg->mg_histogram[bucket] -= count;
 		mc->mc_histogram[bucket] -= count;
 	}
 	mutex_exit(&mc->mc_lock);
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Attach a metaslab that belongs to no group yet to mg: insert it into
  * the group's tree with a weight of zero, then fold its histogram into
  * the group and class histograms.
  */
 static void
 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
 {
 	ASSERT(msp->ms_group == NULL);
 
 	/* The tree insertion happens under the group lock. */
 	mutex_enter(&mg->mg_lock);
 	msp->ms_weight = 0;
 	msp->ms_group = mg;
 	avl_add(&mg->mg_metaslab_tree, msp);
 	mutex_exit(&mg->mg_lock);
 
 	/* The histogram update expects ms_lock to be held. */
 	mutex_enter(&msp->ms_lock);
 	metaslab_group_histogram_add(mg, msp);
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * Detach a metaslab from its group: back its histogram out of the group
  * and class, remove it from the group's tree and from the class's txg
  * list (if it is on it), and clear ms_group.
  */
 static void
 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
 {
 	/* The histogram update expects ms_lock to be held. */
 	mutex_enter(&msp->ms_lock);
 	metaslab_group_histogram_remove(mg, msp);
 	mutex_exit(&msp->ms_lock);
 
 	mutex_enter(&mg->mg_lock);
 	ASSERT(msp->ms_group == mg);
 	avl_remove(&mg->mg_metaslab_tree, msp);
 
 	/*
 	 * Lock the sublist this metaslab hashes to, and unlink the
 	 * metaslab only if it is actually linked.
 	 */
 	multilist_sublist_t *sublist = multilist_sublist_lock_obj(
 	    &msp->ms_group->mg_class->mc_metaslab_txg_list, msp);
 	if (multilist_link_active(&msp->ms_class_txg_node))
 		multilist_sublist_remove(sublist, msp);
 	multilist_sublist_unlock(sublist);
 
 	msp->ms_group = NULL;
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Give msp a new weight and reposition it in the group's tree. Both
  * ms_lock and mg_lock must be held by the caller.
  */
 static void
 metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(MUTEX_HELD(&mg->mg_lock));
 	ASSERT(msp->ms_group == mg);
 
 	/*
 	 * The weight is part of the tree's sort key, so the node has to be
 	 * taken out before the weight changes and re-inserted afterwards.
 	 */
 	avl_remove(&mg->mg_metaslab_tree, msp);
 	msp->ms_weight = weight;
 	avl_add(&mg->mg_metaslab_tree, msp);
 }
 
 /*
  * Locking wrapper around metaslab_group_sort_impl(): takes mg_lock for
  * the duration of the re-sort. The caller must already hold ms_lock.
  */
 static void
 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 {
 	/*
 	 * Any value is a legal weight in principle, but in practice
 	 * values in the range [1, 511] are never used.
 	 */
 	ASSERT(weight == 0 || weight >= SPA_MINBLOCKSIZE);
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	mutex_enter(&mg->mg_lock);
 	metaslab_group_sort_impl(mg, msp, weight);
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Calculate the fragmentation for a given metaslab group. We can use
  * a simple average here since all metaslabs within the group must have
  * the same size. The return value will be a value between 0 and 100
  * (inclusive), or ZFS_FRAG_INVALID if no more than half of the vdev's
  * metaslabs belong to this group and have a fragmentation metric.
  */
 uint64_t
 metaslab_group_fragmentation(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
 	uint64_t frag_sum = 0;
 	uint64_t counted = 0;
 
 	for (int m = 0; m < vd->vdev_ms_count; m++) {
 		metaslab_t *msp = vd->vdev_ms[m];
 
 		/*
 		 * Count only metaslabs that have a valid metric and that
 		 * belong to this particular group.
 		 */
 		if (msp->ms_fragmentation != ZFS_FRAG_INVALID &&
 		    msp->ms_group == mg) {
 			counted++;
 			frag_sum += msp->ms_fragmentation;
 		}
 	}
 
 	/* Too few samples to report a meaningful average. */
 	if (counted <= vd->vdev_ms_count / 2)
 		return (ZFS_FRAG_INVALID);
 
 	uint64_t average = frag_sum / counted;
 
 	ASSERT3U(average, <=, 100);
 	return (average);
 }
 
 /*
  * Determine if a given metaslab group should skip allocations. A metaslab
  * group should avoid allocations if its free capacity is less than the
  * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
  * zfs_mg_fragmentation_threshold and there is at least one metaslab group
  * that can still handle allocations. If the allocation throttle is enabled
  * then we skip allocations to devices that have reached their maximum
  * allocation queue depth unless the selected metaslab group is the only
  * eligible group remaining.
  */
 static boolean_t
 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
     int flags, uint64_t psize, int allocator, int d)
 {
 	spa_t *spa = mg->mg_vd->vdev_spa;
 	metaslab_class_t *mc = mg->mg_class;
 
 	/*
 	 * We can only consider skipping this metaslab group if it's
 	 * in the normal, special or dedup metaslab class and there are
 	 * other metaslab groups to select from. Otherwise, we always
 	 * consider it eligible for allocations.
 	 */
 	if ((mc != spa_normal_class(spa) &&
 	    mc != spa_special_class(spa) &&
 	    mc != spa_dedup_class(spa)) ||
 	    mc->mc_groups <= 1)
 		return (B_TRUE);
 
 	/*
 	 * If the metaslab group's mg_allocatable flag is set (see comments
 	 * in metaslab_group_alloc_update() for more information) and
 	 * the allocation throttle is disabled then allow allocations to this
 	 * device. However, if the allocation throttle is enabled then
 	 * check if we have reached our allocation limit (mga_alloc_queue_depth)
 	 * to determine if we should allow allocations to this metaslab group.
 	 * If all metaslab groups are no longer considered allocatable
 	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
 	 * gang block size then we allow allocations on this metaslab group
 	 * regardless of the mg_allocatable or throttle settings.
 	 */
 	if (mg->mg_allocatable) {
 		metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 		int64_t qdepth;
 		uint64_t qmax = mga->mga_cur_max_alloc_queue_depth;
 
 		if (!mc->mc_alloc_throttle_enabled)
 			return (B_TRUE);
 
 		/*
 		 * If this metaslab group does not have any free space, then
 		 * there is no point in looking further.
 		 */
 		if (mg->mg_no_free_space)
 			return (B_FALSE);
 
 		/*
 		 * Some allocations (e.g., those coming from device removal
 		 * where the allocations are not even counted in the
 		 * metaslab allocation queues) are allowed to bypass
 		 * the throttle.
 		 */
 		if (flags & METASLAB_DONT_THROTTLE)
 			return (B_TRUE);
 
 		/*
 		 * Relax allocation throttling for ditto blocks.  Due to
 		 * random imbalances in allocation it tends to push copies
 		 * to one vdev, that looks a bit better at the moment.
 		 */
 		/* Each increment of d raises the limit by a quarter of qmax. */
 		qmax = qmax * (4 + d) / 4;
 
 		qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth);
 
 		/*
 		 * If this metaslab group is below its qmax or it's
 		 * the only allocatable metaslab group, then attempt
 		 * to allocate from it.
 		 */
 		if (qdepth < qmax || mc->mc_alloc_groups == 1)
 			return (B_TRUE);
 		ASSERT3U(mc->mc_alloc_groups, >, 1);
 
 		/*
 		 * Since this metaslab group is at or over its qmax, we
 		 * need to determine if there are metaslab groups after this
 		 * one that might be able to handle this allocation. This is
 		 * racy since we can't hold the locks for all metaslab
 		 * groups at the same time when we make this check.
 		 */
 		/* Walk the ring from the group after mg up to the rotor. */
 		for (metaslab_group_t *mgp = mg->mg_next;
 		    mgp != rotor; mgp = mgp->mg_next) {
 			metaslab_group_allocator_t *mgap =
 			    &mgp->mg_allocator[allocator];
 			qmax = mgap->mga_cur_max_alloc_queue_depth;
 			qmax = qmax * (4 + d) / 4;
 			qdepth =
 			    zfs_refcount_count(&mgap->mga_alloc_queue_depth);
 
 			/*
 			 * If there is another metaslab group that
 			 * might be able to handle the allocation, then
 			 * we return false so that we skip this group.
 			 */
 			if (qdepth < qmax && !mgp->mg_no_free_space)
 				return (B_FALSE);
 		}
 
 		/*
 		 * We didn't find another group to handle the allocation
 		 * so we can't skip this metaslab group even though
 		 * we are at or over our qmax.
 		 */
 		return (B_TRUE);
 
 	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
 		/*
 		 * Not flagged allocatable, but either no group in the class
 		 * is, or this is a minimum-size allocation: allow it anyway.
 		 */
 		return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 /*
  * ==========================================================================
  * Range tree callbacks
  * ==========================================================================
  */
 
 /*
  * Comparison function for the private size-ordered tree using 32-bit
  * ranges. Tree is sorted by size, larger sizes at the end of the tree.
  */
 __attribute__((always_inline)) inline
 static int
 metaslab_rangesize32_compare(const void *x1, const void *x2)
 {
-	const range_seg32_t *r1 = x1;
-	const range_seg32_t *r2 = x2;
+	const zfs_range_seg32_t *r1 = x1;
+	const zfs_range_seg32_t *r2 = x2;
 
 	/* A segment's size is the distance from its start to its end. */
 	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
 	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
 
 	int cmp = TREE_CMP(rs_size1, rs_size2);
 
 	/*
 	 * Branch-free tie-break: when the sizes differ, !cmp is 0 and the
 	 * size comparison is returned unchanged; when they are equal, the
 	 * result is the comparison of the start offsets.
 	 */
 	return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
 }
 
 /*
  * Comparison function for the private size-ordered tree using 64-bit
  * ranges. Tree is sorted by size, larger sizes at the end of the tree.
  */
 __attribute__((always_inline)) inline
 static int
 metaslab_rangesize64_compare(const void *x1, const void *x2)
 {
-	const range_seg64_t *r1 = x1;
-	const range_seg64_t *r2 = x2;
+	const zfs_range_seg64_t *r1 = x1;
+	const zfs_range_seg64_t *r2 = x2;
 
 	/* A segment's size is the distance from its start to its end. */
 	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
 	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
 
 	int cmp = TREE_CMP(rs_size1, rs_size2);
 
 	/*
 	 * Branch-free tie-break: when the sizes differ, !cmp is 0 and the
 	 * size comparison is returned unchanged; when they are equal, the
 	 * result is the comparison of the start offsets.
 	 */
 	return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
 }
 
 /*
  * Argument handed to the metaslab range tree callbacks below.
  */
 typedef struct metaslab_rt_arg {
 	/* Size-sorted B-tree that the callbacks keep up to date. */
 	zfs_btree_t *mra_bt;
 	/* Segments smaller than 1 << mra_floor_shift are left out of mra_bt. */
 	uint32_t mra_floor_shift;
 } metaslab_rt_arg_t;
 
 /*
  * Walk context for metaslab_size_sorted_add(): the range tree being
  * walked and the callback argument describing its size-sorted tree.
  */
 struct mssa_arg {
 	zfs_range_tree_t *rt;
 	metaslab_rt_arg_t *mra;
 };
 
 /*
  * Range tree walk callback: build a temporary segment covering
  * [start, start + size) and pass it to metaslab_rt_add() so that it is
  * considered for the size-sorted tree. arg is a struct mssa_arg.
  */
 static void
 metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size)
 {
 	struct mssa_arg *mssap = arg;
 	zfs_range_tree_t *rt = mssap->rt;
 	metaslab_rt_arg_t *mrap = mssap->mra;
-	range_seg_max_t seg = {0};
+	zfs_range_seg_max_t seg = {0};
 	/* The setters take rt so they can encode for its segment type. */
 	zfs_rs_set_start(&seg, rt, start);
 	zfs_rs_set_end(&seg, rt, start + size);
 	metaslab_rt_add(rt, &seg, mrap);
 }
 
 /*
  * Populate the (currently empty) size-sorted tree attached to rt with
  * every segment of rt. The floor shift is reset to 0 first so that no
  * segment is filtered out for being too small.
  */
 static void
 metaslab_size_tree_full_load(zfs_range_tree_t *rt)
 {
 	metaslab_rt_arg_t *mrap = rt->rt_arg;
 
 	METASLABSTAT_BUMP(metaslabstat_reload_tree);
 	ASSERT0(zfs_btree_numnodes(mrap->mra_bt));
 	mrap->mra_floor_shift = 0;
 
 	struct mssa_arg walk_arg = { .rt = rt, .mra = mrap };
 
 	zfs_range_tree_walk(rt, metaslab_size_sorted_add, &walk_arg);
 }
 
 
 /*
  * Instantiate the B-tree find-in-buffer helpers for the 32-bit and
  * 64-bit segment types, each paired with its size comparator. They are
  * handed to zfs_btree_create() in metaslab_rt_create().
  */
 ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf,
-    range_seg32_t, metaslab_rangesize32_compare)
+    zfs_range_seg32_t, metaslab_rangesize32_compare)
 
 ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf,
-    range_seg64_t, metaslab_rangesize64_compare)
+    zfs_range_seg64_t, metaslab_rangesize64_compare)
 
 /*
  * Create any block allocator specific components. The current allocators
  * rely on using both a size-ordered zfs_range_tree_t and an array of
  * uint64_t's.
  */
 static void
 metaslab_rt_create(zfs_range_tree_t *rt, void *arg)
 {
 	metaslab_rt_arg_t *mrap = arg;
 	zfs_btree_t *size_tree = mrap->mra_bt;
 
 	size_t size;
 	int (*compare) (const void *, const void *);
 	bt_find_in_buf_f bt_find;
 	/*
 	 * Choose the element size, comparator and find helper that match
 	 * the segment representation used by this range tree.
 	 */
 	switch (rt->rt_type) {
 	case ZFS_RANGE_SEG32:
-		size = sizeof (range_seg32_t);
+		size = sizeof (zfs_range_seg32_t);
 		compare = metaslab_rangesize32_compare;
 		bt_find = metaslab_rt_find_rangesize32_in_buf;
 		break;
 	case ZFS_RANGE_SEG64:
-		size = sizeof (range_seg64_t);
+		size = sizeof (zfs_range_seg64_t);
 		compare = metaslab_rangesize64_compare;
 		bt_find = metaslab_rt_find_rangesize64_in_buf;
 		break;
 	default:
 		/* Any other segment type is treated as a fatal error. */
 		panic("Invalid range seg type %d", rt->rt_type);
 	}
 	zfs_btree_create(size_tree, compare, bt_find, size);
 	/* Start with the configured minimum size for tracked segments. */
 	mrap->mra_floor_shift = metaslab_by_size_min_shift;
 }
 
 /*
  * Range tree destroy callback: destroy the size-sorted tree and free
  * the callback argument itself. The range tree is not used here.
  */
 static void
 metaslab_rt_destroy(zfs_range_tree_t *rt, void *arg)
 {
 	(void) rt;
 	metaslab_rt_arg_t *mrap = arg;
 
 	zfs_btree_destroy(mrap->mra_bt);
 	kmem_free(mrap, sizeof (metaslab_rt_arg_t));
 }
 
 /*
  * Range tree add callback: mirror the segment into the size-sorted
  * tree, unless it is smaller than the current floor
  * (1 << mra_floor_shift).
  */
 static void
 metaslab_rt_add(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg)
 {
 	metaslab_rt_arg_t *mrap = arg;
 	uint64_t seg_size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
 
 	if (seg_size >= (1ULL << mrap->mra_floor_shift))
 		zfs_btree_add(mrap->mra_bt, rs);
 }
 
 /*
  * Range tree remove callback: drop the segment from the size-sorted
  * tree. Segments below the current floor (1 << mra_floor_shift) are
  * skipped, matching the filter applied by metaslab_rt_add().
  */
 static void
 metaslab_rt_remove(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg)
 {
 	metaslab_rt_arg_t *mrap = arg;
 	uint64_t seg_size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
 
 	if (seg_size >= (1ULL << mrap->mra_floor_shift))
 		zfs_btree_remove(mrap->mra_bt, rs);
 }
 
 /*
  * Range tree vacate callback: empty and destroy the size-sorted tree,
  * then set it up again from scratch via metaslab_rt_create().
  */
 static void
 metaslab_rt_vacate(zfs_range_tree_t *rt, void *arg)
 {
 	metaslab_rt_arg_t *mrap = arg;
 
 	zfs_btree_clear(mrap->mra_bt);
 	zfs_btree_destroy(mrap->mra_bt);
 
 	metaslab_rt_create(rt, arg);
 }
 
 /*
  * Range tree callback table wiring the metaslab_rt_* functions above to
  * the create, destroy, add, remove and vacate hooks.
  */
 static const zfs_range_tree_ops_t metaslab_rt_ops = {
 	.rtop_create = metaslab_rt_create,
 	.rtop_destroy = metaslab_rt_destroy,
 	.rtop_add = metaslab_rt_add,
 	.rtop_remove = metaslab_rt_remove,
 	.rtop_vacate = metaslab_rt_vacate
 };
 
 /*
  * ==========================================================================
  * Common allocator routines
  * ==========================================================================
  */
 
 /*
  * Return the maximum contiguous segment within the metaslab.
  */
 uint64_t
 metaslab_largest_allocatable(metaslab_t *msp)
 {
 	/*
 	 * ms_allocatable_by_size is embedded in the metaslab, so its
 	 * address is never NULL for a valid msp and needs no NULL check.
 	 */
 	zfs_btree_t *t = &msp->ms_allocatable_by_size;
 	zfs_range_seg_t *rs;
 
 	/*
 	 * An empty size-sorted tree is reloaded from ms_allocatable before
 	 * concluding that there is no allocatable segment.
 	 */
 	if (zfs_btree_numnodes(t) == 0)
 		metaslab_size_tree_full_load(msp->ms_allocatable);
 
 	/* The tree is sorted by size, so the last entry is the largest. */
 	rs = zfs_btree_last(t, NULL);
 	if (rs == NULL)
 		return (0);
 
 	return (zfs_rs_get_end(rs, msp->ms_allocatable) - zfs_rs_get_start(rs,
 	    msp->ms_allocatable));
 }
 
 /*
  * Return the maximum contiguous segment within the unflushed frees of this
  * metaslab.
  */
 static uint64_t
 metaslab_largest_unflushed_free(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if (msp->ms_unflushed_frees == NULL)
 		return (0);
 
 	/* Reload the size-sorted view if it is currently empty. */
 	if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0)
 		metaslab_size_tree_full_load(msp->ms_unflushed_frees);
 	/* The tree is sorted by size, so the last entry is the largest. */
 	zfs_range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size,
 	    NULL);
 	if (rs == NULL)
 		return (0);
 
 	/*
 	 * When a range is freed from the metaslab, that range is added to
 	 * both the unflushed frees and the deferred frees. While the block
 	 * will eventually be usable, if the metaslab were loaded the range
 	 * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE
 	 * txgs had passed.  As a result, when attempting to estimate an upper
 	 * bound for the largest currently-usable free segment in the
 	 * metaslab, we need to not consider any ranges currently in the defer
 	 * trees. This algorithm approximates the largest available chunk in
 	 * the largest range in the unflushed_frees tree by taking the first
 	 * chunk.  While this may be a poor estimate, it should only remain so
 	 * briefly and should eventually self-correct as frees are no longer
 	 * deferred. Similar logic applies to the ms_freed tree. See
 	 * metaslab_load() for more details.
 	 *
 	 * There are two primary sources of inaccuracy in this estimate. Both
 	 * are tolerated for performance reasons. The first source is that we
 	 * only check the largest segment for overlaps. Smaller segments may
 	 * have more favorable overlaps with the other trees, resulting in
 	 * larger usable chunks.  Second, we only look at the first chunk in
 	 * the largest segment; there may be other usable chunks in the
 	 * largest segment, but we ignore them.
 	 */
 	uint64_t rstart = zfs_rs_get_start(rs, msp->ms_unflushed_frees);
 	uint64_t rsize = zfs_rs_get_end(rs, msp->ms_unflushed_frees) - rstart;
 	/*
 	 * Each lookup uses the already-shrunken rsize, so the candidate
 	 * chunk can only get smaller from one defer tree to the next.
 	 */
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		uint64_t start = 0;
 		uint64_t size = 0;
 		boolean_t found = zfs_range_tree_find_in(msp->ms_defer[t],
 		    rstart, rsize, &start, &size);
 		if (found) {
 			/* Overlap at the very start: nothing usable is left. */
 			if (rstart == start)
 				return (0);
 			/* Keep only the part in front of the overlap. */
 			rsize = start - rstart;
 		}
 	}
 
 	/* Trim against ms_freed in the same way. */
 	uint64_t start = 0;
 	uint64_t size = 0;
 	boolean_t found = zfs_range_tree_find_in(msp->ms_freed, rstart,
 	    rsize, &start, &size);
 	if (found)
 		rsize = start - rstart;
 
 	return (rsize);
 }
 
 /*
  * Look up the segment [start, start + size) in B-tree t. If there is no
  * matching entry, return the entry that follows the search position
  * instead (NULL if there is none). *where is updated with the position
  * of the returned entry.
  */
 static zfs_range_seg_t *
 metaslab_block_find(zfs_btree_t *t, zfs_range_tree_t *rt, uint64_t start,
     uint64_t size, zfs_btree_index_t *where)
 {
 	zfs_range_seg_t *rs;
-	range_seg_max_t rsearch;
+	zfs_range_seg_max_t rsearch;
 
 	/* Build the search key in the segment format used by rt. */
 	zfs_rs_set_start(&rsearch, rt, start);
 	zfs_rs_set_end(&rsearch, rt, start + size);
 
 	rs = zfs_btree_find(t, &rsearch, where);
 	if (rs == NULL) {
 		/* No match; advance from the position the lookup left. */
 		rs = zfs_btree_next(t, where, where);
 	}
 
 	return (rs);
 }
 
 /*
  * This is a helper function that can be used by the allocator to find a
  * suitable block to allocate. This will search the specified B-tree looking
  * for a block that matches the specified criteria.
  */
 static uint64_t
 metaslab_block_picker(zfs_range_tree_t *rt, uint64_t *cursor, uint64_t size,
     uint64_t max_search)
 {
 	/* A cursor of 0 means "start from the beginning of the tree". */
 	if (*cursor == 0)
 		*cursor = rt->rt_start;
 
 	zfs_btree_t *bt = &rt->rt_root;
 	zfs_btree_index_t where;
 	zfs_range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size,
 	    &where);
 
 	/* Offset of the first candidate; the search span is measured from it. */
 	uint64_t origin = (rs != NULL) ? zfs_rs_get_start(rs, rt) : 0;
 
 	for (int examined = 0; rs != NULL; examined++) {
 		uint64_t seg_start = zfs_rs_get_start(rs, rt);
 
 		/*
 		 * Give up once we are farther than max_search from the
 		 * first candidate, but not before the minimum number of
 		 * segments has been examined.
 		 */
 		if (seg_start - origin > max_search &&
 		    examined >= metaslab_min_search_count)
 			break;
 
 		/* First segment big enough wins; advance the cursor past it. */
 		if (seg_start + size <= zfs_rs_get_end(rs, rt)) {
 			*cursor = seg_start + size;
 			return (seg_start);
 		}
 		rs = zfs_btree_next(bt, &where, &where);
 	}
 
 	/* Nothing suitable: reset the cursor and report failure. */
 	*cursor = 0;
 	return (-1ULL);
 }
 
 /* Forward declarations for the allocator entry points referenced below. */
 static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size);
 static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size);
 static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size);
 metaslab_ops_t *metaslab_allocator(spa_t *spa);
 
 /*
  * Table of available block allocators, pairing each name with its
  * allocation routine. spa_set_allocator() stores an index into this table
  * in spa_active_allocator, and falls back to entry 0 for unaccepted names.
  */
 static metaslab_ops_t metaslab_allocators[] = {
 	{ "dynamic", metaslab_df_alloc },
 	{ "cursor", metaslab_cf_alloc },
 	{ "new-dynamic", metaslab_ndf_alloc },
 };
 
 /*
  * Translate an allocator name into its index in metaslab_allocators[].
  * Returns -1 for a name that is not in the table, and also for
  * "new-dynamic", which is explicitly rejected here.
  */
 static int
 spa_find_allocator_byname(const char *val)
 {
 	/* remove when ndf is working */
 	if (strcmp("new-dynamic", val) == 0)
 		return (-1);
 
 	/* Scan from the last entry down; idx ends at -1 when nothing matches. */
 	int idx = ARRAY_SIZE(metaslab_allocators);
 	while (--idx >= 0) {
 		if (strcmp(val, metaslab_allocators[idx].msop_name) == 0)
 			break;
 	}
 
 	return (idx);
 }
 
 /*
  * Select the allocator used by "spa" by name. A name that
  * spa_find_allocator_byname() does not accept falls back to entry 0 of
  * metaslab_allocators[]. The resulting choice is logged with zfs_dbgmsg().
  */
 void
 spa_set_allocator(spa_t *spa, const char *allocator)
 {
 	int found = spa_find_allocator_byname(allocator);
 	int chosen = (found < 0) ? 0 : found;
 
 	spa->spa_active_allocator = chosen;
 	zfs_dbgmsg("spa allocator: %s", metaslab_allocators[chosen].msop_name);
 }
 
 /* Return the metaslab_allocators[] index recorded on "spa". */
 int
 spa_get_allocator(spa_t *spa)
 {
 	int active = spa->spa_active_allocator;
 
 	return (active);
 }
 
 #if defined(_KERNEL)
 /*
  * Validate a new value for the active-allocator setting and, when it is
  * accepted, point zfs_active_allocator at the matching table entry's name.
  * The first newline in "val", if any, is overwritten with a NUL in place.
  * Returns 0 on success, or SET_ERROR(EINVAL) for a NULL or unaccepted name.
  */
 int
 param_set_active_allocator_common(const char *val)
 {
 	if (val == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/* Cut the string at the first newline before looking it up. */
 	char *newline = strchr(val, '\n');
 	if (newline != NULL)
 		*newline = '\0';
 
 	int idx = spa_find_allocator_byname(val);
 	if (idx < 0)
 		return (SET_ERROR(EINVAL));
 
 	zfs_active_allocator = metaslab_allocators[idx].msop_name;
 	return (0);
 }
 #endif
 
 /* Return the metaslab_allocators[] entry currently selected on "spa". */
 metaslab_ops_t *
 metaslab_allocator(spa_t *spa)
 {
 	return (&metaslab_allocators[spa_get_allocator(spa)]);
 }
 
 /*
  * ==========================================================================
  * Dynamic Fit (df) block allocator
  *
  * Search for a free chunk of at least this size, starting from the last
  * offset (for this alignment of block) looking for up to
  * metaslab_df_max_search bytes (16MB).  If a large enough free chunk is not
  * found within 16MB, then return a free chunk of exactly the requested size (or
  * larger).
  *
  * If it seems like searching from the last offset will be unproductive, skip
  * that and just return a free chunk of exactly the requested size (or larger).
  * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct.  This
  * mechanism is probably not very useful and may be removed in the future.
  *
  * The behavior when not searching can be changed to return the largest free
  * chunk, instead of a free chunk of exactly the requested size, by setting
  * metaslab_df_use_largest_segment.
  * ==========================================================================
  */
 static uint64_t
 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 {
 	/*
 	 * The lowest set bit of the request (size & -size) is the largest
 	 * power of 2 that evenly divides it. Requests with the same
 	 * alignment share a cursor in ms_lbas[], which tends to place them
 	 * in the same area of the metaslab; it is not a guarantee that
 	 * other allocation sizes are absent from that region.
 	 */
 	uint64_t align = size & -size;
 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
 	zfs_range_tree_t *rt = msp->ms_allocatable;
 	uint_t free_pct = zfs_range_tree_space(rt) * 100 / msp->ms_size;
 	uint64_t offset = -1ULL;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * Only iterate by offset while space is plentiful; when running
 	 * low, go straight to the size-based lookup below.
 	 */
 	if (metaslab_largest_allocatable(msp) >= metaslab_df_alloc_threshold &&
 	    free_pct >= metaslab_df_free_pct) {
 		offset = metaslab_block_picker(rt, cursor, size,
 		    metaslab_df_max_search);
 	}
 
 	if (offset != -1ULL)
 		return (offset);
 
 	zfs_btree_t *by_size = &msp->ms_allocatable_by_size;
 	if (zfs_btree_numnodes(by_size) == 0)
 		metaslab_size_tree_full_load(rt);
 
 	zfs_range_seg_t *seg;
 	if (metaslab_df_use_largest_segment) {
 		/* use largest free segment */
 		seg = zfs_btree_last(by_size, NULL);
 	} else {
 		/* use segment of this size, or next largest */
 		zfs_btree_index_t where;
 		seg = metaslab_block_find(by_size, rt, msp->ms_start, size,
 		    &where);
 	}
 
 	if (seg == NULL)
 		return (offset);
 
 	uint64_t seg_start = zfs_rs_get_start(seg, rt);
 	if (seg_start + size <= zfs_rs_get_end(seg, rt)) {
 		offset = seg_start;
 		*cursor = offset + size;
 	}
 
 	return (offset);
 }
 
 /*
  * ==========================================================================
  * Cursor fit block allocator -
  * Select the largest region in the metaslab, set the cursor to the beginning
  * of the range and the cursor_end to the end of the range. As allocations
  * are made advance the cursor. Continue allocating from the cursor until
  * the range is exhausted and then find a new range.
  * ==========================================================================
  */
 static uint64_t
 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
 {
 	zfs_range_tree_t *rt = msp->ms_allocatable;
 	zfs_btree_t *by_size = &msp->ms_allocatable_by_size;
 	/* ms_lbas[0] and ms_lbas[1] hold the current window [cur, cur_end). */
 	uint64_t *cur = &msp->ms_lbas[0];
 	uint64_t *cur_end = &msp->ms_lbas[1];
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	ASSERT3U(*cur_end, >=, *cur);
 
 	/* The current window still has room: carve the request from it. */
 	if ((*cur + size) <= *cur_end) {
 		uint64_t offset = *cur;
 
 		*cur += size;
 		return (offset);
 	}
 
 	/* Window exhausted: restart from the last entry of the size tree. */
 	if (zfs_btree_numnodes(by_size) == 0)
 		metaslab_size_tree_full_load(rt);
 
 	zfs_range_seg_t *largest = zfs_btree_last(by_size, NULL);
 	if (largest == NULL)
 		return (-1ULL);
 
 	uint64_t seg_start = zfs_rs_get_start(largest, rt);
 	uint64_t seg_end = zfs_rs_get_end(largest, rt);
 	if (seg_end - seg_start < size)
 		return (-1ULL);
 
 	*cur = seg_start + size;
 	*cur_end = seg_end;
 
 	return (seg_start);
 }
 
 /*
  * ==========================================================================
  * New dynamic fit allocator -
  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
  * contiguous blocks. If no region is found then just use the largest segment
  * that remains.
  * ==========================================================================
  */
 
 /*
  * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
  * to request from the allocator. It is added to highbit64(size) to form the
  * clump size searched for in metaslab_ndf_alloc().
  */
 uint64_t metaslab_ndf_clump_shift = 4;
 
 /*
  * Allocate "size" bytes from "msp" using the new dynamic fit policy.
  *
  * The segment at the cursor for this size class (ms_lbas[highbit64(size) - 1])
  * is tried first. If there is none, or it is smaller than "size", the
  * ms_allocatable_by_size tree is searched for a segment of
  * MIN(max_size, 1 << (hbit + metaslab_ndf_clump_shift)) bytes, falling back
  * to the next entry in that tree.
  *
  * Returns the start of the chosen segment and advances the cursor past the
  * allocation, or returns -1ULL when nothing large enough is available.
  *
  * NOTE(review): unlike metaslab_df_alloc() and metaslab_cf_alloc(), this
  * function does not itself call metaslab_size_tree_full_load() before
  * searching ms_allocatable_by_size; confirm whether
  * metaslab_largest_allocatable() is relied on for that.
  */
 static uint64_t
 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
 {
 	zfs_btree_t *t = &msp->ms_allocatable->rt_root;
 	zfs_range_tree_t *rt = msp->ms_allocatable;
 	zfs_btree_index_t where;
 	zfs_range_seg_t *rs;
-	range_seg_max_t rsearch;
+	zfs_range_seg_max_t rsearch;
 	uint64_t hbit = highbit64(size);
 	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
 	uint64_t max_size = metaslab_largest_allocatable(msp);
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/* Nothing in this metaslab can satisfy the request. */
 	if (max_size < size)
 		return (-1ULL);
 
 	/* First try the segment at the cursor for this size class. */
 	zfs_rs_set_start(&rsearch, rt, *cursor);
 	zfs_rs_set_end(&rsearch, rt, *cursor + size);
 
 	rs = zfs_btree_find(t, &rsearch, &where);
 	if (rs == NULL || (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) <
 	    size) {
 		/* Fall back to the size tree, keyed by the clump size. */
 		t = &msp->ms_allocatable_by_size;
 
 		zfs_rs_set_start(&rsearch, rt, 0);
 		zfs_rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit +
 		    metaslab_ndf_clump_shift)));
 
 		rs = zfs_btree_find(t, &rsearch, &where);
 		if (rs == NULL)
 			rs = zfs_btree_next(t, &where, &where);
 		ASSERT(rs != NULL);
 	}
 
 	if ((zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) >= size) {
 		*cursor = zfs_rs_get_start(rs, rt) + size;
 		return (zfs_rs_get_start(rs, rt));
 	}
 	return (-1ULL);
 }
 
 /*
  * ==========================================================================
  * Metaslabs
  * ==========================================================================
  */
 
 /*
  * Block on ms_load_cv until no load of this metaslab is in progress.
  * The caller must hold ms_lock, which cv_wait() is given as its mutex.
  */
 static void
 metaslab_load_wait(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	for (;;) {
 		if (!msp->ms_loading)
 			break;
 
 		ASSERT(!msp->ms_loaded);
 		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
 	}
 }
 
 /*
  * Block on ms_flush_cv until no flush of this metaslab is in progress.
  * The caller must hold ms_lock, which cv_wait() is given as its mutex.
  */
 static void
 metaslab_flush_wait(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if (!msp->ms_flushing)
 		return;
 
 	do {
 		cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
 	} while (msp->ms_flushing);
 }
 
 /*
  * Multilist index callback: map a metaslab ("arg") to a sublist of "ml"
  * from its ms_id.
  */
 static unsigned int
 metaslab_idx_func(multilist_t *ml, void *arg)
 {
 	metaslab_t *msp = arg;
 
 	/*
 	 * ms_id values are handed out sequentially, so truncating the id to
 	 * 32 bits first avoids a needless full 64bit division.
 	 */
 	unsigned int id32 = (unsigned int)msp->ms_id;
 
 	return (id32 % multilist_get_num_sublists(ml));
 }
 
 /* Return the ms_allocated_space counter of "msp". */
 uint64_t
 metaslab_allocated_space(metaslab_t *msp)
 {
 	uint64_t allocated = msp->ms_allocated_space;
 
 	return (allocated);
 }
 
 /*
  * Cross-check the metaslab's space accounting: the free space implied by
  * metaslab_allocated_space() must equal the free space found in the in-core
  * range trees. Does nothing unless ZFS_DEBUG_METASLAB_VERIFY is set in
  * zfs_flags. The caller must hold ms_lock.
  */
 static void
 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(!msp->ms_condensing);
 
 	if (!(zfs_flags & ZFS_DEBUG_METASLAB_VERIFY))
 		return;
 
 	/*
 	 * Verification is only meaningful from syncing context, on a
 	 * loaded metaslab that has an allocated space map. In non-syncing
 	 * context allocations are being made for future txgs, so the view
 	 * of the metaslab would not be consistent.
 	 */
 	boolean_t verifiable = (txg == spa_syncing_txg(spa) &&
 	    msp->ms_sm != NULL && msp->ms_loaded);
 	if (!verifiable)
 		return;
 
 	/*
 	 * The smp_alloc field can in general go negative, but that should
 	 * never happen for a metaslab's own space map.
 	 */
 	ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
 
 	ASSERT3U(space_map_allocated(msp->ms_sm), >=,
 	    zfs_range_tree_space(msp->ms_unflushed_frees));
 
 	ASSERT3U(metaslab_allocated_space(msp), ==,
 	    space_map_allocated(msp->ms_sm) +
 	    zfs_range_tree_space(msp->ms_unflushed_allocs) -
 	    zfs_range_tree_space(msp->ms_unflushed_frees));
 
 	uint64_t expected_free = msp->ms_size - metaslab_allocated_space(msp);
 
 	/*
 	 * Space set aside for future allocations has already been taken
 	 * out of ms_allocatable, so it has to be counted separately.
 	 */
 	uint64_t pending = 0;
 	for (int i = 0; i < TXG_CONCURRENT_STATES; i++) {
 		uint64_t slot = (txg + i) & TXG_MASK;
 
 		pending += zfs_range_tree_space(msp->ms_allocating[slot]);
 	}
 	ASSERT3U(pending + msp->ms_allocated_this_txg, ==,
 	    msp->ms_allocating_total);
 
 	ASSERT3U(msp->ms_deferspace, ==,
 	    zfs_range_tree_space(msp->ms_defer[0]) +
 	    zfs_range_tree_space(msp->ms_defer[1]));
 
 	uint64_t observed_free = zfs_range_tree_space(msp->ms_allocatable) +
 	    pending + msp->ms_deferspace +
 	    zfs_range_tree_space(msp->ms_freed);
 
 	VERIFY3U(expected_free, ==, observed_free);
 }
 
 /*
  * Zero ms_synchist and every ms_deferhist[] entry of a loaded metaslab.
  */
 static void
 metaslab_aux_histograms_clear(metaslab_t *msp)
 {
 	/*
 	 * The auxiliary histograms are only cleared as part of a reset,
 	 * and a reset can only happen while the metaslab is loaded.
 	 */
 	ASSERT(msp->ms_loaded);
 
 	memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
 
 	int slot = 0;
 	while (slot < TXG_DEFER_SIZE) {
 		memset(msp->ms_deferhist[slot], 0,
 		    sizeof (msp->ms_deferhist[slot]));
 		slot++;
 	}
 }
 
 /*
  * Add the histogram of range tree "rt" into "histogram", which has
  * SPACE_MAP_HISTOGRAM_SIZE buckets. Range tree bucket i (for i >= shift)
  * lands in bucket i - shift; once the last bucket is reached, all remaining
  * range tree buckets are scaled by a left shift and accumulated there.
  */
 static void
 metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
     zfs_range_tree_t *rt)
 {
 	/*
 	 * This is modeled after space_map_histogram_add(), so refer to that
 	 * function for implementation details. We want this to work like
 	 * the space map histogram, and not the range tree histogram, as we
 	 * are essentially constructing a delta that will be later subtracted
 	 * from the space map histogram.
 	 */
 	int idx = 0;
-	for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+	for (int i = shift; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) {
 		ASSERT3U(i, >=, idx + shift);
 		/* The shift amount is 0 until idx stops at the last bucket. */
 		histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
 
 		/* Advance the destination bucket until the last one. */
 		if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
 			ASSERT3U(idx + shift, ==, i);
 			idx++;
 			ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
 		}
 	}
 }
 
 /*
  * Called at every sync pass in which the metaslab gets synced.
  *
  * The auxiliary histograms must be refreshed wherever the metaslab's space
  * map histogram is refreshed, so that we stay consistent about which parts
  * of the space map's histogram are currently unavailable for allocation
  * (e.g. because they sit in the defer, freed, and freeing trees).
  */
 static void
 metaslab_aux_histograms_update(metaslab_t *msp)
 {
 	space_map_t *smap = msp->ms_sm;
 	ASSERT(smap != NULL);
 
 	/*
 	 * This mirrors the space map histogram updates performed in
 	 * metaslab_sync(), except that only segments which have not yet
 	 * reached the ms_allocatable tree are of interest here.
 	 */
 	if (msp->ms_loaded) {
 		metaslab_aux_histograms_clear(msp);
 
 		metaslab_aux_histogram_add(msp->ms_synchist,
 		    smap->sm_shift, msp->ms_freed);
 
 		int slot = 0;
 		while (slot < TXG_DEFER_SIZE) {
 			metaslab_aux_histogram_add(msp->ms_deferhist[slot],
 			    smap->sm_shift, msp->ms_defer[slot]);
 			slot++;
 		}
 	}
 
 	/* ms_freeing is folded in whether or not the metaslab is loaded. */
 	metaslab_aux_histogram_add(msp->ms_synchist,
 	    smap->sm_shift, msp->ms_freeing);
 }
 
 /*
  * Called every time we finish syncing (writing to) the metaslab, i.e. at
  * the end of each sync pass. Moves ms_synchist into the ms_deferhist[] slot
  * for the syncing txg (or zeroes that slot when deferring is not allowed)
  * and then clears ms_synchist.
  * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
  */
 static void
 metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	/*
 	 * No space map: we got here from metaslab_init() while creating or
 	 * opening a pool, for a metaslab that has had no allocations yet.
 	 */
 	if (msp->ms_sm == NULL)
 		return;
 
 	/*
 	 * This parallels what metaslab_sync_done() does with the ms_freed
 	 * and ms_defer trees.
 	 */
 	uint64_t slot = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
 
 	if (!defer_allowed) {
 		memset(msp->ms_deferhist[slot], 0,
 		    sizeof (msp->ms_deferhist[slot]));
 	} else {
 		memcpy(msp->ms_deferhist[slot], msp->ms_synchist,
 		    sizeof (msp->ms_synchist));
 	}
 
 	memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
 }
 
 /*
  * Ensure that the metaslab's weight and fragmentation are consistent
  * with the contents of the histogram (either the range tree's histogram
  * or the space map's depending whether the metaslab is loaded).
  *
  * Does nothing unless ZFS_DEBUG_METASLAB_VERIFY is set in zfs_flags. The
  * caller must hold ms_lock. The check works by recomputing the weight with
  * metaslab_weight() and comparing it against the values saved beforehand.
  */
 static void
 metaslab_verify_weight_and_frag(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
 		return;
 
 	/*
 	 * We can end up here from vdev_remove_complete(), in which case we
 	 * cannot do these assertions because we hold spa config locks and
 	 * thus we are not allowed to read from the DMU.
 	 *
 	 * We check if the metaslab group has been removed and if that's
 	 * the case we return immediately as that would mean that we are
 	 * here from the aforementioned code path.
 	 */
 	if (msp->ms_group == NULL)
 		return;
 
 	/*
 	 * Devices being removed always return a weight of 0 and leave
 	 * fragmentation and ms_max_size as is - there is nothing for
 	 * us to verify here.
 	 */
 	vdev_t *vd = msp->ms_group->mg_vd;
 	if (vd->vdev_removing)
 		return;
 
 	/*
 	 * If the metaslab is dirty it probably means that we've done
 	 * some allocations or frees that have changed our histograms
 	 * and thus the weight.
 	 */
 	for (int t = 0; t < TXG_SIZE; t++) {
 		if (txg_list_member(&vd->vdev_ms_list, msp, t))
 			return;
 	}
 
 	/*
 	 * This verification checks that our in-memory state is consistent
 	 * with what's on disk. If the pool is read-only then there aren't
 	 * any changes and we just have the initially-loaded state.
 	 */
 	if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
 		return;
 
 	/* some extra verification for in-core tree if you can */
 	if (msp->ms_loaded) {
 		zfs_range_tree_stat_verify(msp->ms_allocatable);
 		VERIFY(space_map_histogram_verify(msp->ms_sm,
 		    msp->ms_allocatable));
 	}
 
 	/*
 	 * Save the current values so they can be compared against (or
 	 * restored after) the recalculation below.
 	 */
 	uint64_t weight = msp->ms_weight;
 	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
 	boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
 	uint64_t frag = msp->ms_fragmentation;
 	uint64_t max_segsize = msp->ms_max_size;
 
 	msp->ms_weight = 0;
 	msp->ms_fragmentation = 0;
 
 	/*
 	 * This function is used for verification purposes and thus should
 	 * not introduce any side-effects/mutations on the system's state.
 	 *
 	 * Regardless of whether metaslab_weight() thinks this metaslab
 	 * should be active or not, we want to ensure that the actual weight
 	 * (and therefore the value of ms_weight) would be the same if it
 	 * was to be recalculated at this point.
 	 *
 	 * In addition we set the nodirty flag so metaslab_weight() does
 	 * not dirty the metaslab for future TXGs (e.g. when trying to
 	 * force condensing to upgrade the metaslab spacemaps).
 	 */
 	msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active;
 
 	VERIFY3U(max_segsize, ==, msp->ms_max_size);
 
 	/*
 	 * If the weight type changed then there is no point in doing
 	 * verification. Revert fields to their original values.
 	 */
 	if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
 	    (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
 		msp->ms_fragmentation = frag;
 		msp->ms_weight = weight;
 		return;
 	}
 
 	/* Same weight type: the recomputed values must match the saved ones. */
 	VERIFY3U(msp->ms_fragmentation, ==, frag);
 	VERIFY3U(msp->ms_weight, ==, weight);
 }
 
 /*
  * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from
  * this class that was used longest ago, and attempt to unload it.  We don't
  * want to spend too much time in this loop to prevent performance
  * degradation, and we expect that most of the time this operation will
  * succeed. Between that and the normal unloading processing during txg sync,
  * we expect this to keep the metaslab memory usage under control.
  *
  * Memory use is measured as the number of in-use zfs_btree_leaf_cache
  * entries times their size, compared against zfs_metaslab_mem_limit percent
  * of arc_all_memory(). Outside of _KERNEL builds this function is a no-op.
  */
 static void
 metaslab_potentially_evict(metaslab_class_t *mc)
 {
 #ifdef _KERNEL
 	uint64_t allmem = arc_all_memory();
 	uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
 	uint64_t size =	spl_kmem_cache_entry_size(zfs_btree_leaf_cache);
 	uint_t tries = 0;
 	/*
 	 * Each try picks a random sublist; the number of tries is capped at
 	 * twice the number of sublists.
 	 */
 	for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
 	    tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2;
 	    tries++) {
 		unsigned int idx = multilist_get_random_index(
 		    &mc->mc_metaslab_txg_list);
 		multilist_sublist_t *mls =
 		    multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx);
 		metaslab_t *msp = multilist_sublist_head(mls);
 		/*
 		 * The sublist lock is dropped here and re-taken at the top of
 		 * every iteration of the inner loop.
 		 */
 		multilist_sublist_unlock(mls);
 		while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
 		    inuse * size) {
 			VERIFY3P(mls, ==, multilist_sublist_lock_idx(
 			    &mc->mc_metaslab_txg_list, idx));
 			ASSERT3U(idx, ==,
 			    metaslab_idx_func(&mc->mc_metaslab_txg_list, msp));
 
 			/*
 			 * msp is no longer linked on the list, so its
 			 * successor cannot be looked up; give up on this
 			 * sublist.
 			 */
 			if (!multilist_link_active(&msp->ms_class_txg_node)) {
 				multilist_sublist_unlock(mls);
 				break;
 			}
 			metaslab_t *next_msp = multilist_sublist_next(mls, msp);
 			multilist_sublist_unlock(mls);
 			/*
 			 * If the metaslab is currently loading there are two
 			 * cases. If it's the metaslab we're evicting, we
 			 * can't continue on or we'll panic when we attempt to
 			 * recursively lock the mutex. If it's another
 			 * metaslab that's loading, it can be safely skipped,
 			 * since we know it's very new and therefore not a
 			 * good eviction candidate. We check later once the
 			 * lock is held that the metaslab is fully loaded
 			 * before actually unloading it.
 			 */
 			if (msp->ms_loading) {
 				msp = next_msp;
 				inuse =
 				    spl_kmem_cache_inuse(zfs_btree_leaf_cache);
 				continue;
 			}
 			/*
 			 * We can't unload metaslabs with no spacemap because
 			 * they're not ready to be unloaded yet. We can't
 			 * unload metaslabs with outstanding allocations
 			 * because doing so could cause the metaslab's weight
 			 * to decrease while it's unloaded, which violates an
 			 * invariant that we use to prevent unnecessary
 			 * loading. We also don't unload metaslabs that are
 			 * currently active because they are high-weight
 			 * metaslabs that are likely to be used in the near
 			 * future.
 			 */
 			mutex_enter(&msp->ms_lock);
 			if (msp->ms_allocator == -1 && msp->ms_sm != NULL &&
 			    msp->ms_allocating_total == 0) {
 				metaslab_unload(msp);
 			}
 			mutex_exit(&msp->ms_lock);
 			msp = next_msp;
 			/* Re-sample usage so both loop conditions see it. */
 			inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
 		}
 	}
 #else
 	/* Reference the otherwise-unused names in non-kernel builds. */
 	(void) mc, (void) zfs_metaslab_mem_limit;
 #endif
 }
 
 /*
  * Do the actual work of loading a metaslab's ms_allocatable tree.
  *
  * Must be called with ms_lock held and ms_loading set (both asserted).
  * ms_lock is dropped while the space map is read and then re-taken after
  * ms_sync_lock. On return ms_lock is held again and ms_sync_lock has been
  * released. Returns 0 on success, or the error from space_map_load_length(),
  * in which case ms_loaded is left unset.
  */
 static int
 metaslab_load_impl(metaslab_t *msp)
 {
 	int error = 0;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loading);
 	ASSERT(!msp->ms_condensing);
 
 	/*
 	 * We temporarily drop the lock to unblock other operations while we
 	 * are reading the space map. Therefore, metaslab_sync() and
 	 * metaslab_sync_done() can run at the same time as we do.
 	 *
 	 * If we are using the log space maps, metaslab_sync() can't write to
 	 * the metaslab's space map while we are loading as we only write to
 	 * it when we are flushing the metaslab, and that can't happen while
 	 * we are loading it.
 	 *
 	 * If we are not using log space maps though, metaslab_sync() can
 	 * append to the space map while we are loading. Therefore we load
 	 * only entries that existed when we started the load. Additionally,
 	 * metaslab_sync_done() has to wait for the load to complete because
 	 * there are potential races like metaslab_load() loading parts of the
 	 * space map that are currently being appended by metaslab_sync(). If
 	 * we didn't, the ms_allocatable would have entries that
 	 * metaslab_sync_done() would try to re-add later.
 	 *
 	 * That's why before dropping the lock we remember the synced length
 	 * of the metaslab and read up to that point of the space map,
 	 * ignoring entries appended by metaslab_sync() that happen after we
 	 * drop the lock.
 	 */
 	uint64_t length = msp->ms_synced_length;
 	mutex_exit(&msp->ms_lock);
 
 	hrtime_t load_start = gethrtime();
 	metaslab_rt_arg_t *mrap;
 	if (msp->ms_allocatable->rt_arg == NULL) {
 		mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
 	} else {
 		/*
 		 * Reuse the existing argument and detach the ops and the
 		 * argument from the tree; both are re-attached below once
 		 * metaslab_rt_create() has been called.
 		 */
 		mrap = msp->ms_allocatable->rt_arg;
 		msp->ms_allocatable->rt_ops = NULL;
 		msp->ms_allocatable->rt_arg = NULL;
 	}
 	mrap->mra_bt = &msp->ms_allocatable_by_size;
 	mrap->mra_floor_shift = metaslab_by_size_min_shift;
 
 	if (msp->ms_sm != NULL) {
 		error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
 		    SM_FREE, length);
 
 		/* Now, populate the size-sorted tree. */
 		metaslab_rt_create(msp->ms_allocatable, mrap);
 		msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
 		msp->ms_allocatable->rt_arg = mrap;
 
 		struct mssa_arg arg = {0};
 		arg.rt = msp->ms_allocatable;
 		arg.mra = mrap;
 		zfs_range_tree_walk(msp->ms_allocatable,
 		    metaslab_size_sorted_add, &arg);
 	} else {
 		/*
 		 * Add the size-sorted tree first, since we don't need to load
 		 * the metaslab from the spacemap.
 		 */
 		metaslab_rt_create(msp->ms_allocatable, mrap);
 		msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
 		msp->ms_allocatable->rt_arg = mrap;
 		/*
 		 * The space map has not been allocated yet, so treat
 		 * all the space in the metaslab as free and add it to the
 		 * ms_allocatable tree.
 		 */
 		zfs_range_tree_add(msp->ms_allocatable,
 		    msp->ms_start, msp->ms_size);
 
 		if (msp->ms_new) {
 			/*
 			 * If the ms_sm doesn't exist, this means that this
 			 * metaslab hasn't gone through metaslab_sync() and
 			 * thus has never been dirtied. So we shouldn't
 			 * expect any unflushed allocs or frees from previous
 			 * TXGs.
 			 */
 			ASSERT(zfs_range_tree_is_empty(
 			    msp->ms_unflushed_allocs));
 			ASSERT(zfs_range_tree_is_empty(
 			    msp->ms_unflushed_frees));
 		}
 	}
 
 	/*
 	 * We need to grab the ms_sync_lock to prevent metaslab_sync() from
 	 * changing the ms_sm (or log_sm) and the metaslab's range trees
 	 * while we are about to use them and populate the ms_allocatable.
 	 * The ms_lock is insufficient for this because metaslab_sync() doesn't
 	 * hold the ms_lock while writing the ms_checkpointing tree to disk.
 	 */
 	mutex_enter(&msp->ms_sync_lock);
 	mutex_enter(&msp->ms_lock);
 
 	ASSERT(!msp->ms_condensing);
 	ASSERT(!msp->ms_flushing);
 
 	/* On a load error, return with ms_lock still held. */
 	if (error != 0) {
 		mutex_exit(&msp->ms_sync_lock);
 		return (error);
 	}
 
 	ASSERT3P(msp->ms_group, !=, NULL);
 	msp->ms_loaded = B_TRUE;
 
 	/*
 	 * Apply all the unflushed changes to ms_allocatable right
 	 * away so any manipulations we do below have a clear view
 	 * of what is allocated and what is free.
 	 */
 	zfs_range_tree_walk(msp->ms_unflushed_allocs,
 	    zfs_range_tree_remove, msp->ms_allocatable);
 	zfs_range_tree_walk(msp->ms_unflushed_frees,
 	    zfs_range_tree_add, msp->ms_allocatable);
 
 	ASSERT3P(msp->ms_group, !=, NULL);
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	if (spa_syncing_log_sm(spa) != NULL) {
 		ASSERT(spa_feature_is_enabled(spa,
 		    SPA_FEATURE_LOG_SPACEMAP));
 
 		/*
 		 * If we use a log space map we add all the segments
 		 * that are in ms_unflushed_frees so they are available
 		 * for allocation.
 		 *
 		 * ms_allocatable needs to contain all free segments
 		 * that are ready for allocations (thus not segments
 		 * from ms_freeing, ms_freed, and the ms_defer trees).
 		 * But if we grab the lock in this code path at a sync
 		 * pass later than 1, then it also contains the
 		 * segments of ms_freed (they were added to it earlier
 		 * in this path through ms_unflushed_frees). So we
 		 * need to remove all the segments that exist in
 		 * ms_freed from ms_allocatable as they will be added
 		 * later in metaslab_sync_done().
 		 *
 		 * When there's no log space map, the ms_allocatable
 		 * correctly doesn't contain any segments that exist
 		 * in ms_freed [see ms_synced_length].
 		 */
 		zfs_range_tree_walk(msp->ms_freed,
 		    zfs_range_tree_remove, msp->ms_allocatable);
 	}
 
 	/*
 	 * If we are not using the log space map, ms_allocatable
 	 * contains the segments that exist in the ms_defer trees
 	 * [see ms_synced_length]. Thus we need to remove them
 	 * from ms_allocatable as they will be added again in
 	 * metaslab_sync_done().
 	 *
 	 * If we are using the log space map, ms_allocatable still
 	 * contains the segments that exist in the ms_defer trees.
 	 * Not because it read them through the ms_sm though. But
 	 * because these segments are part of ms_unflushed_frees
 	 * whose segments we add to ms_allocatable earlier in this
 	 * code path.
 	 */
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		zfs_range_tree_walk(msp->ms_defer[t],
 		    zfs_range_tree_remove, msp->ms_allocatable);
 	}
 
 	/*
 	 * Call metaslab_recalculate_weight_and_sort() now that the
 	 * metaslab is loaded so we get the metaslab's real weight.
 	 *
 	 * Unless this metaslab was created with older software and
 	 * has not yet been converted to use segment-based weight, we
 	 * expect the new weight to be better or equal to the weight
 	 * that the metaslab had while it was not loaded. This is
 	 * because the old weight does not take into account the
 	 * consolidation of adjacent segments between TXGs. [see
 	 * comment for ms_synchist and ms_deferhist[] for more info]
 	 */
 	uint64_t weight = msp->ms_weight;
 	uint64_t max_size = msp->ms_max_size;
 	metaslab_recalculate_weight_and_sort(msp);
 	if (!WEIGHT_IS_SPACEBASED(weight))
 		ASSERT3U(weight, <=, msp->ms_weight);
 	msp->ms_max_size = metaslab_largest_allocatable(msp);
 	ASSERT3U(max_size, <=, msp->ms_max_size);
 	hrtime_t load_end = gethrtime();
 	msp->ms_load_time = load_end;
 	/*
 	 * NOTE(review): "unloaded time %llu" is paired with a (longlong_t)
 	 * argument and "max size error %lld" with a (u_longlong_t) one.
 	 * In the latter the cast binds to ms_max_size only, before the
 	 * subtraction. Confirm whether the specifiers and casts should be
 	 * made consistent.
 	 */
 	zfs_dbgmsg("metaslab_load: txg %llu, spa %s, vdev_id %llu, "
 	    "ms_id %llu, smp_length %llu, "
 	    "unflushed_allocs %llu, unflushed_frees %llu, "
 	    "freed %llu, defer %llu + %llu, unloaded time %llu ms, "
 	    "loading_time %lld ms, ms_max_size %llu, "
 	    "max size error %lld, "
 	    "old_weight %llx, new_weight %llx",
 	    (u_longlong_t)spa_syncing_txg(spa), spa_name(spa),
 	    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
 	    (u_longlong_t)msp->ms_id,
 	    (u_longlong_t)space_map_length(msp->ms_sm),
 	    (u_longlong_t)zfs_range_tree_space(msp->ms_unflushed_allocs),
 	    (u_longlong_t)zfs_range_tree_space(msp->ms_unflushed_frees),
 	    (u_longlong_t)zfs_range_tree_space(msp->ms_freed),
 	    (u_longlong_t)zfs_range_tree_space(msp->ms_defer[0]),
 	    (u_longlong_t)zfs_range_tree_space(msp->ms_defer[1]),
 	    (longlong_t)((load_start - msp->ms_unload_time) / 1000000),
 	    (longlong_t)((load_end - load_start) / 1000000),
 	    (u_longlong_t)msp->ms_max_size,
 	    (u_longlong_t)msp->ms_max_size - max_size,
 	    (u_longlong_t)weight, (u_longlong_t)msp->ms_weight);
 
 	metaslab_verify_space(msp, spa_syncing_txg(spa));
 	mutex_exit(&msp->ms_sync_lock);
 	return (0);
 }
 
 /*
  * Load "msp" if it is not loaded already. The caller must hold ms_lock,
  * which is held again on return. Returns 0 if the metaslab was (or became)
  * loaded, otherwise the error reported by metaslab_load_impl().
  */
 int
 metaslab_load(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * Another thread may already be loading this metaslab; if so, wait
 	 * for it to finish and return when it has completed the load.
 	 */
 	metaslab_load_wait(msp);
 	if (msp->ms_loaded)
 		return (0);
 	VERIFY(!msp->ms_loading);
 	ASSERT(!msp->ms_condensing);
 
 	/*
 	 * The loading flag is raised BEFORE we potentially drop the lock to
 	 * wait for an ongoing flush (see ms_flushing below), so that other
 	 * threads can tell a load of this metaslab is already under way.
 	 */
 	msp->ms_loading = B_TRUE;
 
 	/*
 	 * Let any in-progress flush finish first: the ms_lock gets dropped
 	 * both here (during space_map_load()) and in metaslab_flush() (when
 	 * changes are flushed to the ms_sm).
 	 */
 	if (msp->ms_flushing)
 		metaslab_flush_wait(msp);
 
 	/*
 	 * Waiting for the flush may have dropped the ms_lock temporarily;
 	 * make sure nobody else managed to load the metaslab meanwhile.
 	 */
 	ASSERT(!msp->ms_loaded);
 
 	/*
 	 * When loading a metaslab of the normal class, consider evicting
 	 * another one to stay under the memory limit set by the
 	 * zfs_metaslab_mem_limit tunable.
 	 */
 	metaslab_class_t *mc = msp->ms_group->mg_class;
 	if (spa_normal_class(mc->mc_spa) == mc)
 		metaslab_potentially_evict(mc);
 
 	int rc = metaslab_load_impl(msp);
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	msp->ms_loading = B_FALSE;
 	cv_broadcast(&msp->ms_load_cv);
 
 	return (rc);
 }
 
 /*
  * Unload "msp": vacate ms_allocatable, clear the loaded and active state,
  * and, when the metaslab still belongs to a group, take it off the class's
  * txg list and recalculate its weight. The caller must hold ms_lock.
  */
 void
 metaslab_unload(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * Already unloaded. This can happen when a metaslab is selected for
 	 * eviction (in metaslab_potentially_evict) and then gets unloaded
 	 * during spa_sync (via metaslab_class_evict_old).
 	 */
 	if (!msp->ms_loaded)
 		return;
 
 	zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL);
 	msp->ms_loaded = B_FALSE;
 	msp->ms_unload_time = gethrtime();
 
 	msp->ms_activation_weight = 0;
 	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
 
 	/*
 	 * A NULL ms_group means we came here from metaslab_fini(), at which
 	 * point neither the list removal nor the weight recalculation and
 	 * sorting below make sense.
 	 */
 	if (msp->ms_group == NULL)
 		return;
 
 	metaslab_class_t *mc = msp->ms_group->mg_class;
 	multilist_sublist_t *sublist =
 	    multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
 	if (multilist_link_active(&msp->ms_class_txg_node))
 		multilist_sublist_remove(sublist, msp);
 	multilist_sublist_unlock(sublist);
 
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	zfs_dbgmsg("metaslab_unload: txg %llu, spa %s, vdev_id %llu, "
 	    "ms_id %llu, weight %llx, "
 	    "selected txg %llu (%llu ms ago), alloc_txg %llu, "
 	    "loaded %llu ms ago, max_size %llu",
 	    (u_longlong_t)spa_syncing_txg(spa), spa_name(spa),
 	    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
 	    (u_longlong_t)msp->ms_id,
 	    (u_longlong_t)msp->ms_weight,
 	    (u_longlong_t)msp->ms_selected_txg,
 	    (u_longlong_t)(msp->ms_unload_time -
 	    msp->ms_selected_time) / 1000 / 1000,
 	    (u_longlong_t)msp->ms_alloc_txg,
 	    (u_longlong_t)(msp->ms_unload_time -
 	    msp->ms_load_time) / 1000 / 1000,
 	    (u_longlong_t)msp->ms_max_size);
 
 	/*
 	 * Explicitly recalculate the weight from the space map, now that
 	 * the metaslab is no longer loaded. Unloaded metaslabs should always
 	 * have their weight derived from the space map histograms, while
 	 * loaded ones derive it from their in-core range tree
 	 * [see metaslab_load()]. This way the weight reflects whatever
 	 * information is available in-core, loaded or not.
 	 */
 	metaslab_recalculate_weight_and_sort(msp);
 }
 
 /*
  * Pick the segment representation for a metaslab's range trees.
  *
  * To keep the per-metaslab range trees small, segments can be stored in
  * units of sectors, zero-indexed from the start of the metaslab. When
  * vdev_ms_shift - vdev_ashift is below 32, such ranges fit in two uint32_ts
  * rather than two uint64_ts. Setting zfs_metaslab_force_large_segs disables
  * the compact form.
  *
  * On return *start and *shift hold the values to create the trees with:
  * (ms_start, vdev_ashift) for the 32-bit form, (0, 0) for the 64-bit form.
  */
 zfs_range_seg_type_t
 metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp,
     uint64_t *start, uint64_t *shift)
 {
 	/* Wide segments: either forced, or the offsets would not fit. */
 	if (zfs_metaslab_force_large_segs ||
 	    vdev->vdev_ms_shift - vdev->vdev_ashift >= 32) {
 		*shift = 0;
 		*start = 0;
 		return (ZFS_RANGE_SEG64);
 	}
 
 	*shift = vdev->vdev_ashift;
 	*start = msp->ms_start;
 	return (ZFS_RANGE_SEG32);
 }
 
 /*
  * Record that `msp' was selected in `txg' (stamping the current time) and
  * place it at the tail of its class's txg multilist, removing any existing
  * link first. The caller must hold ms_lock.
  */
 void
 metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
 {
 	multilist_sublist_t *sublist;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	sublist = multilist_sublist_lock_obj(
 	    &msp->ms_group->mg_class->mc_metaslab_txg_list, msp);
 
 	/* Unlink first so the insert below always lands at the tail. */
 	if (multilist_link_active(&msp->ms_class_txg_node))
 		multilist_sublist_remove(sublist, msp);
 
 	msp->ms_selected_txg = txg;
 	msp->ms_selected_time = gethrtime();
 
 	multilist_sublist_insert_tail(sublist, msp);
 	multilist_sublist_unlock(sublist);
 }
 
 /*
  * Apply a space accounting change to both the vdev `vd' and the metaslab
  * class `mc'. `vd' is asserted to be a direct child of the root vdev with at
  * least one metaslab. The class additionally receives the deflated form of
  * `space_delta', as computed by vdev_deflated_space().
  */
 void
 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
     int64_t defer_delta, int64_t space_delta)
 {
 	vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
 
 	ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
 	ASSERT(vd->vdev_ms_count != 0);
 
 	metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
 	    vdev_deflated_space(vd, space_delta));
 }
 
 /*
  * Allocate and initialize the in-core state for metaslab `id' of the vdev
  * behind `mg', add it to the group, and hand it back through `msp'.
  *
  * `object' is the space map object to open, or 0 if there is none to open.
  * `txg' is the txg in which the metaslab is being set up; values up to
  * TXG_INITIAL make the space available immediately (see below).
  *
  * Returns 0 on success. If the space map cannot be opened, the error from
  * space_map_open() is returned, `*msp' is left untouched, and the locks,
  * condition variables and memory set up so far are released.
  */
 int
 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
     uint64_t txg, metaslab_t **msp)
 {
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	metaslab_t *ms;
 	int error;
 
 	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
 	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
 	multilist_link_init(&ms->ms_class_txg_node);
 
 	ms->ms_id = id;
 	ms->ms_start = id << vd->vdev_ms_shift;
 	ms->ms_size = 1ULL << vd->vdev_ms_shift;
 	ms->ms_allocator = -1;
 	ms->ms_new = B_TRUE;
 
 	/* Let the vdev type adjust the metaslab's start and size. */
 	vdev_ops_t *ops = vd->vdev_ops;
 	if (ops->vdev_op_metaslab_init != NULL)
 		ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size);
 
 	/*
 	 * We only open space map objects that already exist. All others
 	 * will be opened when we finally allocate an object for it. For
 	 * readonly pools there is no need to open the space map object.
 	 *
 	 * Note:
 	 * When called from vdev_expand(), we can't call into the DMU as
 	 * we are holding the spa_config_lock as a writer and we would
 	 * deadlock [see relevant comment in vdev_metaslab_init()]. in
 	 * that case, the object parameter is zero though, so we won't
 	 * call into the DMU.
 	 */
 	if (object != 0 && !(spa->spa_mode == SPA_MODE_READ &&
 	    !spa->spa_read_spacemaps)) {
 		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
 		    ms->ms_size, vd->vdev_ashift);
 
 		if (error != 0) {
 			/*
 			 * Nothing else has seen this metaslab yet, so undo
 			 * the initialization above in reverse order before
 			 * freeing it; the synchronization objects must not
 			 * be left initialized inside freed memory.
 			 */
 			cv_destroy(&ms->ms_flush_cv);
 			cv_destroy(&ms->ms_load_cv);
 			mutex_destroy(&ms->ms_sync_lock);
 			mutex_destroy(&ms->ms_lock);
 			kmem_free(ms, sizeof (metaslab_t));
 			return (error);
 		}
 
 		ASSERT(ms->ms_sm != NULL);
 		ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
 	}
 
 	/* All of this metaslab's range trees share one segment layout. */
 	uint64_t shift, start;
 	zfs_range_seg_type_t type =
 	    metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
 
 	ms->ms_allocatable = zfs_range_tree_create(NULL, type, NULL, start,
 	    shift);
 	for (int t = 0; t < TXG_SIZE; t++) {
 		ms->ms_allocating[t] = zfs_range_tree_create(NULL, type,
 		    NULL, start, shift);
 	}
 	ms->ms_freeing = zfs_range_tree_create(NULL, type, NULL, start, shift);
 	ms->ms_freed = zfs_range_tree_create(NULL, type, NULL, start, shift);
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		ms->ms_defer[t] = zfs_range_tree_create(NULL, type, NULL,
 		    start, shift);
 	}
 	ms->ms_checkpointing =
 	    zfs_range_tree_create(NULL, type, NULL, start, shift);
 	ms->ms_unflushed_allocs =
 	    zfs_range_tree_create(NULL, type, NULL, start, shift);
 
 	/*
 	 * The unflushed frees tree is the only one created with
 	 * metaslab_rt_ops; its argument points at the by-size btree.
 	 */
 	metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
 	mrap->mra_bt = &ms->ms_unflushed_frees_by_size;
 	mrap->mra_floor_shift = metaslab_by_size_min_shift;
 	ms->ms_unflushed_frees = zfs_range_tree_create(&metaslab_rt_ops,
 	    type, mrap, start, shift);
 
 	ms->ms_trim = zfs_range_tree_create(NULL, type, NULL, start, shift);
 
 	metaslab_group_add(mg, ms);
 	metaslab_set_fragmentation(ms, B_FALSE);
 
 	/*
 	 * If we're opening an existing pool (txg == 0) or creating
 	 * a new one (txg == TXG_INITIAL), all space is available now.
 	 * If we're adding space to an existing pool, the new space
 	 * does not become available until after this txg has synced.
 	 * The metaslab's weight will also be initialized when we sync
 	 * out this txg. This ensures that we don't attempt to allocate
 	 * from it before we have initialized it completely.
 	 */
 	if (txg <= TXG_INITIAL) {
 		metaslab_sync_done(ms, 0);
 		metaslab_space_update(vd, mg->mg_class,
 		    metaslab_allocated_space(ms), 0, 0);
 	}
 
 	if (txg != 0) {
 		vdev_dirty(vd, 0, NULL, txg);
 		vdev_dirty(vd, VDD_METASLAB, ms, txg);
 	}
 
 	*msp = ms;
 
 	return (0);
 }
 
 /*
  * Release the flush-tracking state of `msp' on behalf of metaslab_fini().
  *
  * A metaslab whose unflushed txg is 0 is expected to be absent from
  * spa_metaslabs_by_flushed and needs nothing. Otherwise it is removed from
  * that tree (under spa_flushed_ms_lock) and its contribution to the log
  * space map and log summary metaslab counts is given back.
  */
 static void
 metaslab_fini_flush_data(metaslab_t *msp)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	avl_tree_t *flushed = &spa->spa_metaslabs_by_flushed;
 
 	if (metaslab_unflushed_txg(msp) != 0) {
 		ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 		mutex_enter(&spa->spa_flushed_ms_lock);
 		avl_remove(flushed, msp);
 		mutex_exit(&spa->spa_flushed_ms_lock);
 
 		spa_log_sm_decrement_mscount(spa,
 		    metaslab_unflushed_txg(msp));
 		spa_log_summary_decrement_mscount(spa,
 		    metaslab_unflushed_txg(msp),
 		    metaslab_unflushed_dirty(msp));
 	} else {
 		ASSERT3P(avl_find(flushed, msp, NULL), ==, NULL);
 	}
 }
 
 /*
  * Return the memory attributed to this metaslab's unflushed changes: the
  * combined segment count of the unflushed alloc and free trees, multiplied
  * by the element size recorded in the unflushed-allocs tree's root.
  */
 uint64_t
 metaslab_unflushed_changes_memused(metaslab_t *ms)
 {
 	uint64_t nsegs = zfs_range_tree_numsegs(ms->ms_unflushed_allocs);
 
 	nsegs += zfs_range_tree_numsegs(ms->ms_unflushed_frees);
 	return (nsegs * ms->ms_unflushed_allocs->rt_root.bt_elem_size);
 }
 
 /*
  * Tear down a metaslab and free it: drop its flush-tracking state, remove it
  * from its group, undo its space accounting (unless it is still new), close
  * its space map, unload it, and destroy all of its range trees, locks and
  * condition variables.
  */
 void
 metaslab_fini(metaslab_t *msp)
 {
 	/* Captured up front; ms_group is verified NULL after the removal. */
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 
 	metaslab_fini_flush_data(msp);
 
 	metaslab_group_remove(mg, msp);
 
 	mutex_enter(&msp->ms_lock);
 	VERIFY(msp->ms_group == NULL);
 
 	/*
 	 * If this metaslab hasn't been through metaslab_sync_done() yet its
 	 * space hasn't been accounted for in its vdev and doesn't need to be
 	 * subtracted.
 	 */
 	if (!msp->ms_new) {
 		metaslab_space_update(vd, mg->mg_class,
 		    -metaslab_allocated_space(msp), 0, -msp->ms_size);
 
 	}
 	space_map_close(msp->ms_sm);
 	msp->ms_sm = NULL;
 
 	/* With ms_group NULL this skips the weight recalculation. */
 	metaslab_unload(msp);
 
 	zfs_range_tree_destroy(msp->ms_allocatable);
 	zfs_range_tree_destroy(msp->ms_freeing);
 	zfs_range_tree_destroy(msp->ms_freed);
 
 	/*
 	 * Give back this metaslab's share of the pool-wide unflushed memory
 	 * accounting before the unflushed trees are emptied.
 	 */
 	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
 	    metaslab_unflushed_changes_memused(msp));
 	spa->spa_unflushed_stats.sus_memused -=
 	    metaslab_unflushed_changes_memused(msp);
 	zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
 	zfs_range_tree_destroy(msp->ms_unflushed_allocs);
 	zfs_range_tree_destroy(msp->ms_checkpointing);
 	zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
 	zfs_range_tree_destroy(msp->ms_unflushed_frees);
 
 	for (int t = 0; t < TXG_SIZE; t++) {
 		zfs_range_tree_destroy(msp->ms_allocating[t]);
 	}
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		zfs_range_tree_destroy(msp->ms_defer[t]);
 	}
 	ASSERT0(msp->ms_deferspace);
 
 	/* The metaslab must not be on any of the vdev's per-txg lists. */
 	for (int t = 0; t < TXG_SIZE; t++)
 		ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
 
 	zfs_range_tree_vacate(msp->ms_trim, NULL, NULL);
 	zfs_range_tree_destroy(msp->ms_trim);
 
 	mutex_exit(&msp->ms_lock);
 	cv_destroy(&msp->ms_load_cv);
 	cv_destroy(&msp->ms_flush_cv);
 	mutex_destroy(&msp->ms_lock);
 	mutex_destroy(&msp->ms_sync_lock);
 	/* The metaslab must not be assigned to any allocator. */
 	ASSERT3U(msp->ms_allocator, ==, -1);
 
 	kmem_free(msp, sizeof (metaslab_t));
 }
 
 /*
  * This table defines a segment size based fragmentation metric that will
  * allow each metaslab to derive its own fragmentation value. This is done
  * by calculating the space in each bucket of the spacemap histogram and
  * multiplying that by the fragmentation metric in this table. Doing
  * this for all buckets and dividing it by the total amount of free
  * space in this metaslab (i.e. the total free space in all buckets) gives
  * us the fragmentation metric. This means that a high fragmentation metric
  * equates to most of the free space being comprised of small segments.
  * Conversely, if the metric is low, then most of the free space is in
  * large segments.
  *
  * This table defines 0% fragmented space using 512M segments. Using this value,
  * we derive the rest of the table. This table originally went up to 16MB, but
  * with larger recordsizes, larger ashifts, and use of raidz3, it is possible
  * to have significantly larger allocations than were previously possible.
  * Since the fragmentation value is never stored on disk, it is possible to
  * change these calculations in the future.
  */
 /*
  * Fragmentation percentage per power-of-two segment size; the comment on
  * each entry gives the segment size it applies to.
  */
 static const int zfs_frag_table[] = {
 	100,	/* 512B	*/
 	99,	/* 1K	*/
 	97,	/* 2K	*/
 	93,	/* 4K	*/
 	88,	/* 8K	*/
 	83,	/* 16K	*/
 	77,	/* 32K	*/
 	71,	/* 64K	*/
 	64,	/* 128K	*/
 	57,	/* 256K	*/
 	50,	/* 512K	*/
 	43,	/* 1M	*/
 	36,	/* 2M	*/
 	29,	/* 4M	*/
 	23,	/* 8M	*/
 	17,	/* 16M	*/
 	12,	/* 32M	*/
 	7,	/* 64M	*/
 	3,	/* 128M	*/
 	1,	/* 256M	*/
 	0,	/* 512M	*/
 };
 /* Number of entries in zfs_frag_table. */
 #define	FRAGMENTATION_TABLE_SIZE \
 	(sizeof (zfs_frag_table)/(sizeof (zfs_frag_table[0])))
 
 /*
  * Calculate the metaslab's fragmentation metric and store it in
  * ms_fragmentation. ZFS_FRAG_INVALID is stored when the metric is not
  * available: either the SPACEMAP_HISTOGRAM feature is not enabled, or this
  * metaslab's space map has not been upgraded. Otherwise the stored value
  * should be in the range [0, 100].
  *
  * When `nodirty' is set, a metaslab with a non-upgraded space map is not
  * marked for condensing and nothing is dirtied.
  */
 static void
 metaslab_set_fragmentation(metaslab_t *msp, boolean_t nodirty)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	uint64_t fragmentation = 0;
 	uint64_t total = 0;
 	boolean_t feature_enabled = spa_feature_is_enabled(spa,
 	    SPA_FEATURE_SPACEMAP_HISTOGRAM);
 
 	if (!feature_enabled) {
 		msp->ms_fragmentation = ZFS_FRAG_INVALID;
 		return;
 	}
 
 	/*
 	 * A null space map means that the entire metaslab is free
 	 * and thus is not fragmented.
 	 */
 	if (msp->ms_sm == NULL) {
 		msp->ms_fragmentation = 0;
 		return;
 	}
 
 	/*
 	 * If this metaslab's space map has not been upgraded, flag it
 	 * so that we upgrade next time we encounter it.
 	 */
 	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
 		uint64_t txg = spa_syncing_txg(spa);
 		vdev_t *vd = msp->ms_group->mg_vd;
 
 		/*
 		 * If we've reached the final dirty txg, then we must
 		 * be shutting down the pool. We don't want to dirty
 		 * any data past this point so skip setting the condense
 		 * flag. We can retry this action the next time the pool
 		 * is imported. We also skip marking this metaslab for
 		 * condensing if the caller has explicitly set nodirty.
 		 */
 		if (!nodirty &&
 		    spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
 			msp->ms_condense_wanted = B_TRUE;
 			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
 			zfs_dbgmsg("txg %llu, requesting force condense: "
 			    "ms_id %llu, vdev_id %llu", (u_longlong_t)txg,
 			    (u_longlong_t)msp->ms_id,
 			    (u_longlong_t)vd->vdev_id);
 		}
 		msp->ms_fragmentation = ZFS_FRAG_INVALID;
 		return;
 	}
 
 	/*
 	 * Weighted average over the space map histogram: each non-empty
 	 * bucket contributes its space times the table entry for its size.
 	 */
 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 		uint64_t space = 0;
 		uint8_t shift = msp->ms_sm->sm_shift;
 
 		/*
 		 * Table slot for this bucket, clamped to the last entry of
 		 * zfs_frag_table (which is 0).
 		 */
 		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
 		    FRAGMENTATION_TABLE_SIZE - 1);
 
 		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
 			continue;
 
 		/* Space in this bucket: segment count times 2^(i + shift). */
 		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
 		total += space;
 
 		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
 		fragmentation += space * zfs_frag_table[idx];
 	}
 
 	/* An empty histogram leaves the metric at 0. */
 	if (total > 0)
 		fragmentation /= total;
 	ASSERT3U(fragmentation, <=, 100);
 
 	msp->ms_fragmentation = fragmentation;
 }
 
 /*
  * Compute a weight -- a selection preference value -- for the given metaslab.
  * This is based on the amount of free space, the level of fragmentation,
  * the LBA range, and whether the metaslab is loaded. The result is tagged as
  * a space-based weight. The caller must hold ms_lock.
  */
 static uint64_t
 metaslab_space_weight(metaslab_t *msp)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	uint64_t weight, space;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * The baseline weight is the metaslab's free space.
 	 */
 	space = msp->ms_size - metaslab_allocated_space(msp);
 
 	if (metaslab_fragmentation_factor_enabled &&
 	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
 		/*
 		 * Use the fragmentation information to inversely scale
 		 * down the baseline weight. We need to ensure that we
 		 * don't exclude this metaslab completely when it's 100%
 		 * fragmented. To avoid this we reduce the fragmented value
 		 * by 1.
 		 *
 		 * NOTE(review): with ms_fragmentation == 0 the "- 1" term
 		 * depends on the field's type; if it is unsigned the
 		 * subtraction wraps and the factor works out to 101/100.
 		 * Confirm this is intended.
 		 */
 		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
 
 		/*
 		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
 		 * this metaslab again. The fragmentation metric may have
 		 * decreased the space to something smaller than
 		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
 		 * so that we can consume any remaining space.
 		 */
 		if (space > 0 && space < SPA_MINBLOCKSIZE)
 			space = SPA_MINBLOCKSIZE;
 	}
 	weight = space;
 
 	/*
 	 * Modern disks have uniform bit density and constant angular velocity.
 	 * Therefore, the outer recording zones are faster (higher bandwidth)
 	 * than the inner zones by the ratio of outer to inner track diameter,
 	 * which is typically around 2:1.  We account for this by assigning
 	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
 	 * In effect, this means that we'll select the metaslab with the most
 	 * free bandwidth rather than simply the one with the most free space.
 	 */
 	if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
 		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
 		ASSERT(weight >= space && weight <= 2 * space);
 	}
 
 	/*
 	 * If this metaslab is one we're actively using, adjust its
 	 * weight to make it preferable to any inactive metaslab so
 	 * we'll polish it off. If the fragmentation on this metaslab
 	 * has exceeded our threshold, then don't mark it active.
 	 */
 	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
 	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
 		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
 	}
 
 	WEIGHT_SET_SPACEBASED(weight);
 	return (weight);
 }
 
 /*
  * Return the weight of the specified metaslab, according to the segment-based
  * weighting algorithm. The metaslab must be loaded. This function can
  * be called within a sync pass since it relies only on the metaslab's
  * range tree which is always accurate when the metaslab is loaded.
  *
  * The weight encodes the index of the highest non-empty histogram bucket
  * (capped so that it fits the space map's histogram) and the segment count
  * for that bucket, with the active bits cleared. It is 0 if no bucket at or
  * above SPA_MINBLOCKSHIFT holds a segment.
  */
 static uint64_t
 metaslab_weight_from_range_tree(metaslab_t *msp)
 {
 	uint64_t weight = 0;
 	uint32_t segments = 0;
 
 	ASSERT(msp->ms_loaded);
 
-	for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
+	for (int i = ZFS_RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
 	    i--) {
 		uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
 		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
 
 		/*
 		 * Carry the running count down one bucket: everything counted
 		 * so far is doubled, then this bucket's segments are added.
 		 */
 		segments <<= 1;
 		segments += msp->ms_allocatable->rt_histogram[i];
 
 		/*
 		 * The range tree provides more precision than the space map
 		 * and must be downgraded so that all values fit within the
 		 * space map's histogram. This allows us to compare loaded
 		 * vs. unloaded metaslabs to determine which metaslab is
 		 * considered "best".
 		 */
 		if (i > max_idx)
 			continue;
 
 		/* The first non-empty bucket from the top decides the weight. */
 		if (segments != 0) {
 			WEIGHT_SET_COUNT(weight, segments);
 			WEIGHT_SET_INDEX(weight, i);
 			WEIGHT_SET_ACTIVE(weight, 0);
 			break;
 		}
 	}
 	return (weight);
 }
 
 /*
  * Calculate the weight based on the on-disk histogram. Should be applied
  * only to unloaded metaslabs (i.e. no incoming allocations) in order to
  * give results consistent with the on-disk state.
  *
  * Segments that already appear in the space map's histogram but are still
  * in the freeing pipeline (the sync and defer histograms) are not yet
  * available for allocation, so they are subtracted bucket by bucket to get
  * a more accurate weight. The highest bucket with a non-zero remainder
  * determines the weight; 0 is returned if every bucket is empty.
  */
 static uint64_t
 metaslab_weight_from_spacemap(metaslab_t *msp)
 {
 	space_map_t *sm = msp->ms_sm;
 	uint64_t weight = 0;
 
 	ASSERT(!msp->ms_loaded);
 	ASSERT(sm != NULL);
 	ASSERT3U(space_map_object(sm), !=, 0);
 	ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
 
 	for (int idx = SPACE_MAP_HISTOGRAM_SIZE - 1; idx >= 0; idx--) {
 		/* Segments of this size still held back from allocation. */
 		uint64_t pending = msp->ms_synchist[idx];
 		for (int t = 0; t < TXG_DEFER_SIZE; t++)
 			pending += msp->ms_deferhist[t][idx];
 
 		ASSERT3U(sm->sm_phys->smp_histogram[idx], >=, pending);
 		uint64_t avail = sm->sm_phys->smp_histogram[idx] - pending;
 		if (avail == 0)
 			continue;
 
 		WEIGHT_SET_COUNT(weight, avail);
 		WEIGHT_SET_INDEX(weight, idx + sm->sm_shift);
 		WEIGHT_SET_ACTIVE(weight, 0);
 		break;
 	}
 	return (weight);
 }
 
 /*
  * Compute a segment-based weight for the specified metaslab. The weight
  * is determined by the highest bucket in the histogram, and the information
  * for that bucket is encoded into the weight value. The caller must hold
  * ms_lock.
  */
 static uint64_t
 metaslab_segment_weight(metaslab_t *msp)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	uint8_t ashift = mg->mg_vd->vdev_ashift;
 	uint64_t weight = 0;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * A completely free metaslab is a single segment of ms_size. If its
 	 * bucket lies beyond what the space map histogram can express, it is
 	 * reported as the equivalent number of segments in the top bucket.
 	 */
 	if (metaslab_allocated_space(msp) == 0) {
 		int idx = highbit64(msp->ms_size) - 1;
 		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + ashift - 1;
 		uint64_t count = 1ULL;
 
 		if (idx >= max_idx) {
 			count <<= (idx - max_idx);
 			idx = max_idx;
 		}
 		WEIGHT_SET_COUNT(weight, count);
 		WEIGHT_SET_INDEX(weight, idx);
 		WEIGHT_SET_ACTIVE(weight, 0);
 		ASSERT(!WEIGHT_IS_SPACEBASED(weight));
 		return (weight);
 	}
 
 	ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
 
 	/* A fully allocated metaslab simply gets weight 0. */
 	if (metaslab_allocated_space(msp) == msp->ms_size)
 		return (0);
 
 	/*
 	 * Loaded metaslabs are weighed from their range tree; unloaded ones
 	 * from the space map information.
 	 */
 	weight = msp->ms_loaded ? metaslab_weight_from_range_tree(msp) :
 	    metaslab_weight_from_spacemap(msp);
 
 	/*
 	 * If the metaslab was active the last time we calculated its weight
 	 * then keep it active. We want to consume the entire region that
 	 * is associated with this weight.
 	 */
 	if (weight != 0 && msp->ms_activation_weight != 0)
 		WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
 	return (weight);
 }
 
 /*
  * Decide whether an allocation of `asize' should be attempted from this
  * metaslab. For a loaded metaslab the size of its largest free segment
  * answers the question directly. Otherwise the decision is based on the
  * metaslab's weight: for segment-based weights the index encoded in the
  * weight bounds the largest possible allocation, and for space-based
  * weights the whole weight (minus the weight-type bit) is used.
  */
 static boolean_t
 metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
 {
 	/*
 	 * This case will usually but not always get caught by the checks
 	 * below; metaslabs can be loaded by various means, including the trim
 	 * and initialize code. Once that happens, without this check they are
 	 * allocatable even before they finish their first txg sync.
 	 */
 	if (unlikely(msp->ms_new))
 		return (B_FALSE);
 
 	/* Loaded: ms_max_size is definitive. */
 	if (msp->ms_loaded)
 		return (msp->ms_max_size >= asize);
 
 	/*
 	 * Unloaded: ms_max_size is only a lower bound (once set). Trust it as
 	 * long as we're not in try_hard and it's been less than
 	 * zfs_metaslab_max_size_cache_sec seconds since the unload.
 	 */
 	if (msp->ms_max_size != 0 && !try_hard && gethrtime() <
 	    msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec))
 		return (msp->ms_max_size >= asize);
 
 	if (WEIGHT_IS_SPACEBASED(msp->ms_weight))
 		return (asize <= (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
 
 	/*
 	 * The metaslab segment weight indicates segments in the
 	 * range [2^i, 2^(i+1)), where i is the index in the weight.
 	 * Since the asize might be in the middle of the range, we
 	 * should attempt the allocation if asize < 2^(i+1).
 	 */
 	return (asize < 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
 }
 
 /*
  * Refresh the metaslab's fragmentation metric and cached maximum segment
  * size, then return its weight using segment-based weighting when that is
  * possible and space-based weighting otherwise. `nodirty' is passed through
  * to metaslab_set_fragmentation(). The caller must hold ms_lock.
  */
 static uint64_t
 metaslab_weight(metaslab_t *msp, boolean_t nodirty)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	metaslab_set_fragmentation(msp, nodirty);
 
 	/*
 	 * Update the maximum size. For a loaded metaslab this yields an
 	 * accurate value even if newly freed space has been added back into
 	 * the free tree. For an unloaded one we can only check whether the
 	 * unflushed frees hold a larger segment, which gives a lower bound:
 	 * coalescing of adjacent entries may reveal larger allocatable
 	 * segments, but we aren't aware of those until the space map is
 	 * loaded into a range tree.
 	 */
 	if (!msp->ms_loaded) {
 		msp->ms_max_size = MAX(msp->ms_max_size,
 		    metaslab_largest_unflushed_free(msp));
 	} else {
 		msp->ms_max_size = metaslab_largest_allocatable(msp);
 	}
 
 	/*
 	 * Segment-based weighting requires space map histogram support,
 	 * both pool-wide and in this metaslab's space map (if it has one).
 	 */
 	if (!zfs_metaslab_segment_weight_enabled ||
 	    !spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM))
 		return (metaslab_space_weight(msp));
 
 	if (msp->ms_sm != NULL &&
 	    msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
 		return (metaslab_space_weight(msp));
 
 	return (metaslab_segment_weight(msp));
 }
 
 /*
  * Recompute the metaslab's weight and re-sort it within its group. The
  * active bits of the current weight (e.g. the indication of primary) are
  * carried over to the new weight. The caller must hold ms_lock.
  */
 void
 metaslab_recalculate_weight_and_sort(metaslab_t *msp)
 {
 	uint64_t active_bits, weight;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	active_bits = msp->ms_weight & METASLAB_ACTIVE_MASK;
 	weight = metaslab_weight(msp, B_FALSE);
 	metaslab_group_sort(msp->ms_group, msp, weight | active_bits);
 }
 
 /*
  * Mark `msp' active with `activation_weight' on behalf of `allocator'.
  * The caller must hold ms_lock.
  *
  * For METASLAB_WEIGHT_CLAIM no allocator slot is taken. Otherwise the
  * metaslab is installed, under mg_lock, as the allocator's primary or
  * secondary metaslab; if that slot is already occupied, EEXIST is returned
  * and nothing is changed. Returns 0 on success.
  */
 static int
 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
     int allocator, uint64_t activation_weight)
 {
 	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * If we're activating for the claim code, we don't want to actually
 	 * set the metaslab up for a specific allocator.
 	 */
 	if (activation_weight == METASLAB_WEIGHT_CLAIM) {
 		ASSERT0(msp->ms_activation_weight);
 		msp->ms_activation_weight = msp->ms_weight;
 		metaslab_group_sort(mg, msp, msp->ms_weight |
 		    activation_weight);
 		return (0);
 	}
 
 	/* Any other activation weight selects the secondary slot. */
 	metaslab_t **mspp = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
 	    &mga->mga_primary : &mga->mga_secondary);
 
 	mutex_enter(&mg->mg_lock);
 	if (*mspp != NULL) {
 		mutex_exit(&mg->mg_lock);
 		return (EEXIST);
 	}
 
 	*mspp = msp;
 	ASSERT3S(msp->ms_allocator, ==, -1);
 	msp->ms_allocator = allocator;
 	msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
 
 	/* Remember the pre-activation weight, then sort with the new bits. */
 	ASSERT0(msp->ms_activation_weight);
 	msp->ms_activation_weight = msp->ms_weight;
 	metaslab_group_sort_impl(mg, msp,
 	    msp->ms_weight | activation_weight);
 	mutex_exit(&mg->mg_lock);
 
 	return (0);
 }
 
 /*
  * Load `msp' if necessary and activate it for `allocator' with
  * `activation_weight'. The caller must hold ms_lock.
  *
  * Returns 0 if the metaslab is active on return (including when it already
  * was). Otherwise returns the error from metaslab_load(), EBUSY if it was
  * meanwhile activated for another allocator or with a different activation
  * weight, ENOSPC if its weight is 0, or the error from
  * metaslab_activate_allocator().
  */
 static int
 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * The current metaslab is already activated for us so there
 	 * is nothing to do. Already activated though, doesn't mean
 	 * that this metaslab is activated for our allocator nor our
 	 * requested activation weight. The metaslab could have started
 	 * as an active one for our allocator but changed allocators
 	 * while we were waiting to grab its ms_lock or we stole it
 	 * [see find_valid_metaslab()]. This means that there is a
 	 * possibility of passivating a metaslab of another allocator
 	 * or from a different activation mask, from this thread.
 	 */
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
 		ASSERT(msp->ms_loaded);
 		return (0);
 	}
 
 	/* On load failure the metaslab is re-sorted with weight 0. */
 	int error = metaslab_load(msp);
 	if (error != 0) {
 		metaslab_group_sort(msp->ms_group, msp, 0);
 		return (error);
 	}
 
 	/*
 	 * When entering metaslab_load() we may have dropped the
 	 * ms_lock because we were loading this metaslab, or we
 	 * were waiting for another thread to load it for us. In
 	 * that scenario, we recheck the weight of the metaslab
 	 * to see if it was activated by another thread.
 	 *
 	 * If the metaslab was activated for another allocator or
 	 * it was activated with a different activation weight (e.g.
 	 * we wanted to make it a primary but it was activated as
 	 * secondary) we return error (EBUSY).
 	 *
 	 * If the metaslab was activated for the same allocator
 	 * and requested activation mask, skip activating it.
 	 */
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
 		if (msp->ms_allocator != allocator)
 			return (EBUSY);
 
 		if ((msp->ms_weight & activation_weight) == 0)
 			return (SET_ERROR(EBUSY));
 
 		EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY),
 		    msp->ms_primary);
 		return (0);
 	}
 
 	/*
 	 * If the metaslab has literally 0 space, it will have weight 0. In
 	 * that case, don't bother activating it. This can happen if the
 	 * metaslab had space during find_valid_metaslab, but another thread
 	 * loaded it and used all that space while we were waiting to grab the
 	 * lock.
 	 */
 	if (msp->ms_weight == 0) {
 		ASSERT0(zfs_range_tree_space(msp->ms_allocatable));
 		return (SET_ERROR(ENOSPC));
 	}
 
 	if ((error = metaslab_activate_allocator(msp->ms_group, msp,
 	    allocator, activation_weight)) != 0) {
 		return (error);
 	}
 
 	ASSERT(msp->ms_loaded);
 	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 
 	return (0);
 }
 
 /*
  * Release the allocator slot held by an active, loaded metaslab and re-sort
  * it in its group with `weight'. The caller must hold ms_lock.
  *
  * A metaslab activated for claim holds no allocator slot and is only
  * re-sorted. Otherwise, under mg_lock, the allocator's primary or secondary
  * pointer is cleared and ms_allocator is reset to -1.
  */
 static void
 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
     uint64_t weight)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loaded);
 
 	if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
 		metaslab_group_sort(mg, msp, weight);
 		return;
 	}
 
 	mutex_enter(&mg->mg_lock);
 	ASSERT3P(msp->ms_group, ==, mg);
 	ASSERT3S(0, <=, msp->ms_allocator);
 	ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
 
 	/* Clear whichever slot of the owning allocator points at msp. */
 	metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator];
 	if (msp->ms_primary) {
 		ASSERT3P(mga->mga_primary, ==, msp);
 		ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
 		mga->mga_primary = NULL;
 	} else {
 		ASSERT3P(mga->mga_secondary, ==, msp);
 		ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
 		mga->mga_secondary = NULL;
 	}
 	msp->ms_allocator = -1;
 	metaslab_group_sort_impl(mg, msp, weight);
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Deactivate an active metaslab: forget its activation weight, release its
  * allocator slot and re-sort it with `weight', which must not carry any
  * active bits.
  */
 static void
 metaslab_passivate(metaslab_t *msp, uint64_t weight)
 {
 	/*
 	 * If the space part of a space-based weight is below
 	 * SPA_MINBLOCKSIZE, then we will not allocate from this metaslab
 	 * again.  In that case, it had better be empty, or we would be
 	 * leaving space on the table.
 	 */
 	ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
 	    (weight & ~METASLAB_WEIGHT_TYPE) >= SPA_MINBLOCKSIZE ||
 	    zfs_range_tree_space(msp->ms_allocatable) == 0);
 	ASSERT0(weight & METASLAB_ACTIVE_MASK);
 	ASSERT(msp->ms_activation_weight != 0);
 
 	msp->ms_activation_weight = 0;
 	metaslab_passivate_allocator(msp->ms_group, msp, weight);
 
 	ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK);
 }
 
 /*
  * Segment-based metaslabs are activated once and remain active until
  * we either fail an allocation attempt (similar to space-based metaslabs)
  * or have exhausted the free space in zfs_metaslab_switch_threshold
  * buckets since the metaslab was activated. This function checks whether
  * those buckets have been exhausted and, if so, passivates the metaslab
  * proactively. That allows us to select a metaslab with a larger contiguous
  * region, if any, remaining within this metaslab group. If we're in sync
  * pass > 1, then we continue using this metaslab so that we don't dirty
  * more blocks and cause more sync passes.
  */
 static void
 metaslab_segment_may_passivate(metaslab_t *msp)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	/* Only segment-based weights are handled here. */
 	if (WEIGHT_IS_SPACEBASED(msp->ms_weight))
 		return;
 	if (spa_sync_pass(spa) > 1)
 		return;
 
 	/*
 	 * Since we are in the middle of a sync pass, the most accurate
 	 * information that is accessible to us is the in-core range tree
 	 * histogram; calculate the new weight based on that information.
 	 */
 	uint64_t new_weight = metaslab_weight_from_range_tree(msp);
 	int start_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
 	int now_idx = WEIGHT_GET_INDEX(new_weight);
 
 	if (now_idx <= start_idx - zfs_metaslab_switch_threshold)
 		metaslab_passivate(msp, new_weight);
 }
 
 /*
  * Taskq callback (see metaslab_group_preload()): load the metaslab passed
  * in `arg' and record it as selected in the currently syncing txg. A load
  * failure is deliberately ignored. Must not be called with the group's
  * mg_lock held.
  */
 static void
 metaslab_preload(void *arg)
 {
 	metaslab_t *msp = arg;
 	spa_t *spa = msp->ms_group->mg_class->mc_spa;
 	fstrans_cookie_t cookie;
 
 	cookie = spl_fstrans_mark();
 
 	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
 
 	mutex_enter(&msp->ms_lock);
 	(void) metaslab_load(msp);
 	metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
 	mutex_exit(&msp->ms_lock);
 
 	spl_fstrans_unmark(cookie);
 }
 
 /*
  * Dispatch preload tasks for the metaslabs at the front of the group's
  * metaslab tree. Nothing is done while the pool is shutting down or when
  * preloading is disabled.
  */
 static void
 metaslab_group_preload(metaslab_group_t *mg)
 {
 	spa_t *spa = mg->mg_vd->vdev_spa;
 	avl_tree_t *tree = &mg->mg_metaslab_tree;
 	metaslab_t *msp;
 	int m = 0;
 
 	if (spa_shutting_down(spa) || !metaslab_preload_enabled)
 		return;
 
 	mutex_enter(&mg->mg_lock);
 
 	/*
 	 * Load the next potential metaslabs
 	 */
 	msp = avl_first(tree);
 	while (msp != NULL) {
 		ASSERT3P(msp->ms_group, ==, mg);
 
 		/*
 		 * We preload only the maximum number of metaslabs specified
 		 * by metaslab_preload_limit. If a metaslab is being forced
 		 * to condense then we preload it too. This will ensure
 		 * that force condensing happens in the next txg. The first
 		 * mg_allocators dispatches are queued with TQ_FRONT.
 		 */
 		if (++m <= metaslab_preload_limit ||
 		    msp->ms_condense_wanted) {
 			VERIFY(taskq_dispatch(spa->spa_metaslab_taskq,
 			    metaslab_preload, msp, TQ_SLEEP |
 			    (m <= mg->mg_allocators ? TQ_FRONT : 0))
 			    != TASKQID_INVALID);
 		}
 
 		msp = AVL_NEXT(tree, msp);
 	}
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Decide whether the space map's on-disk footprint has grown past our
  * tolerance for inefficiency. Ideally the decision would follow these
  * criteria:
  *
  * 1. Do not condense if the size of the space map object would dramatically
  *    increase as a result of writing out the free space range tree.
  *
  * 2. Condense if the on-disk space map representation is at least
  *    zfs_condense_pct/100 times the size of the optimal representation
  *    (e.g. with zfs_condense_pct = 110 and an optimal size of 1MB, condense
  *    once the on-disk size reaches 1.1MB).
  *
  * 3. Do not condense if the on-disk size of the space map does not actually
  *    decrease.
  *
  * Unfortunately, the on-disk size of the space map cannot be computed in
  * this context because the effects of compression, etc. are not known.
  * Instead, we apply the heuristic described in the block comment for
  * zfs_metaslab_condense_block_threshold - we only condense if the space used
  * is greater than a threshold number of blocks.
  *
  * The caller must hold ms_lock, the metaslab must be loaded with a space
  * map, and we must be in sync pass 1 (all asserted). Returns B_TRUE if the
  * metaslab should be condensed.
  */
 static boolean_t
 metaslab_should_condense(metaslab_t *msp)
 {
 	space_map_t *sm = msp->ms_sm;
 	vdev_t *vd = msp->ms_group->mg_vd;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loaded);
 	ASSERT(sm != NULL);
 	ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1);
 
 	/* Metaslabs with no allocatable segments are always condensed ... */
 	if (zfs_range_tree_numsegs(msp->ms_allocatable) == 0)
 		return (B_TRUE);
 
 	/* ... and so are metaslabs with an explicit condense request. */
 	if (msp->ms_condense_wanted)
 		return (B_TRUE);
 
 	uint64_t vdev_blocksize = 1ULL << vd->vdev_ashift;
 	uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize);
 	uint64_t object_size = space_map_length(sm);
 	uint64_t optimal_size = space_map_estimate_optimal_size(sm,
 	    msp->ms_allocatable, SM_NO_VDEVID);
 
 	/* Not inefficient enough relative to the optimal representation. */
 	if (object_size < (optimal_size * zfs_condense_pct / 100))
 		return (B_FALSE);
 
 	/* Inefficient; condense only once it is also large enough. */
 	return (object_size >
 	    zfs_metaslab_condense_block_threshold * record_size);
 }
 
 /*
  * Condense the on-disk space map representation to its minimized form.
  * The minimized form consists of a small number of allocations followed
  * by the entries of the free range tree (ms_allocatable). The condensed
  * spacemap contains all the entries of previous TXGs (including those in
  * the pool-wide log spacemaps; thus this is effectively a superset of
  * metaslab_flush()), but this TXG's entries still need to be written.
  *
  * The caller must hold ms_lock, the metaslab must be loaded and have a
  * space map, and we must be in sync pass 1 (all asserted). ms_lock is
  * dropped while the space map is rewritten and is held again on return;
  * ms_condensing is set for the duration of that window. ms_condense_wanted
  * is cleared, and metaslab_flush_update() is called before returning.
  */
 static void
 metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
 {
 	zfs_range_tree_t *condense_tree;
 	space_map_t *sm = msp->ms_sm;
 	uint64_t txg = dmu_tx_get_txg(tx);
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loaded);
 	ASSERT(msp->ms_sm != NULL);
 
 	/*
 	 * In order to condense the space map, we need to change it so it
 	 * only describes which segments are currently allocated and free.
 	 *
 	 * All the current free space resides in the ms_allocatable, all
 	 * the ms_defer trees, and all the ms_allocating trees. We ignore
 	 * ms_freed because it is empty because we're in sync pass 1. We
 	 * ignore ms_freeing because these changes are not yet reflected
 	 * in the spacemap (they will be written later this txg).
 	 *
 	 * So to truncate the space map to represent all the entries of
 	 * previous TXGs we do the following:
 	 *
 	 * 1] We create a range tree (condense tree) that is 100% empty.
 	 * 2] We add to it all segments found in the ms_defer trees
 	 *    as those segments are marked as free in the original space
 	 *    map. We do the same with the ms_allocating trees for the same
 	 *    reason. Adding these segments should be a relatively
 	 *    inexpensive operation since we expect these trees to have a
 	 *    small number of nodes.
 	 * 3] We vacate any unflushed allocs, since they are not frees we
 	 *    need to add to the condense tree. Then we vacate any
 	 *    unflushed frees as they should already be part of ms_allocatable.
 	 * 4] At this point, we would ideally like to add all segments
 	 *    in the ms_allocatable tree to the condense tree. This way
 	 *    we would write all the entries of the condense tree as the
 	 *    condensed space map, which would only contain freed
 	 *    segments with everything else assumed to be allocated.
 	 *
 	 *    Doing so can be prohibitively expensive as ms_allocatable can
 	 *    be large, and therefore computationally expensive to add to
 	 *    the condense_tree. Instead we first sync out an entry marking
 	 *    everything as allocated, then the ms_allocatable and then the
 	 *    condense_tree, in the condensed space map. While this is not
 	 *    optimal, it is typically close to optimal and more importantly
 	 *    much cheaper to compute.
 	 *
 	 * 5] Finally, as both of the unflushed trees were written to our
 	 *    new and condensed metaslab space map, we basically flushed
 	 *    all the unflushed changes to disk, thus we call
 	 *    metaslab_flush_update().
 	 */
 	ASSERT3U(spa_sync_pass(spa), ==, 1);
 	ASSERT(zfs_range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */
 
 	zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, "
 	    "spa %s, smp size %llu, segments %llu, forcing condense=%s",
 	    (u_longlong_t)txg, (u_longlong_t)msp->ms_id, msp,
 	    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
 	    spa->spa_name, (u_longlong_t)space_map_length(msp->ms_sm),
 	    (u_longlong_t)zfs_range_tree_numsegs(msp->ms_allocatable),
 	    msp->ms_condense_wanted ? "TRUE" : "FALSE");
 
 	/* The request (if any) is being honored now. */
 	msp->ms_condense_wanted = B_FALSE;
 
 	zfs_range_seg_type_t type;
 	uint64_t shift, start;
 	type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
 	    &start, &shift);
 
 	/* Steps 1] and 2]: build the condense tree. */
 	condense_tree = zfs_range_tree_create(NULL, type, NULL, start, shift);
 
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		zfs_range_tree_walk(msp->ms_defer[t],
 		    zfs_range_tree_add, condense_tree);
 	}
 
 	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
 		zfs_range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
 		    zfs_range_tree_add, condense_tree);
 	}
 
 	/*
 	 * Step 3]: drop the unflushed changes, first subtracting their
 	 * memory usage from the pool-wide accounting.
 	 */
 	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
 	    metaslab_unflushed_changes_memused(msp));
 	spa->spa_unflushed_stats.sus_memused -=
 	    metaslab_unflushed_changes_memused(msp);
 	zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
 	zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
 
 	/*
 	 * We're about to drop the metaslab's lock thus allowing other
 	 * consumers to change its content. Set the metaslab's ms_condensing
 	 * flag to ensure that allocations on this metaslab do not occur
 	 * while we're in the middle of committing it to disk. This is only
 	 * critical for ms_allocatable as all other range trees use per TXG
 	 * views of their content.
 	 */
 	msp->ms_condensing = B_TRUE;
 
 	mutex_exit(&msp->ms_lock);
 	/* Remember the object number so a reallocation can be detected. */
 	uint64_t object = space_map_object(msp->ms_sm);
 	space_map_truncate(sm,
 	    spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
 	    zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx);
 
 	/*
 	 * space_map_truncate() may have reallocated the spacemap object.
 	 * If so, update the vdev_ms_array.
 	 */
 	if (space_map_object(msp->ms_sm) != object) {
 		object = space_map_object(msp->ms_sm);
 		dmu_write(spa->spa_meta_objset,
 		    msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) *
 		    msp->ms_id, sizeof (uint64_t), &object, tx);
 	}
 
 	/*
 	 * Note:
 	 * When the log space map feature is enabled, each space map will
 	 * always have ALLOCS followed by FREES for each sync pass. This is
 	 * typically true even when the log space map feature is disabled,
 	 * except for the case where a metaslab goes through metaslab_sync()
 	 * and gets condensed. In that case the metaslab's space map will have
 	 * ALLOCS followed by FREES (due to condensing) followed by ALLOCS
 	 * followed by FREES (due to space_map_write() in metaslab_sync()) for
 	 * sync pass 1.
 	 */
 	/*
 	 * Step 4]: tmp_tree holds a single segment covering the whole
 	 * metaslab. It is written as one ALLOC, followed by ms_allocatable
 	 * and then condense_tree as FREEs.
 	 */
 	zfs_range_tree_t *tmp_tree = zfs_range_tree_create(NULL, type, NULL,
 	    start, shift);
 	zfs_range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
 	space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
 	space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
 	space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx);
 
 	/* Both temporary trees are emptied before being destroyed. */
 	zfs_range_tree_vacate(condense_tree, NULL, NULL);
 	zfs_range_tree_destroy(condense_tree);
 	zfs_range_tree_vacate(tmp_tree, NULL, NULL);
 	zfs_range_tree_destroy(tmp_tree);
 	mutex_enter(&msp->ms_lock);
 
 	/* Step 5]: with the lock held again, allow allocations and finish. */
 	msp->ms_condensing = B_FALSE;
 	metaslab_flush_update(msp, tx);
 }
 
 /*
  * Insert 'msp' into the pool's spa_metaslabs_by_flushed tree, stamping it
  * with the currently syncing txg and marking it dirty, and then update the
  * log space map counters. A syncing log space map must exist and the
  * metaslab's unflushed trees must be empty (asserted).
  */
 static void
 metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(spa_syncing_log_sm(spa) != NULL);
 	ASSERT(msp->ms_sm != NULL);
 	ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs));
 	ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees));
 
 	/* The tree and the fields it is keyed on change under its lock. */
 	mutex_enter(&spa->spa_flushed_ms_lock);
 	uint64_t syncing_txg = spa_syncing_txg(spa);
 	metaslab_set_unflushed_txg(msp, syncing_txg, tx);
 	metaslab_set_unflushed_dirty(msp, B_TRUE);
 	avl_add(&spa->spa_metaslabs_by_flushed, msp);
 	mutex_exit(&spa->spa_flushed_ms_lock);
 
 	/* Account for the newly tracked, dirty metaslab. */
 	spa_log_sm_increment_current_mscount(spa);
 	spa_log_summary_add_flushed_metaslab(spa, B_TRUE);
 }
 
 /*
  * Move an already tracked metaslab to the currently syncing txg in the
  * pool's spa_metaslabs_by_flushed tree, set its dirty state to 'dirty', and
  * update the log space map counters and summary to match. The metaslab
  * must already be in the tree with empty unflushed trees (asserted).
  */
 void
 metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(spa_syncing_log_sm(spa) != NULL);
 	ASSERT(msp->ms_sm != NULL);
 	ASSERT(metaslab_unflushed_txg(msp) != 0);
 	ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
 	ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs));
 	ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees));
 
 	VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
 
 	/* Capture the old state; the counters below are keyed on it. */
 	uint64_t prev_txg = metaslab_unflushed_txg(msp);
 	boolean_t prev_dirty = metaslab_unflushed_dirty(msp);
 
 	/* Re-insert the metaslab into the flushing tree with its new state. */
 	mutex_enter(&spa->spa_flushed_ms_lock);
 	avl_remove(&spa->spa_metaslabs_by_flushed, msp);
 	uint64_t syncing_txg = spa_syncing_txg(spa);
 	metaslab_set_unflushed_txg(msp, syncing_txg, tx);
 	metaslab_set_unflushed_dirty(msp, dirty);
 	avl_add(&spa->spa_metaslabs_by_flushed, msp);
 	mutex_exit(&spa->spa_flushed_ms_lock);
 
 	/* Metaslab counts of the spa_log_sm_t nodes: old txg out, new in. */
 	spa_log_sm_decrement_mscount(spa, prev_txg);
 	spa_log_sm_increment_current_mscount(spa);
 
 	/* Same adjustment for the log space map summary. */
 	spa_log_summary_decrement_mscount(spa, prev_txg, prev_dirty);
 	spa_log_summary_add_flushed_metaslab(spa, dirty);
 
 	/* Clean up obsolete logs, if any. */
 	spa_cleanup_old_sm_logs(spa, tx);
 }
 
 /*
  * Called once a metaslab has been flushed, i.e. its own spacemap now
  * reflects all the contents of the pool-wide spacemap log. Brings the
  * metaslab's metadata and the pool-wide log space map bookkeeping
  * (e.g. summary, obsolete logs, etc..) in line with that.
  *
  * The caller must hold ms_lock and we must be in sync pass 1 (asserted).
  */
 static void
 metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT3U(spa_sync_pass(spa), ==, 1);
 
 	/*
 	 * A flushed metaslab is not guaranteed to pass through
 	 * metaslab_sync_done(), so ms_synced_length is refreshed
 	 * here in case it doesn't.
 	 */
 	msp->ms_synced_length = space_map_length(msp->ms_sm);
 
 	/*
 	 * metaslab_condense() may bring us here without the feature
 	 * being active, in which case nothing more is done. The same
 	 * holds for a metaslab whose unflushed txg is 0.
 	 */
 	if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) &&
 	    metaslab_unflushed_txg(msp) != 0)
 		metaslab_unflushed_bump(msp, tx, B_FALSE);
 }
 
 /*
  * Flush the metaslab's unflushed allocs and frees to its own space map,
  * or condense the metaslab instead when it is loaded and
  * metaslab_should_condense() says so.
  *
  * The caller must hold ms_lock, we must be in sync pass 1, and the log
  * spacemap feature must be active (all asserted). ms_lock is dropped while
  * the space map is written and is held again on return.
  *
  * Returns B_FALSE, without having done anything, if the metaslab is
  * currently being loaded; returns B_TRUE otherwise.
  */
 boolean_t
 metaslab_flush(metaslab_t *msp, dmu_tx_t *tx)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT3U(spa_sync_pass(spa), ==, 1);
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 	ASSERT(msp->ms_sm != NULL);
 	ASSERT(metaslab_unflushed_txg(msp) != 0);
 	ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL);
 
 	/*
 	 * There is nothing wrong with flushing the same metaslab twice, as
 	 * this codepath should work on that case. However, the current
 	 * flushing scheme makes sure to avoid this situation as we would be
 	 * making all these calls without having anything meaningful to write
 	 * to disk. We assert this behavior here.
 	 */
 	ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx));
 
 	/*
 	 * We can not flush while loading, because then we would
 	 * not load the ms_unflushed_{allocs,frees}.
 	 */
 	if (msp->ms_loading)
 		return (B_FALSE);
 
 	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
 	metaslab_verify_weight_and_frag(msp);
 
 	/*
 	 * Metaslab condensing is effectively flushing. Therefore if the
 	 * metaslab can be condensed we can just condense it instead of
 	 * flushing it.
 	 *
 	 * Note that metaslab_condense() does call metaslab_flush_update()
 	 * so we can just return immediately after condensing. We also
 	 * don't need to care about setting ms_flushing or broadcasting
 	 * ms_flush_cv, even if we temporarily drop the ms_lock in
 	 * metaslab_condense(), as the metaslab is already loaded.
 	 */
 	if (msp->ms_loaded && metaslab_should_condense(msp)) {
 		metaslab_group_t *mg = msp->ms_group;
 
 		/*
 		 * For all histogram operations below refer to the
 		 * comments of metaslab_sync() where we follow a
 		 * similar procedure.
 		 */
 		metaslab_group_histogram_verify(mg);
 		metaslab_class_histogram_verify(mg->mg_class);
 		metaslab_group_histogram_remove(mg, msp);
 
 		metaslab_condense(msp, tx);
 
 		/*
 		 * Rebuild the space map histogram from ms_allocatable and
 		 * the defer trees; ms_freed is asserted to be empty.
 		 */
 		space_map_histogram_clear(msp->ms_sm);
 		space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
 		ASSERT(zfs_range_tree_is_empty(msp->ms_freed));
 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 			space_map_histogram_add(msp->ms_sm,
 			    msp->ms_defer[t], tx);
 		}
 		metaslab_aux_histograms_update(msp);
 
 		metaslab_group_histogram_add(mg, msp);
 		metaslab_group_histogram_verify(mg);
 		metaslab_class_histogram_verify(mg->mg_class);
 
 		metaslab_verify_space(msp, dmu_tx_get_txg(tx));
 
 		/*
 		 * Since we recreated the histogram (and potentially
 		 * the ms_sm too while condensing) ensure that the
 		 * weight is updated too because we are not guaranteed
 		 * that this metaslab is dirty and will go through
 		 * metaslab_sync_done().
 		 */
 		metaslab_recalculate_weight_and_sort(msp);
 		return (B_TRUE);
 	}
 
 	/*
 	 * Regular flush: ms_flushing stays set from here until just before
 	 * we return, spanning the window in which ms_lock is dropped.
 	 */
 	msp->ms_flushing = B_TRUE;
 	uint64_t sm_len_before = space_map_length(msp->ms_sm);
 
 	/* Append the unflushed changes (allocs, then frees) to ms_sm. */
 	mutex_exit(&msp->ms_lock);
 	space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC,
 	    SM_NO_VDEVID, tx);
 	space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE,
 	    SM_NO_VDEVID, tx);
 	mutex_enter(&msp->ms_lock);
 
 	/* The length difference is only used for the debug message below. */
 	uint64_t sm_len_after = space_map_length(msp->ms_sm);
 	if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
 		zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, "
 		    "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, "
 		    "appended %llu bytes", (u_longlong_t)dmu_tx_get_txg(tx),
 		    spa_name(spa),
 		    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
 		    (u_longlong_t)msp->ms_id,
 		    (u_longlong_t)zfs_range_tree_space(
 		    msp->ms_unflushed_allocs),
 		    (u_longlong_t)zfs_range_tree_space(
 		    msp->ms_unflushed_frees),
 		    (u_longlong_t)(sm_len_after - sm_len_before));
 	}
 
 	/*
 	 * The unflushed changes are on disk now: subtract their memory
 	 * usage from the pool-wide accounting and empty both trees.
 	 */
 	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
 	    metaslab_unflushed_changes_memused(msp));
 	spa->spa_unflushed_stats.sus_memused -=
 	    metaslab_unflushed_changes_memused(msp);
 	zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
 	zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
 
 	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
 	metaslab_verify_weight_and_frag(msp);
 
 	metaslab_flush_update(msp, tx);
 
 	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
 	metaslab_verify_weight_and_frag(msp);
 
 	/* Flush complete; wake up anyone waiting on ms_flush_cv. */
 	msp->ms_flushing = B_FALSE;
 	cv_broadcast(&msp->ms_flush_cv);
 	return (B_TRUE);
 }
 
 /*
  * Write a metaslab to disk in the context of the specified transaction group.
  *
  * 'txg' selects which ms_allocating tree is written out. The function
  * returns early, without creating a transaction, for a metaslab that is
  * still marked ms_new or that has nothing to write and no pending forced
  * condense. Otherwise it takes ms_sync_lock and ms_lock itself; ms_lock is
  * dropped around the space map writes.
  */
 void
 metaslab_sync(metaslab_t *msp, uint64_t txg)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa_meta_objset(spa);
 	zfs_range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
 	dmu_tx_t *tx;
 
 	ASSERT(!vd->vdev_ishole);
 
 	/*
 	 * This metaslab has just been added so there's no work to do now.
 	 */
 	if (msp->ms_new) {
 		ASSERT0(zfs_range_tree_space(alloctree));
 		ASSERT0(zfs_range_tree_space(msp->ms_freeing));
 		ASSERT0(zfs_range_tree_space(msp->ms_freed));
 		ASSERT0(zfs_range_tree_space(msp->ms_checkpointing));
 		ASSERT0(zfs_range_tree_space(msp->ms_trim));
 		return;
 	}
 
 	/*
 	 * Normally, we don't want to process a metaslab if there are no
 	 * allocations or frees to perform. However, if the metaslab is being
 	 * forced to condense, it's loaded and we're not beyond the final
 	 * dirty txg, we need to let it through. Not condensing beyond the
 	 * final dirty txg prevents an issue where metaslabs that need to be
 	 * condensed but were loaded for other reasons could cause a panic
 	 * here. By only checking the txg in that branch of the conditional,
 	 * we preserve the utility of the VERIFY statements in all other
 	 * cases.
 	 */
 	if (zfs_range_tree_is_empty(alloctree) &&
 	    zfs_range_tree_is_empty(msp->ms_freeing) &&
 	    zfs_range_tree_is_empty(msp->ms_checkpointing) &&
 	    !(msp->ms_loaded && msp->ms_condense_wanted &&
 	    txg <= spa_final_dirty_txg(spa)))
 		return;
 
 
 	VERIFY3U(txg, <=, spa_final_dirty_txg(spa));
 
 	/*
 	 * The only state that can actually be changing concurrently
 	 * with metaslab_sync() is the metaslab's ms_allocatable. No
 	 * other thread can be modifying this txg's alloc, freeing,
 	 * freed, or space_map_phys_t.  We drop ms_lock whenever we
 	 * could call into the DMU, because the DMU can call down to
 	 * us (e.g. via zio_free()) at any time.
 	 *
 	 * The spa_vdev_remove_thread() can be reading metaslab state
 	 * concurrently, and it is locked out by the ms_sync_lock.
 	 * Note that the ms_lock is insufficient for this, because it
 	 * is dropped by space_map_write().
 	 */
 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 
 	/*
 	 * Generate a log space map if one doesn't exist already.
 	 */
 	spa_generate_syncing_log_sm(spa, tx);
 
 	/*
 	 * First write to this metaslab: allocate its space map object,
 	 * record the object number in the vdev_ms_array, and open it.
 	 */
 	if (msp->ms_sm == NULL) {
 		uint64_t new_object = space_map_alloc(mos,
 		    spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
 		    zfs_metaslab_sm_blksz_with_log :
 		    zfs_metaslab_sm_blksz_no_log, tx);
 		VERIFY3U(new_object, !=, 0);
 
 		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
 		    msp->ms_id, sizeof (uint64_t), &new_object, tx);
 
 		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
 		    msp->ms_start, msp->ms_size, vd->vdev_ashift));
 		ASSERT(msp->ms_sm != NULL);
 
 		ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs));
 		ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees));
 		ASSERT0(metaslab_allocated_space(msp));
 	}
 
 	/*
 	 * Likewise, create the vdev-wide checkpoint space map the first
 	 * time there is checkpointed space to record.
 	 */
 	if (!zfs_range_tree_is_empty(msp->ms_checkpointing) &&
 	    vd->vdev_checkpoint_sm == NULL) {
 		ASSERT(spa_has_checkpoint(spa));
 
 		uint64_t new_object = space_map_alloc(mos,
 		    zfs_vdev_standard_sm_blksz, tx);
 		VERIFY3U(new_object, !=, 0);
 
 		VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
 		    mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
 		ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
 
 		/*
 		 * We save the space map object as an entry in vdev_top_zap
 		 * so it can be retrieved when the pool is reopened after an
 		 * export or through zdb.
 		 */
 		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
 		    sizeof (new_object), 1, &new_object, tx));
 	}
 
 	mutex_enter(&msp->ms_sync_lock);
 	mutex_enter(&msp->ms_lock);
 
 	/*
 	 * Note: metaslab_condense() clears the space map's histogram.
 	 * Therefore we must verify and remove this histogram before
 	 * condensing.
 	 */
 	metaslab_group_histogram_verify(mg);
 	metaslab_class_histogram_verify(mg->mg_class);
 	metaslab_group_histogram_remove(mg, msp);
 
 	/* Condensing is only attempted in sync pass 1 on a loaded metaslab. */
 	if (spa->spa_sync_pass == 1 && msp->ms_loaded &&
 	    metaslab_should_condense(msp))
 		metaslab_condense(msp, tx);
 
 	/*
 	 * We'll be going to disk to sync our space accounting, thus we
 	 * drop the ms_lock during that time so allocations coming from
 	 * open-context (ZIL) for future TXGs do not block.
 	 */
 	mutex_exit(&msp->ms_lock);
 	space_map_t *log_sm = spa_syncing_log_sm(spa);
 	if (log_sm != NULL) {
 		/*
 		 * With a log space map, this txg's changes are appended to
 		 * the pool-wide log (tagged with the vdev id) rather than to
 		 * ms_sm, and are folded into the metaslab's unflushed trees.
 		 */
 		ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
 		if (metaslab_unflushed_txg(msp) == 0)
 			metaslab_unflushed_add(msp, tx);
 		else if (!metaslab_unflushed_dirty(msp))
 			metaslab_unflushed_bump(msp, tx, B_TRUE);
 
 		space_map_write(log_sm, alloctree, SM_ALLOC,
 		    vd->vdev_id, tx);
 		space_map_write(log_sm, msp->ms_freeing, SM_FREE,
 		    vd->vdev_id, tx);
 		mutex_enter(&msp->ms_lock);
 
 		/*
 		 * Subtract the old memory usage of the unflushed trees,
 		 * update them, then add the new usage back.
 		 */
 		ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
 		    metaslab_unflushed_changes_memused(msp));
 		spa->spa_unflushed_stats.sus_memused -=
 		    metaslab_unflushed_changes_memused(msp);
 		zfs_range_tree_remove_xor_add(alloctree,
 		    msp->ms_unflushed_frees, msp->ms_unflushed_allocs);
 		zfs_range_tree_remove_xor_add(msp->ms_freeing,
 		    msp->ms_unflushed_allocs, msp->ms_unflushed_frees);
 		spa->spa_unflushed_stats.sus_memused +=
 		    metaslab_unflushed_changes_memused(msp);
 	} else {
 		/* No log space map: write directly to the metaslab's own map. */
 		ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 		space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
 		    SM_NO_VDEVID, tx);
 		space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
 		    SM_NO_VDEVID, tx);
 		mutex_enter(&msp->ms_lock);
 	}
 
 	/* Apply this txg's allocations and frees to the running total. */
 	msp->ms_allocated_space += zfs_range_tree_space(alloctree);
 	ASSERT3U(msp->ms_allocated_space, >=,
 	    zfs_range_tree_space(msp->ms_freeing));
 	msp->ms_allocated_space -= zfs_range_tree_space(msp->ms_freeing);
 
 	if (!zfs_range_tree_is_empty(msp->ms_checkpointing)) {
 		ASSERT(spa_has_checkpoint(spa));
 		ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
 
 		/*
 		 * Since we are doing writes to disk and the ms_checkpointing
 		 * tree won't be changing during that time, we drop the
 		 * ms_lock while writing to the checkpoint space map, for the
 		 * same reason mentioned above.
 		 */
 		mutex_exit(&msp->ms_lock);
 		space_map_write(vd->vdev_checkpoint_sm,
 		    msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
 		mutex_enter(&msp->ms_lock);
 
 		/* Account for the checkpointed space pool-wide and per vdev. */
 		spa->spa_checkpoint_info.sci_dspace +=
 		    zfs_range_tree_space(msp->ms_checkpointing);
 		vd->vdev_stat.vs_checkpoint_space +=
 		    zfs_range_tree_space(msp->ms_checkpointing);
 		ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
 		    -space_map_allocated(vd->vdev_checkpoint_sm));
 
 		zfs_range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
 	}
 
 	if (msp->ms_loaded) {
 		/*
 		 * When the space map is loaded, we have an accurate
 		 * histogram in the range tree. This gives us an opportunity
 		 * to bring the space map's histogram up-to-date so we clear
 		 * it first before updating it.
 		 */
 		space_map_histogram_clear(msp->ms_sm);
 		space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
 
 		/*
 		 * Since we've cleared the histogram we need to add back
 		 * any free space that has already been processed, plus
 		 * any deferred space. This allows the on-disk histogram
 		 * to accurately reflect all free space even if some space
 		 * is not yet available for allocation (i.e. deferred).
 		 */
 		space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
 
 		/*
 		 * Add back any deferred free space that has not been
 		 * added back into the in-core free tree yet. This will
 		 * ensure that we don't end up with a space map histogram
 		 * that is completely empty unless the metaslab is fully
 		 * allocated.
 		 */
 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 			space_map_histogram_add(msp->ms_sm,
 			    msp->ms_defer[t], tx);
 		}
 	}
 
 	/*
 	 * Always add the free space from this sync pass to the space
 	 * map histogram. We want to make sure that the on-disk histogram
 	 * accounts for all free space. If the space map is not loaded,
 	 * then we will lose some accuracy but will correct it the next
 	 * time we load the space map.
 	 */
 	space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
 	metaslab_aux_histograms_update(msp);
 
 	/* Put the metaslab's histogram back into the group (removed above). */
 	metaslab_group_histogram_add(mg, msp);
 	metaslab_group_histogram_verify(mg);
 	metaslab_class_histogram_verify(mg->mg_class);
 
 	/*
 	 * For sync pass 1, we avoid traversing this txg's free range tree
 	 * and instead will just swap the pointers for freeing and freed.
 	 * We can safely do this since the freed_tree is guaranteed to be
 	 * empty on the initial pass.
 	 *
 	 * Keep in mind that even if we are currently using a log spacemap
 	 * we want current frees to end up in the ms_allocatable (but not
 	 * get appended to the ms_sm) so their ranges can be reused as usual.
 	 */
 	if (spa_sync_pass(spa) == 1) {
 		zfs_range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
 		ASSERT0(msp->ms_allocated_this_txg);
 	} else {
 		zfs_range_tree_vacate(msp->ms_freeing,
 		    zfs_range_tree_add, msp->ms_freed);
 	}
 	/* Record what was allocated this txg, then empty the alloc tree. */
 	msp->ms_allocated_this_txg += zfs_range_tree_space(alloctree);
 	zfs_range_tree_vacate(alloctree, NULL, NULL);
 
 	ASSERT0(zfs_range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
 	ASSERT0(zfs_range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
 	    & TXG_MASK]));
 	ASSERT0(zfs_range_tree_space(msp->ms_freeing));
 	ASSERT0(zfs_range_tree_space(msp->ms_checkpointing));
 
 	mutex_exit(&msp->ms_lock);
 
 	/*
 	 * Verify that the space map object ID has been recorded in the
 	 * vdev_ms_array.
 	 */
 	uint64_t object;
 	VERIFY0(dmu_read(mos, vd->vdev_ms_array,
 	    msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0));
 	VERIFY3U(object, ==, space_map_object(msp->ms_sm));
 
 	mutex_exit(&msp->ms_sync_lock);
 	dmu_tx_commit(tx);
 }
 
 /*
  * Passivate and unload a metaslab. Nothing happens if the metaslab is not
  * loaded or is disabled. The ms_allocating trees of the txgs following
  * 'txg' are verified to be empty first. The metaslab is passivated only
  * if it is assigned to an allocator, and unloaded only if
  * metaslab_debug_unload is clear.
  */
 static void
 metaslab_evict(metaslab_t *msp, uint64_t txg)
 {
 	if (!msp->ms_loaded)
 		return;
 	if (msp->ms_disabled != 0)
 		return;
 
 	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
 		uint64_t idx = (txg + t) & TXG_MASK;
 
 		VERIFY0(zfs_range_tree_space(msp->ms_allocating[idx]));
 	}
 
 	if (msp->ms_allocator != -1) {
 		/* Keep the weight, minus its activation bits. */
 		metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK);
 	}
 
 	if (!metaslab_debug_unload)
 		metaslab_unload(msp);
 }
 
 /*
  * Called after a transaction group has completely synced to mark
  * all of the metaslab's free space as usable.
  *
  * Takes ms_lock for the whole call. Frees from this txg (ms_freed) are
  * either parked in the defer tree selected by 'txg' or, when deferring is
  * not allowed, returned to ms_allocatable directly; the previous contents
  * of that defer tree are always returned to ms_allocatable. Space
  * accounting, the weight, and the per-txg counters are updated to match.
  */
 void
 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	zfs_range_tree_t **defer_tree;
 	int64_t alloc_delta, defer_delta;
 	boolean_t defer_allowed = B_TRUE;
 
 	ASSERT(!vd->vdev_ishole);
 
 	mutex_enter(&msp->ms_lock);
 
 	if (msp->ms_new) {
 		/* this is a new metaslab, add its capacity to the vdev */
 		metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
 
 		/* there should be no allocations nor frees at this point */
 		VERIFY0(msp->ms_allocated_this_txg);
 		VERIFY0(zfs_range_tree_space(msp->ms_freed));
 	}
 
 	ASSERT0(zfs_range_tree_space(msp->ms_freeing));
 	ASSERT0(zfs_range_tree_space(msp->ms_checkpointing));
 
 	/* The defer trees are used round-robin, indexed by txg. */
 	defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
 
 	/*
 	 * Frees are not deferred when the normal class's free space is at
 	 * or below the slop space, or when the vdev is being removed or is
 	 * undergoing raidz expansion.
 	 */
 	uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
 	    metaslab_class_get_alloc(spa_normal_class(spa));
 	if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing ||
 	    vd->vdev_rz_expanding) {
 		defer_allowed = B_FALSE;
 	}
 
 	/*
 	 * alloc_delta is this txg's allocations minus its frees.
 	 * defer_delta is the change in deferred space: the frees entering
 	 * the defer tree (if allowed) minus the space leaving it.
 	 */
 	defer_delta = 0;
 	alloc_delta = msp->ms_allocated_this_txg -
 	    zfs_range_tree_space(msp->ms_freed);
 
 	if (defer_allowed) {
 		defer_delta = zfs_range_tree_space(msp->ms_freed) -
 		    zfs_range_tree_space(*defer_tree);
 	} else {
 		defer_delta -= zfs_range_tree_space(*defer_tree);
 	}
 	metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
 	    defer_delta, 0);
 
 	if (spa_syncing_log_sm(spa) == NULL) {
 		/*
 		 * If there's a metaslab_load() in progress and we don't have
 		 * a log space map, it means that we probably wrote to the
 		 * metaslab's space map. If this is the case, we need to
 		 * make sure that we wait for the load to complete so that we
 		 * have a consistent view at the in-core side of the metaslab.
 		 */
 		metaslab_load_wait(msp);
 	} else {
 		ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 	}
 
 	/*
 	 * When auto-trimming is enabled, free ranges which are added to
 	 * ms_allocatable are also added to ms_trim.  The ms_trim tree is
 	 * periodically consumed by the vdev_autotrim_thread() which issues
 	 * trims for all ranges and then vacates the tree.  The ms_trim tree
 	 * can be discarded at any time with the sole consequence of recent
 	 * frees not being trimmed.
 	 */
 	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
 		zfs_range_tree_walk(*defer_tree, zfs_range_tree_add,
 		    msp->ms_trim);
 		if (!defer_allowed) {
 			zfs_range_tree_walk(msp->ms_freed, zfs_range_tree_add,
 			    msp->ms_trim);
 		}
 	} else {
 		zfs_range_tree_vacate(msp->ms_trim, NULL, NULL);
 	}
 
 	/*
 	 * Move the frees from the defer_tree back to the free
 	 * range tree (if it's loaded). Swap the freed_tree and
 	 * the defer_tree -- this is safe to do because we've
 	 * just emptied out the defer_tree.
 	 */
 	zfs_range_tree_vacate(*defer_tree,
 	    msp->ms_loaded ? zfs_range_tree_add : NULL, msp->ms_allocatable);
 	if (defer_allowed) {
 		zfs_range_tree_swap(&msp->ms_freed, defer_tree);
 	} else {
 		/* Not deferring: this txg's frees become allocatable now. */
 		zfs_range_tree_vacate(msp->ms_freed,
 		    msp->ms_loaded ? zfs_range_tree_add : NULL,
 		    msp->ms_allocatable);
 	}
 
 	msp->ms_synced_length = space_map_length(msp->ms_sm);
 
 	msp->ms_deferspace += defer_delta;
 	ASSERT3S(msp->ms_deferspace, >=, 0);
 	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
 	if (msp->ms_deferspace != 0) {
 		/*
 		 * Keep syncing this metaslab until all deferred frees
 		 * are back in circulation.
 		 */
 		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
 	}
 	metaslab_aux_histograms_update_done(msp, defer_allowed);
 
 	/* A new metaslab is now ready; count it in its group. */
 	if (msp->ms_new) {
 		msp->ms_new = B_FALSE;
 		mutex_enter(&mg->mg_lock);
 		mg->mg_ms_ready++;
 		mutex_exit(&mg->mg_lock);
 	}
 
 	/*
 	 * Re-sort metaslab within its group now that we've adjusted
 	 * its allocatable space.
 	 */
 	metaslab_recalculate_weight_and_sort(msp);
 
 	ASSERT0(zfs_range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
 	ASSERT0(zfs_range_tree_space(msp->ms_freeing));
 	ASSERT0(zfs_range_tree_space(msp->ms_freed));
 	ASSERT0(zfs_range_tree_space(msp->ms_checkpointing));
 	/* This txg's allocations are synced; reset the per-txg counter. */
 	msp->ms_allocating_total -= msp->ms_allocated_this_txg;
 	msp->ms_allocated_this_txg = 0;
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * Refresh a metaslab group's allocation state and fragmentation value and,
  * if the group is active, preload its metaslabs. All of this is done while
  * holding SCL_ALLOC as reader.
  */
 void
 metaslab_sync_reassess(metaslab_group_t *mg)
 {
 	spa_t *spa = mg->mg_class->mc_spa;
 
 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 
 	metaslab_group_alloc_update(mg);
 	mg->mg_fragmentation = metaslab_group_fragmentation(mg);
 
 	/*
 	 * Only active metaslab groups get their next potential metaslabs
 	 * preloaded. The check matters because metaslabs are dirtied as
 	 * a device is removed, so we can get here for a metaslab group
 	 * that is no longer active.
 	 */
 	if (mg->mg_activation_count > 0)
 		metaslab_group_preload(mg);
 
 	spa_config_exit(spa, SCL_ALLOC, FTAG);
 }
 
 /*
  * When writing a ditto block (i.e. more than one DVA for a given BP) on
  * the same vdev as an existing DVA of this BP, we try to allocate it on a
  * different metaslab than the existing DVAs (i.e. a unique metaslab).
  *
  * Returns B_TRUE if 'dva' is empty (asize of 0), lives on a different
  * vdev than 'msp', or lies in a different metaslab than 'msp'.
  */
 static boolean_t
 metaslab_is_unique(metaslab_t *msp, dva_t *dva)
 {
 	/* An unused DVA or one on another vdev can never collide. */
 	if (DVA_GET_ASIZE(dva) == 0 ||
 	    msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
 		return (B_TRUE);
 
 	/* Same vdev: derive the DVA's metaslab index from its offset. */
 	uint64_t dva_ms_id =
 	    DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
 
 	return (msp->ms_id != dva_ms_id);
 }
 
 /*
  * ==========================================================================
  * Metaslab allocation tracing facility
  * ==========================================================================
  */
 
 /*
  * Append an entry describing one allocation step to the trace list zal.
  * Does nothing when metaslab_trace_enabled is clear.
  *
  * mg, msp, psize, dva_id, offset and allocator are stored in the new entry
  * as given; the entry's weight is copied from msp when msp is non-NULL and
  * is zero otherwise.  Callers in this file pass either an allocated offset
  * or one of the TRACE_* codes as "offset".
  */
 static void
 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
     metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
     int allocator)
 {
 	if (!metaslab_trace_enabled)
 		return;
 
 	/*
 	 * A full list is trimmed by dropping its second entry before the
 	 * new one is added.  Keeping the very first entry preserves a clue
 	 * about which allocation steps were already performed.  Builds
 	 * with ZFS_DEBUG panic instead of trimming.
 	 */
 	if (zal->zal_size == metaslab_trace_max_entries) {
 #ifdef ZFS_DEBUG
 		panic("too many entries in allocation list");
 #endif
 		METASLABSTAT_BUMP(metaslabstat_trace_over_limit);
 
 		metaslab_alloc_trace_t *victim = list_next(&zal->zal_list,
 		    list_head(&zal->zal_list));
 		list_remove(&zal->zal_list, victim);
 		kmem_cache_free(metaslab_alloc_trace_cache, victim);
 		zal->zal_size--;
 	}
 
 	metaslab_alloc_trace_t *entry =
 	    kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
 
 	list_link_init(&entry->mat_list_node);
 	entry->mat_mg = mg;
 	entry->mat_msp = msp;
 	entry->mat_size = psize;
 	entry->mat_dva_id = dva_id;
 	entry->mat_offset = offset;
 	entry->mat_allocator = allocator;
 	entry->mat_weight = (msp != NULL) ? msp->ms_weight : 0;
 
 	/*
 	 * No locking is needed: the list belongs to the zio, and only one
 	 * thread performs allocations for a given zio.
 	 */
 	list_insert_tail(&zal->zal_list, entry);
 	zal->zal_size++;
 
 	ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
 }
 
 /*
  * Set up zal as an empty allocation trace list whose elements are
  * metaslab_alloc_trace_t structures linked through mat_list_node.
  */
 void
 metaslab_trace_init(zio_alloc_list_t *zal)
 {
 	zal->zal_size = 0;
 	list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
 	    offsetof(metaslab_alloc_trace_t, mat_list_node));
 }
 
 /*
  * Tear down an allocation trace list: return every remaining entry to
  * metaslab_alloc_trace_cache, destroy the list and reset its size.
  */
 void
 metaslab_trace_fini(zio_alloc_list_t *zal)
 {
 	for (;;) {
 		metaslab_alloc_trace_t *entry =
 		    list_remove_head(&zal->zal_list);
 
 		if (entry == NULL)
 			break;
 		kmem_cache_free(metaslab_alloc_trace_cache, entry);
 	}
 
 	list_destroy(&zal->zal_list);
 	zal->zal_size = 0;
 }
 
 /*
  * ==========================================================================
  * Metaslab block operations
  * ==========================================================================
  */
 
 /*
  * Account for a throttled allocation on the top-level vdev identified by
  * "vdev": add a reference, tagged with "tag", to the queue-depth refcount
  * of the given allocator in that vdev's metaslab group.
  *
  * Nothing is done unless flags has METASLAB_ASYNC_ALLOC set and
  * METASLAB_DONT_THROTTLE clear, and the group's class has its allocation
  * throttle enabled.
  */
 static void
 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, const void *tag,
     int flags, int allocator)
 {
 	boolean_t throttled = (flags & METASLAB_ASYNC_ALLOC) != 0 &&
 	    (flags & METASLAB_DONT_THROTTLE) == 0;
 
 	if (!throttled)
 		return;
 
 	metaslab_group_t *group = vdev_lookup_top(spa, vdev)->vdev_mg;
 
 	if (!group->mg_class->mc_alloc_throttle_enabled)
 		return;
 
 	metaslab_group_allocator_t *slot = &group->mg_allocator[allocator];
 
 	(void) zfs_refcount_add(&slot->mga_alloc_queue_depth, tag);
 }
 
 /*
  * Raise the current maximum allocation queue depth of the given allocator
  * in mg by one, as long as it is still below mg_max_alloc_queue_depth.
  * When the bump succeeds, mca_alloc_max_slots of the same allocator in the
  * group's class is incremented as well.  The depth is updated with a
  * compare-and-swap that is retried whenever the value changed underneath.
  */
 static void
 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
 {
 	metaslab_class_allocator_t *mca =
 	    &mg->mg_class->mc_allocator[allocator];
 	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 	const uint64_t limit = mg->mg_max_alloc_queue_depth;
 
 	for (uint64_t seen = mga->mga_cur_max_alloc_queue_depth;
 	    seen < limit; seen = mga->mga_cur_max_alloc_queue_depth) {
 		if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth,
 		    seen, seen + 1) != seen)
 			continue;	/* value changed; re-read and retry */
 
 		atomic_inc_64(&mca->mca_alloc_max_slots);
 		break;
 	}
 }
 
 /*
  * Counterpart of metaslab_group_alloc_increment(): remove the reference
  * tagged with "tag" from the queue-depth refcount of the given allocator
  * in the metaslab group of top-level vdev "vdev".  When io_complete is
  * set, metaslab_group_increment_qdepth() is called afterwards.
  *
  * Nothing is done unless flags has METASLAB_ASYNC_ALLOC set and
  * METASLAB_DONT_THROTTLE clear, and the group's class has its allocation
  * throttle enabled.
  */
 void
 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, const void *tag,
     int flags, int allocator, boolean_t io_complete)
 {
 	if ((flags & METASLAB_ASYNC_ALLOC) == 0)
 		return;
 	if ((flags & METASLAB_DONT_THROTTLE) != 0)
 		return;
 
 	metaslab_group_t *group = vdev_lookup_top(spa, vdev)->vdev_mg;
 
 	if (!group->mg_class->mc_alloc_throttle_enabled)
 		return;
 
 	metaslab_group_allocator_t *slot = &group->mg_allocator[allocator];
 
 	(void) zfs_refcount_remove(&slot->mga_alloc_queue_depth, tag);
 
 	if (io_complete)
 		metaslab_group_increment_qdepth(group, allocator);
 }
 
 /*
  * Check (only when built with ZFS_DEBUG) that, for every DVA of bp, "tag"
  * holds no reference on the queue-depth refcount of the given allocator in
  * the metaslab group of the DVA's top-level vdev.  A violation trips
  * VERIFY().  Without ZFS_DEBUG this function is empty.
  */
 void
 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, const void *tag,
     int allocator)
 {
 #ifdef ZFS_DEBUG
 	int ndvas = BP_GET_NDVAS(bp);
 
 	for (int d = 0; d < ndvas; d++) {
 		const dva_t *cur = &bp->blk_dva[d];
 		vdev_t *top = vdev_lookup_top(spa, DVA_GET_VDEV(cur));
 		metaslab_group_allocator_t *mga =
 		    &top->vdev_mg->mg_allocator[allocator];
 
 		VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag));
 	}
 #endif
 }
 
 /*
  * Try to allocate "size" bytes from msp on behalf of txg, using the
  * allocation op (msop_alloc) of the metaslab's class.  The caller must
  * hold ms_lock, and the metaslab must not be condensing, disabled or new.
  *
  * On success the range is removed from ms_allocatable, cleared from
  * ms_trim, added to this txg's ms_allocating tree and accounted in
  * ms_allocating_total; if that ms_allocating tree was empty, the metaslab
  * is dirtied for txg first.  ms_max_size is refreshed whether or not the
  * allocation succeeded.
  *
  * Returns the start offset of the allocated range, or -1ULL on failure.
  */
 static uint64_t
 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
 {
 	uint64_t start;
 	zfs_range_tree_t *rt = msp->ms_allocatable;
 	metaslab_class_t *mc = msp->ms_group->mg_class;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	VERIFY(!msp->ms_condensing);
 	VERIFY0(msp->ms_disabled);
 	VERIFY0(msp->ms_new);
 
 	/* The class-specific allocator picks the offset (or returns -1). */
 	start = mc->mc_ops->msop_alloc(msp, size);
 	if (start != -1ULL) {
 		metaslab_group_t *mg = msp->ms_group;
 		vdev_t *vd = mg->mg_vd;
 
 		/* Both offset and size must be aligned to the vdev's ashift. */
 		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 		VERIFY3U(zfs_range_tree_space(rt) - size, <=, msp->ms_size);
 		zfs_range_tree_remove(rt, start, size);
 		zfs_range_tree_clear(msp->ms_trim, start, size);
 
 		/* First allocation in this txg slot: mark the metaslab dirty. */
 		if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
 			vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
 
 		zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK], start,
 		    size);
 		msp->ms_allocating_total += size;
 
 		/* Track the last successful allocation */
 		msp->ms_alloc_txg = txg;
 		metaslab_verify_space(msp, txg);
 	}
 
 	/*
 	 * Now that we've attempted the allocation we need to update the
 	 * metaslab's maximum block size since it may have changed.
 	 */
 	msp->ms_max_size = metaslab_largest_allocatable(msp);
 	return (start);
 }
 
 /*
  * Find the metaslab with the highest weight that is less than what we've
  * already tried.  In the common case, this means that we will examine each
  * metaslab at most once. Note that concurrent callers could reorder metaslabs
  * by activation/passivation once we have dropped the mg_lock. If a metaslab is
  * activated by another thread, and we fail to allocate from the metaslab we
  * have selected, we may not try the newly-activated metaslab, and instead
  * activate another metaslab.  This is not optimal, but generally does not cause
  * any problems (a possible exception being if every metaslab is completely full
  * except for the newly-activated metaslab which we fail to examine).
  *
  * "search" is a caller-owned key describing where in mg_metaslab_tree to
  * resume; when a metaslab is returned, the key is updated from it (with
  * ms_start advanced by one) so that the next call continues after it.
  * *was_active is set from whether the candidate already belongs to an
  * allocator (ms_allocator != -1).  The first "d" entries of dva[] are the
  * DVAs checked with metaslab_is_unique() when want_unique is set.
  *
  * Returns the selected metaslab, or NULL when the tree is exhausted or,
  * unless try_hard is set, once more than zfs_metaslab_find_max_tries
  * candidates have been examined.  metaslab_group_alloc_normal() calls
  * this with mg_lock held.
  */
 static metaslab_t *
 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
     dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
     boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
     boolean_t *was_active)
 {
 	avl_index_t idx;
 	avl_tree_t *t = &mg->mg_metaslab_tree;
 	/* Start at the key itself if present, else at its successor. */
 	metaslab_t *msp = avl_find(t, search, &idx);
 	if (msp == NULL)
 		msp = avl_nearest(t, idx, AVL_AFTER);
 
 	uint_t tries = 0;
 	for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
 		int i;
 
 		/* Bound the walk unless the caller asked us to try hard. */
 		if (!try_hard && tries > zfs_metaslab_find_max_tries) {
 			METASLABSTAT_BUMP(metaslabstat_too_many_tries);
 			return (NULL);
 		}
 		tries++;
 
 		if (!metaslab_should_allocate(msp, asize, try_hard)) {
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_TOO_SMALL, allocator);
 			continue;
 		}
 
 		/*
 		 * If the selected metaslab is condensing or disabled, or
 		 * hasn't gone through a metaslab_sync_done(), then skip it.
 		 */
 		if (msp->ms_condensing || msp->ms_disabled > 0 || msp->ms_new)
 			continue;
 
 		*was_active = msp->ms_allocator != -1;
 		/*
 		 * If we're activating as primary, this is our first allocation
 		 * from this disk, so we don't need to check how close we are.
 		 * If the metaslab under consideration was already active,
 		 * we're getting desperate enough to steal another allocator's
 		 * metaslab, so we still don't care about distances.
 		 */
 		if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
 			break;
 
 		/*
 		 * Accept the candidate only if it passes the uniqueness check
 		 * against every DVA allocated so far; i == d means none of
 		 * them rejected it (always the case when !want_unique).
 		 */
 		for (i = 0; i < d; i++) {
 			if (want_unique &&
 			    !metaslab_is_unique(msp, &dva[i]))
 				break;  /* try another metaslab */
 		}
 		if (i == d)
 			break;
 	}
 
 	/* Remember where to resume if the caller comes back for more. */
 	if (msp != NULL) {
 		search->ms_weight = msp->ms_weight;
 		search->ms_start = msp->ms_start + 1;
 		search->ms_allocator = msp->ms_allocator;
 		search->ms_primary = msp->ms_primary;
 	}
 	return (msp);
 }
 
 /*
  * Cross-check the activation bits in msp->ms_weight against ms_allocator
  * and ms_primary.  The caller must hold ms_lock.  The checks only run when
  * ZFS_DEBUG_METASLAB_VERIFY is set in zfs_flags and the weight has at
  * least one bit of METASLAB_ACTIVE_MASK set:
  *
  *   PRIMARY:   no SECONDARY or CLAIM bit, allocator assigned, ms_primary set
  *   SECONDARY: no PRIMARY or CLAIM bit, allocator assigned, ms_primary clear
  *   CLAIM:     no PRIMARY or SECONDARY bit, no allocator assigned (-1)
  */
 static void
 metaslab_active_mask_verify(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0 ||
 	    (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0)
 		return;
 
 	if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) {
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
 		VERIFY3S(msp->ms_allocator, !=, -1);
 		VERIFY(msp->ms_primary);
 	} else if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) {
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
 		VERIFY3S(msp->ms_allocator, !=, -1);
 		VERIFY(!msp->ms_primary);
 	} else if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
 		VERIFY3S(msp->ms_allocator, ==, -1);
 	}
 }
 
 /*
  * Allocate asize bytes for txg from one of the metaslabs of group mg.
  *
  * The activation weight is derived from how many of the first "d" DVAs in
  * dva[] already live on this group's vdev: none selects
  * METASLAB_WEIGHT_PRIMARY, one METASLAB_WEIGHT_SECONDARY, and two or more
  * METASLAB_WEIGHT_CLAIM.  The function then loops: it picks the
  * allocator's active primary/secondary metaslab when one exists, or
  * searches the group with find_valid_metaslab() otherwise, activates the
  * candidate and tries metaslab_block_alloc() on it.  Candidates that turn
  * out to be unusable are skipped (and re-weighted where appropriate).
  *
  * Returns the allocated offset, or -1ULL once no candidate metaslab is
  * left.  Allocation attempts and several of the skip reasons are recorded
  * in zal via metaslab_trace_add().
  */
 static uint64_t
 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
     uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
     int allocator, boolean_t try_hard)
 {
 	metaslab_t *msp = NULL;
 	uint64_t offset = -1ULL;
 
 	/* Downgrade the weight once per earlier DVA found on this vdev. */
 	uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY;
 	for (int i = 0; i < d; i++) {
 		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
 		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
 			activation_weight = METASLAB_WEIGHT_SECONDARY;
 		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
 		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
 			activation_weight = METASLAB_WEIGHT_CLAIM;
 			break;
 		}
 	}
 
 	/*
 	 * If we don't have enough metaslabs active to fill the entire array, we
 	 * just use the 0th slot.
 	 */
 	if (mg->mg_ms_ready < mg->mg_allocators * 3)
 		allocator = 0;
 	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 
 	ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
 
 	/*
 	 * "search" is not a real metaslab; it is only used as the lookup
 	 * key that find_valid_metaslab() passes to avl_find().
 	 */
 	metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
 	search->ms_weight = UINT64_MAX;
 	search->ms_start = 0;
 	/*
 	 * At the end of the metaslab tree are the already-active metaslabs,
 	 * first the primaries, then the secondaries. When we resume searching
 	 * through the tree, we need to consider ms_allocator and ms_primary so
 	 * we start in the location right after where we left off, and don't
 	 * accidentally loop forever considering the same metaslabs.
 	 */
 	search->ms_allocator = -1;
 	search->ms_primary = B_TRUE;
 	for (;;) {
 		boolean_t was_active = B_FALSE;
 
 		mutex_enter(&mg->mg_lock);
 
 		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
 		    mga->mga_primary != NULL) {
 			msp = mga->mga_primary;
 
 			/*
 			 * Even though we don't hold the ms_lock for the
 			 * primary metaslab, those fields should not
 			 * change while we hold the mg_lock. Thus it is
 			 * safe to make assertions on them.
 			 */
 			ASSERT(msp->ms_primary);
 			ASSERT3S(msp->ms_allocator, ==, allocator);
 			ASSERT(msp->ms_loaded);
 
 			was_active = B_TRUE;
 			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
 		    mga->mga_secondary != NULL) {
 			msp = mga->mga_secondary;
 
 			/*
 			 * See comment above about the similar assertions
 			 * for the primary metaslab.
 			 */
 			ASSERT(!msp->ms_primary);
 			ASSERT3S(msp->ms_allocator, ==, allocator);
 			ASSERT(msp->ms_loaded);
 
 			was_active = B_TRUE;
 			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 		} else {
 			msp = find_valid_metaslab(mg, activation_weight, dva, d,
 			    want_unique, asize, allocator, try_hard, zal,
 			    search, &was_active);
 		}
 
 		mutex_exit(&mg->mg_lock);
 		/* No candidate left in this group: report failure. */
 		if (msp == NULL) {
 			kmem_free(search, sizeof (*search));
 			return (-1ULL);
 		}
 		mutex_enter(&msp->ms_lock);
 
 		metaslab_active_mask_verify(msp);
 
 		/*
 		 * This probe is compiled out because of issues with
 		 * tracepoints in non-GPL kernel modules.
 		 */
 #if 0
 		DTRACE_PROBE3(ms__activation__attempt,
 		    metaslab_t *, msp, uint64_t, activation_weight,
 		    boolean_t, was_active);
 #endif
 
 		/*
 		 * Ensure that the metaslab we have selected is still
 		 * capable of handling our request. It's possible that
 		 * another thread may have changed the weight while we
 		 * were blocked on the metaslab lock. We check the
 		 * active status first to see if we need to select
 		 * a new metaslab.
 		 */
 		if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
 			ASSERT3S(msp->ms_allocator, ==, -1);
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		/*
 		 * If the metaslab was activated for another allocator
 		 * while we were waiting in the ms_lock above, or it's
 		 * a primary and we're seeking a secondary (or vice versa),
 		 * we go back and select a new metaslab.
 		 */
 		if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
 		    (msp->ms_allocator != -1) &&
 		    (msp->ms_allocator != allocator || ((activation_weight ==
 		    METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
 			ASSERT(msp->ms_loaded);
 			ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) ||
 			    msp->ms_allocator != -1);
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		/*
 		 * This metaslab was used for claiming regions allocated
 		 * by the ZIL during pool import. Once these regions are
 		 * claimed we don't need to keep the CLAIM bit set
 		 * anymore. Passivate this metaslab to zero its activation
 		 * mask.
 		 */
 		if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
 		    activation_weight != METASLAB_WEIGHT_CLAIM) {
 			ASSERT(msp->ms_loaded);
 			ASSERT3S(msp->ms_allocator, ==, -1);
 			metaslab_passivate(msp, msp->ms_weight &
 			    ~METASLAB_WEIGHT_CLAIM);
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		metaslab_set_selected_txg(msp, txg);
 
 		int activation_error =
 		    metaslab_activate(msp, allocator, activation_weight);
 		metaslab_active_mask_verify(msp);
 
 		/*
 		 * If the metaslab was activated by another thread for
 		 * another allocator or activation_weight (EBUSY), or it
 		 * failed because another metaslab was assigned as primary
 		 * for this allocator (EEXIST) we continue using this
 		 * metaslab for our allocation, rather than going on to a
 		 * worse metaslab (we waited for that metaslab to be loaded
 		 * after all).
 		 *
 		 * If the activation failed due to an I/O error or ENOSPC we
 		 * skip to the next metaslab.
 		 */
 		boolean_t activated;
 		if (activation_error == 0) {
 			activated = B_TRUE;
 		} else if (activation_error == EBUSY ||
 		    activation_error == EEXIST) {
 			activated = B_FALSE;
 		} else {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 		ASSERT(msp->ms_loaded);
 
 		/*
 		 * Now that we have the lock, recheck to see if we should
 		 * continue to use this metaslab for this allocation. The
 		 * metaslab is now loaded so metaslab_should_allocate()
 		 * can accurately determine if the allocation attempt should
 		 * proceed.
 		 */
 		if (!metaslab_should_allocate(msp, asize, try_hard)) {
 			/* Passivate this metaslab and select a new one. */
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_TOO_SMALL, allocator);
 			goto next;
 		}
 
 		/*
 		 * If this metaslab is currently condensing then pick again
 		 * as we can't manipulate this metaslab until it's committed
 		 * to disk. If this metaslab is being initialized, we shouldn't
 		 * allocate from it since the allocated region might be
 		 * overwritten after allocation.
 		 */
 		if (msp->ms_condensing) {
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_CONDENSING, allocator);
 			if (activated) {
 				metaslab_passivate(msp, msp->ms_weight &
 				    ~METASLAB_ACTIVE_MASK);
 			}
 			mutex_exit(&msp->ms_lock);
 			continue;
 		} else if (msp->ms_disabled > 0) {
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_DISABLED, allocator);
 			if (activated) {
 				metaslab_passivate(msp, msp->ms_weight &
 				    ~METASLAB_ACTIVE_MASK);
 			}
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		offset = metaslab_block_alloc(msp, asize, txg);
 		metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
 
 		if (offset != -1ULL) {
 			/* Proactively passivate the metaslab, if needed */
 			if (activated)
 				metaslab_segment_may_passivate(msp);
 			break;
 		}
 next:
 		ASSERT(msp->ms_loaded);
 
 		/*
 		 * This probe is compiled out because of issues with
 		 * tracepoints in non-GPL kernel modules.
 		 */
 #if 0
 		DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp,
 		    uint64_t, asize);
 #endif
 
 		/*
 		 * We were unable to allocate from this metaslab so determine
 		 * a new weight for this metaslab. Now that we have loaded
 		 * the metaslab we can provide a better hint to the metaslab
 		 * selector.
 		 *
 		 * For space-based metaslabs, we use the maximum block size.
 		 * This information is only available when the metaslab
 		 * is loaded and is more accurate than the generic free
 		 * space weight that was calculated by metaslab_weight().
 		 * This information allows us to quickly compare the maximum
 		 * available allocation in the metaslab to the allocation
 		 * size being requested.
 		 *
 		 * For segment-based metaslabs, determine the new weight
 		 * based on the highest bucket in the range tree. We
 		 * explicitly use the loaded segment weight (i.e. the range
 		 * tree histogram) since it contains the space that is
 		 * currently available for allocation and is accurate
 		 * even within a sync pass.
 		 */
 		uint64_t weight;
 		if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
 			weight = metaslab_largest_allocatable(msp);
 			WEIGHT_SET_SPACEBASED(weight);
 		} else {
 			weight = metaslab_weight_from_range_tree(msp);
 		}
 
 		if (activated) {
 			metaslab_passivate(msp, weight);
 		} else {
 			/*
 			 * For the case where we use the metaslab that is
 			 * active for another allocator we want to make
 			 * sure that we retain the activation mask.
 			 *
 			 * Note that we could attempt to use something like
 			 * metaslab_recalculate_weight_and_sort() that
 			 * retains the activation mask here. That function
 			 * uses metaslab_weight() to set the weight though
 			 * which is not as accurate as the calculations
 			 * above.
 			 */
 			weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
 			metaslab_group_sort(mg, msp, weight);
 		}
 		metaslab_active_mask_verify(msp);
 
 		/*
 		 * We have just failed an allocation attempt, check
 		 * that metaslab_should_allocate() agrees. Otherwise,
 		 * we may end up in an infinite loop retrying the same
 		 * metaslab.
 		 */
 		ASSERT(!metaslab_should_allocate(msp, asize, try_hard));
 
 		mutex_exit(&msp->ms_lock);
 	}
 	/*
 	 * The loop is only left through the break that follows a successful
 	 * allocation, at which point ms_lock is still held.
 	 */
 	mutex_exit(&msp->ms_lock);
 	kmem_free(search, sizeof (*search));
 	return (offset);
 }
 
 /*
  * Allocate asize bytes from mg through metaslab_group_alloc_normal() and
  * update the group's allocation counters under mg_lock.
  *
  * Every call bumps mg_allocations.  A failed call (offset of -1ULL) also
  * bumps mg_failed_allocations, records TRACE_GROUP_FAILURE in zal and,
  * when asize equals SPA_GANGBLOCKSIZE, sets mg_no_free_space.
  *
  * Returns the allocated offset, or -1ULL on failure.
  */
 static uint64_t
 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
     uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
     int allocator, boolean_t try_hard)
 {
 	uint64_t offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
 	    want_unique, dva, d, allocator, try_hard);
 
 	mutex_enter(&mg->mg_lock);
 	mg->mg_allocations++;
 
 	if (offset != -1ULL) {
 		mutex_exit(&mg->mg_lock);
 		return (offset);
 	}
 
 	mg->mg_failed_allocations++;
 	metaslab_trace_add(zal, mg, NULL, asize, d, TRACE_GROUP_FAILURE,
 	    allocator);
 
 	/*
 	 * Failing to allocate even the minimum gang block size means this
 	 * metaslab group must be out of space.  Flag that so the
 	 * allocation throttle starts skipping allocation attempts to this
 	 * group until more space becomes available.  The throttle itself
 	 * cannot have caused this failure: it is only responsible for
 	 * skipping devices, not for failing block allocations.
 	 */
 	if (asize == SPA_GANGBLOCKSIZE)
 		mg->mg_no_free_space = B_TRUE;
 
 	mutex_exit(&mg->mg_lock);
 	return (offset);
 }
 
 /*
  * Allocate a block for the specified i/o.
  *
  * Fills in dva[d] with an allocation of psize bytes (converted to the
  * vdev's allocated size) from metaslab class mc for txg.  Starting from a
  * metaslab group chosen from hintdva, from the previous DVA (dva[d - 1])
  * or from the allocator's rotor, each group of the class is tried in turn
  * through metaslab_group_alloc().  If that pass fails, a second pass with
  * try_hard set may follow.
  *
  * Returns 0 on success.  Returns ENOSPC when no group could satisfy the
  * request (dva[d] is zeroed in that case) or when ganging is being forced
  * for testing.  Decisions along the way are recorded in zal.
  */
 int
 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
     zio_alloc_list_t *zal, int allocator)
 {
 	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
 	metaslab_group_t *mg, *rotor;
 	vdev_t *vd;
 	boolean_t try_hard = B_FALSE;
 
 	ASSERT(!DVA_IS_VALID(&dva[d]));
 
 	/*
 	 * For testing, make some blocks above a certain size be gang blocks.
 	 * This will result in more split blocks when using device removal,
 	 * and a large number of split blocks coupled with ztest-induced
 	 * damage can result in extremely long reconstruction times.  This
 	 * will also test spilling from special to normal.
 	 */
 	if (psize >= metaslab_force_ganging &&
 	    metaslab_force_ganging_pct > 0 &&
 	    (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) {
 		metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
 		    allocator);
 		return (SET_ERROR(ENOSPC));
 	}
 
 	/*
 	 * Start at the rotor and loop through all mgs until we find something.
 	 * Note that there's no locking on mca_rotor or mca_aliquot because
 	 * nothing actually breaks if we miss a few updates -- we just won't
 	 * allocate quite as evenly.  It all balances out over time.
 	 *
 	 * If we are doing ditto or log blocks, try to spread them across
 	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
 	 * allocated all of our ditto blocks, then try and spread them out on
 	 * that vdev as much as possible.  If it turns out to not be possible,
 	 * gradually lower our standards until anything becomes acceptable.
 	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
 	 * gives us hope of containing our fault domains to something we're
 	 * able to reason about.  Otherwise, any two top-level vdev failures
 	 * will guarantee the loss of data.  With consecutive allocation,
 	 * only two adjacent top-level vdev failures will result in data loss.
 	 *
 	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
 	 * ourselves on the same vdev as our gang block header.  That
 	 * way, we can hope for locality in vdev_cache, plus it makes our
 	 * fault domains something tractable.
 	 */
 	if (hintdva) {
 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
 
 		/*
 		 * It's possible the vdev we're using as the hint no
 		 * longer exists or its mg has been closed (e.g. by
 		 * device removal).  Consult the rotor when
 		 * all else fails.
 		 */
 		if (vd != NULL && vd->vdev_mg != NULL) {
 			mg = vdev_get_mg(vd, mc);
 
 			if (flags & METASLAB_HINTBP_AVOID)
 				mg = mg->mg_next;
 		} else {
 			mg = mca->mca_rotor;
 		}
 	} else if (d != 0) {
 		/* Continue with the group after the previous DVA's vdev. */
 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
 		mg = vd->vdev_mg->mg_next;
 	} else {
 		ASSERT(mca->mca_rotor != NULL);
 		mg = mca->mca_rotor;
 	}
 
 	/*
 	 * If the hint put us into the wrong metaslab class, or into a
 	 * metaslab group that has been passivated, just follow the rotor.
 	 */
 	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
 		mg = mca->mca_rotor;
 
 	/* Remember the starting group so the loop below can detect a full lap. */
 	rotor = mg;
 top:
 	do {
 		boolean_t allocatable;
 
 		ASSERT(mg->mg_activation_count == 1);
 		vd = mg->mg_vd;
 
 		/*
 		 * Don't allocate from faulted devices.
 		 */
 		if (try_hard) {
 			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
 			allocatable = vdev_allocatable(vd);
 			spa_config_exit(spa, SCL_ZIO, FTAG);
 		} else {
 			allocatable = vdev_allocatable(vd);
 		}
 
 		/*
 		 * Determine if the selected metaslab group is eligible
 		 * for allocations. If we're ganging then don't allow
 		 * this metaslab group to skip allocations since that would
 		 * inadvertently return ENOSPC and suspend the pool
 		 * even though space is still available.
 		 */
 		if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
 			allocatable = metaslab_group_allocatable(mg, rotor,
 			    flags, psize, allocator, d);
 		}
 
 		if (!allocatable) {
 			metaslab_trace_add(zal, mg, NULL, psize, d,
 			    TRACE_NOT_ALLOCATABLE, allocator);
 			goto next;
 		}
 
 		/*
 		 * Avoid writing single-copy data to an unhealthy,
 		 * non-redundant vdev, unless we've already tried all
 		 * other vdevs.
 		 */
 		if (vd->vdev_state < VDEV_STATE_HEALTHY &&
 		    d == 0 && !try_hard && vd->vdev_children == 0) {
 			metaslab_trace_add(zal, mg, NULL, psize, d,
 			    TRACE_VDEV_ERROR, allocator);
 			goto next;
 		}
 
 		ASSERT(mg->mg_class == mc);
 
 		uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg);
 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
 
 		/*
 		 * If we don't need to try hard, then require that the
 		 * block be on a different metaslab from any other DVAs
 		 * in this BP (unique=true).  If we are trying hard, then
 		 * allow any metaslab to be used (unique=false).
 		 */
 		uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
 		    !try_hard, dva, d, allocator, try_hard);
 
 		if (offset != -1ULL) {
 			/*
 			 * If we've just selected this metaslab group,
 			 * figure out whether the corresponding vdev is
 			 * over- or under-used relative to the pool,
 			 * and set an allocation bias to even it out.
 			 *
 			 * Bias is also used to compensate for unequally
 			 * sized vdevs so that space is allocated fairly.
 			 */
 			if (mca->mca_aliquot == 0 && metaslab_bias_enabled) {
 				vdev_stat_t *vs = &vd->vdev_stat;
 				int64_t vs_free = vs->vs_space - vs->vs_alloc;
 				int64_t mc_free = mc->mc_space - mc->mc_alloc;
 				int64_t ratio;
 
 				/*
 				 * Calculate how much more or less we should
 				 * try to allocate from this device during
 				 * this iteration around the rotor.
 				 *
 				 * This basically introduces a zero-centered
 				 * bias towards the devices with the most
 				 * free space, while compensating for vdev
 				 * size differences.
 				 *
 				 * Examples:
 				 *  vdev V1 = 16M/128M
 				 *  vdev V2 = 16M/128M
 				 *  ratio(V1) = 100% ratio(V2) = 100%
 				 *
 				 *  vdev V1 = 16M/128M
 				 *  vdev V2 = 64M/128M
 				 *  ratio(V1) = 127% ratio(V2) =  72%
 				 *
 				 *  vdev V1 = 16M/128M
 				 *  vdev V2 = 64M/512M
 				 *  ratio(V1) =  40% ratio(V2) = 160%
 				 */
 				ratio = (vs_free * mc->mc_alloc_groups * 100) /
 				    (mc_free + 1);
 				mg->mg_bias = ((ratio - 100) *
 				    (int64_t)mg->mg_aliquot) / 100;
 			} else if (!metaslab_bias_enabled) {
 				mg->mg_bias = 0;
 			}
 
 			/*
 			 * Advance the rotor for ZIL allocations, or once
 			 * this group has received its aliquot plus bias.
 			 */
 			if ((flags & METASLAB_ZIL) ||
 			    atomic_add_64_nv(&mca->mca_aliquot, asize) >=
 			    mg->mg_aliquot + mg->mg_bias) {
 				mca->mca_rotor = mg->mg_next;
 				mca->mca_aliquot = 0;
 			}
 
 			/* Publish the result in the caller's DVA. */
 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
 			DVA_SET_OFFSET(&dva[d], offset);
 			DVA_SET_GANG(&dva[d],
 			    ((flags & METASLAB_GANG_HEADER) ? 1 : 0));
 			DVA_SET_ASIZE(&dva[d], asize);
 
 			return (0);
 		}
 next:
 		/* This group did not work out; move the rotor past it. */
 		mca->mca_rotor = mg->mg_next;
 		mca->mca_aliquot = 0;
 	} while ((mg = mg->mg_next) != rotor);
 
 	/*
 	 * If we haven't tried hard, perhaps do so now.
 	 */
 	if (!try_hard && (zfs_metaslab_try_hard_before_gang ||
 	    GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 ||
 	    psize <= 1 << spa->spa_min_ashift)) {
 		METASLABSTAT_BUMP(metaslabstat_try_hard);
 		try_hard = B_TRUE;
 		goto top;
 	}
 
 	/* Every group failed: leave dva[d] zeroed for the caller. */
 	memset(&dva[d], 0, sizeof (dva_t));
 
 	metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
 	return (SET_ERROR(ENOSPC));
 }
 
 /*
  * Record the range [offset, offset + asize) of concrete vdev vd as freed.
  * The range is added to the ms_checkpointing tree of the metaslab that
  * contains offset when "checkpoint" is set, and to its ms_freeing tree
  * otherwise.  If both trees were empty beforehand, the metaslab is first
  * dirtied for the currently syncing txg.
  *
  * The range must lie entirely within one metaslab and be aligned to the
  * vdev's ashift, and the metaslab must not be condensing; all of this is
  * enforced with VERIFY().
  */
 void
 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
     boolean_t checkpoint)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
 
 	metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	VERIFY(!msp->ms_condensing);
 	VERIFY3U(offset, >=, msp->ms_start);
 	VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 	VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
 
 	metaslab_check_free_impl(vd, offset, asize);
 
 	mutex_enter(&msp->ms_lock);
 
 	/* Nothing queued yet means the metaslab has to be dirtied now. */
 	boolean_t idle = zfs_range_tree_is_empty(msp->ms_freeing) &&
 	    zfs_range_tree_is_empty(msp->ms_checkpointing);
 	if (idle)
 		vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
 
 	if (!checkpoint) {
 		zfs_range_tree_add(msp->ms_freeing, offset, asize);
 	} else {
 		ASSERT(spa_has_checkpoint(spa));
 		zfs_range_tree_add(msp->ms_checkpointing, offset, asize);
 	}
 
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * Callback passed to a vdev's vdev_op_remap by metaslab_free_impl().
  * For the segment (vd, offset, size): if vd has a remap op of its own,
  * the segment is marked obsolete; otherwise it is freed through
  * metaslab_free_impl().  "arg" must point to the boolean_t checkpoint
  * flag; inner_offset is unused.
  */
 void
 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	boolean_t *checkpoint = arg;
 
 	(void) inner_offset;
 	ASSERT3P(checkpoint, !=, NULL);
 
 	if (vd->vdev_ops->vdev_op_remap == NULL) {
 		metaslab_free_impl(vd, offset, size, *checkpoint);
 		return;
 	}
 
 	vdev_indirect_mark_obsolete(vd, offset, size);
 }
 
 /*
  * Free the range (offset, size) on vd, dispatching on the kind of vdev:
  *
  *  - a concrete vdev that is the target of the in-progress removal goes
  *    through free_from_removing_vdev();
  *  - a vdev with a remap op is marked obsolete and remapped, with
  *    metaslab_free_impl_cb() handling each mapped segment;
  *  - anything else is freed with metaslab_free_concrete().
  *
  * Nothing is freed once the syncing txg is past the pool's freeze txg.
  */
 static void
 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
     boolean_t checkpoint)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
 	if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
 		return;
 
 	/*
 	 * Note: the vdev must also be concrete to count as "being
 	 * removed", because completing a removal first turns the vdev into
 	 * an indirect vdev (in open context) and only afterwards (in
 	 * syncing context) clears spa_vdev_removal.
 	 */
 	boolean_t removing = spa->spa_vdev_removal != NULL &&
 	    spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
 	    vdev_is_concrete(vd);
 
 	if (removing) {
 		free_from_removing_vdev(vd, offset, size);
 		return;
 	}
 
 	if (vd->vdev_ops->vdev_op_remap == NULL) {
 		metaslab_free_concrete(vd, offset, size, checkpoint);
 		return;
 	}
 
 	vdev_indirect_mark_obsolete(vd, offset, size);
 	vd->vdev_ops->vdev_op_remap(vd, offset, size,
 	    metaslab_free_impl_cb, &checkpoint);
 }
 
 /*
  * Argument block handed to remap_blkptr_cb() while a block pointer is
  * being remapped.
  */
 typedef struct remap_blkptr_cb_arg {
 	/* Block pointer whose dva[0] and physical birth get rewritten. */
 	blkptr_t *rbca_bp;
 	/* Optional caller callback; remap_blkptr_cb() skips it when NULL. */
 	spa_remap_cb_t rbca_cb;
 	/* Vdev reported to the next rbca_cb invocation. */
 	vdev_t *rbca_remap_vd;
 	/* Offset reported to the next rbca_cb invocation. */
 	uint64_t rbca_remap_offset;
 	/* Opaque argument passed through to rbca_cb. */
 	void *rbca_cb_arg;
 } remap_blkptr_cb_arg_t;
 
 /*
  * Remap callback that rewrites dva[0] of the block pointer in "arg" (a
  * remap_blkptr_cb_arg_t) to point at (vd, offset).
  *
  * Segments whose size differs from the asize of dva[0] belong to a split
  * block and are ignored.  Otherwise the caller's callback, if any, is
  * invoked on the previously recorded vdev/offset, the BP's physical birth
  * is taken from the indirect births of the vdev dva[0] currently refers
  * to, and dva[0] is then pointed at the new location.
  */
 static void
 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	remap_blkptr_cb_arg_t *rbca = arg;
 	blkptr_t *bp = rbca->rbca_bp;
 	dva_t *dva0 = &bp->blk_dva[0];
 
 	/* We can not remap split blocks. */
 	if (DVA_GET_ASIZE(dva0) != size)
 		return;
 	ASSERT0(inner_offset);
 
 	if (rbca->rbca_cb != NULL) {
 		/*
 		 * Split blocks were filtered out above, so the callback is
 		 * invoked on the previous vdev, which must be indirect.
 		 */
 		ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
 
 		rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
 		    rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
 
 		/* Record this location for the next invocation. */
 		rbca->rbca_remap_offset = offset;
 		rbca->rbca_remap_vd = vd;
 	}
 
 	/*
 	 * The phys birth time is that of dva[0].  Knowing when each dva
 	 * was written lets resilver determine which blocks need to be
 	 * scrubbed (i.e. those written during the time the vdev was
 	 * offline).  It also keeps the ARC hash table key unique (i.e.
 	 * dva[0] + phys_birth): if the phys_birth were left unchanged, an
 	 * ARC lookup for a remapped BP could find the data that was
 	 * previously stored at this vdev + offset.
 	 */
 	vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, DVA_GET_VDEV(dva0));
 	uint64_t physical_birth = vdev_indirect_births_physbirth(
 	    oldvd->vdev_indirect_births, DVA_GET_OFFSET(dva0),
 	    DVA_GET_ASIZE(dva0));
 	BP_SET_PHYSICAL_BIRTH(bp, physical_birth);
 
 	DVA_SET_VDEV(dva0, vd->vdev_id);
 	DVA_SET_OFFSET(dva0, offset);
 }
 
 /*
  * Rewrite an indirect dva[0] of the block pointer so that it refers to the
  * location the data is mapped to.  This is not always possible: when the
  * indirect DVA spans several segments of the mapping (a "split block") the
  * DVA is left as it was.
  *
  * When a callback is given it is invoked for the original DVA; it may be
  * invoked more than once if that DVA maps to another indirect DVA, etc.
  *
  * Returns B_TRUE if dva[0] now refers to a different vdev, B_FALSE
  * otherwise.
  */
 boolean_t
 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
 {
 	if (!zfs_remap_blkptr_enable)
 		return (B_FALSE);
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
 		return (B_FALSE);
 
 	/*
 	 * ddt_phys_select() relies on DVA[0] in the BP matching the one in
 	 * the DDT (dedup table), so dedup BPs are never remapped.
 	 */
 	if (BP_GET_DEDUP(bp))
 		return (B_FALSE);
 
 	/*
 	 * zio_checksum_gang_verifier() relies on the DVA[0] in the BP used
 	 * to read the gang block header (GBH) matching the DVA[0] that was
 	 * allocated for the GBH, so gang blocks are never remapped.
 	 */
 	if (BP_IS_GANG(bp))
 		return (B_FALSE);
 
 	/* An embedded BP has no DVA at all. */
 	if (BP_GET_NDVAS(bp) < 1)
 		return (B_FALSE);
 
 	/*
 	 * Only dva[0] is remapped; doing the same for the other dvas would
 	 * lose track of their phys birth txg.
 	 */
 	dva_t *dva0 = &bp->blk_dva[0];
 	uint64_t start = DVA_GET_OFFSET(dva0);
 	uint64_t asize = DVA_GET_ASIZE(dva0);
 	vdev_t *top = vdev_lookup_top(spa, DVA_GET_VDEV(dva0));
 
 	if (top->vdev_ops->vdev_op_remap == NULL)
 		return (B_FALSE);
 
 	remap_blkptr_cb_arg_t state = {
 		.rbca_bp = bp,
 		.rbca_cb = callback,
 		.rbca_remap_vd = top,
 		.rbca_remap_offset = start,
 		.rbca_cb_arg = arg,
 	};
 
 	/*
 	 * remap_blkptr_cb() runs once per level of indirection, in order,
 	 * until a concrete vdev is reached or a split block is found.  The
 	 * callback itself advances rbca_remap_vd and rbca_remap_offset as
 	 * it moves from one vdev to the next.
 	 */
 	top->vdev_ops->vdev_op_remap(top, start, asize, remap_blkptr_cb,
 	    &state);
 
 	/* A split block leaves dva[0] on the vdev we started from. */
 	boolean_t remapped =
 	    (DVA_GET_VDEV(dva0) == top->vdev_id) ? B_FALSE : B_TRUE;
 
 	return (remapped);
 }
 
 /*
  * Undo the allocation of a DVA which happened in the given transaction group.
  *
  * The range is taken back out of the metaslab's allocating tree for txg and
  * returned to its allocatable tree.  Nothing is done when txg is beyond
  * spa_freeze_txg().  A DVA that fails validation is reported through
  * zfs_panic_recover() and otherwise ignored.
  */
 void
 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 {
 	metaslab_t *msp;
 	vdev_t *vd;
 	uint64_t vdev = DVA_GET_VDEV(dva);
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t size = DVA_GET_ASIZE(dva);
 
 	ASSERT(DVA_IS_VALID(dva));
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
 	if (txg > spa_freeze_txg(spa))
 		return;
 
 	/* Unknown top-level vdev, invalid DVA, or offset past the last ms. */
 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) ||
 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
 		zfs_panic_recover("metaslab_unalloc_dva(): bad DVA "
 		    "%llu:%llu:%llu",
 		    (u_longlong_t)vdev, (u_longlong_t)offset,
 		    (u_longlong_t)size);
 		return;
 	}
 
 	ASSERT(!vd->vdev_removing);
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
 	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
 
 	/* For a gang DVA the allocated size is that of the gang header. */
 	if (DVA_GET_GANG(dva))
 		size = vdev_gang_header_asize(vd);
 
 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	mutex_enter(&msp->ms_lock);
 	zfs_range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
 	    offset, size);
 	msp->ms_allocating_total -= size;
 
 	VERIFY(!msp->ms_condensing);
 	VERIFY3U(offset, >=, msp->ms_start);
 	VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
 	VERIFY3U(zfs_range_tree_space(msp->ms_allocatable) + size, <=,
 	    msp->ms_size);
 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 	zfs_range_tree_add(msp->ms_allocatable, offset, size);
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * Free the block described by the given DVA by handing its top-level vdev,
  * offset and size to metaslab_free_impl().  A gang DVA is freed with the
  * vdev's gang header size rather than the asize stored in the DVA.
  */
 void
 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
 {
 	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
 
 	ASSERT(DVA_IS_VALID(dva));
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
 	uint64_t asize = DVA_GET_GANG(dva) ?
 	    vdev_gang_header_asize(vd) : DVA_GET_ASIZE(dva);
 
 	metaslab_free_impl(vd, DVA_GET_OFFSET(dva), asize, checkpoint);
 }
 
 /*
  * Reserve allocation slots for an I/O.  This has to happen before calling
  * into the allocator.  When no slots are available the caller is told so
  * (B_FALSE) and the I/O is expected to be throttled until another I/O
  * completes and gives its slots back.  Returns B_TRUE when the reservation
  * was placed.
  */
 boolean_t
 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
     zio_t *zio, int flags)
 {
 	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
 	uint64_t limit = mca->mca_alloc_max_slots;
 
 	ASSERT(mc->mc_alloc_throttle_enabled);
 
 	/*
 	 * Gang allocations and METASLAB_MUST_RESERVE requests always get
 	 * their slots; everything else must fit under the limit.
 	 */
 	if (!GANG_ALLOCATION(flags) && !(flags & METASLAB_MUST_RESERVE) &&
 	    zfs_refcount_count(&mca->mca_alloc_slots) + slots > limit)
 		return (B_FALSE);
 
 	/*
 	 * _count() and _add() can race, but in most cases the allocator
 	 * lock covers that, and for GANG_ALLOCATION() or
 	 * METASLAB_MUST_RESERVE it does not matter.  Even in some other,
 	 * non-existing scenario the worst outcome is a few more I/Os
 	 * reaching allocation earlier, which is not a problem.
 	 *
 	 * Slots are reserved individually so that they can be unreserved
 	 * individually when an I/O completes.
 	 */
 	zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio);
 	zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
 
 	return (B_TRUE);
 }
 
 /*
  * Give back slots previously taken from the given allocator of the class
  * on behalf of zio.
  */
 void
 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
     int allocator, zio_t *zio)
 {
 	ASSERT(mc->mc_alloc_throttle_enabled);
 
 	zfs_refcount_remove_few(&mc->mc_allocator[allocator].mca_alloc_slots,
 	    slots, zio);
 }
 
 /*
  * Claim the range [offset, offset + size) on the given vdev in txg.
  *
  * With txg == 0 this is a dry run: the range is only checked and nothing is
  * modified.  Returns ENXIO when offset lies beyond the vdev's metaslabs,
  * ENOENT when the range is not entirely in the metaslab's allocatable tree,
  * an error from metaslab_activate() (other than EBUSY), or 0 on success.
  */
 static int
 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
     uint64_t txg)
 {
 	metaslab_t *msp;
 	spa_t *spa = vd->vdev_spa;
 	int error = 0;
 
 	if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
 		return (SET_ERROR(ENXIO));
 
 	ASSERT3P(vd->vdev_ms, !=, NULL);
 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	mutex_enter(&msp->ms_lock);
 
 	/*
 	 * Activate for a real claim on a writeable pool, or whenever the
 	 * metaslab is not loaded.  EBUSY is not treated as a failure; the
 	 * assertions below record what is expected to hold in that case.
 	 */
 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) {
 		error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
 		if (error == EBUSY) {
 			ASSERT(msp->ms_loaded);
 			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 			error = 0;
 		}
 	}
 
 	/* The whole range must currently be allocatable. */
 	if (error == 0 &&
 	    !zfs_range_tree_contains(msp->ms_allocatable, offset, size))
 		error = SET_ERROR(ENOENT);
 
 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
 		mutex_exit(&msp->ms_lock);
 		return (error);
 	}
 
 	VERIFY(!msp->ms_condensing);
 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 	VERIFY3U(zfs_range_tree_space(msp->ms_allocatable) - size, <=,
 	    msp->ms_size);
 	/* Take the range out of the allocatable and trim trees. */
 	zfs_range_tree_remove(msp->ms_allocatable, offset, size);
 	zfs_range_tree_clear(msp->ms_trim, offset, size);
 
 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(8) */
 		metaslab_class_t *mc = msp->ms_group->mg_class;
 		multilist_sublist_t *mls =
 		    multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
 		if (!multilist_link_active(&msp->ms_class_txg_node)) {
 			msp->ms_selected_txg = txg;
 			multilist_sublist_insert_head(mls, msp);
 		}
 		multilist_sublist_unlock(mls);
 
 		/* Dirty the metaslab on the first allocation in this txg. */
 		if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
 		zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK],
 		    offset, size);
 		msp->ms_allocating_total += size;
 	}
 
 	mutex_exit(&msp->ms_lock);
 
 	return (0);
 }
 
 /*
  * Argument passed to metaslab_claim_impl_cb() by metaslab_claim_impl().
  */
 typedef struct metaslab_claim_cb_arg_t {
 	/* txg forwarded to metaslab_claim_concrete() */
 	uint64_t	mcca_txg;
 	/* first error seen; once non-zero, later callbacks do nothing */
 	int		mcca_error;
 } metaslab_claim_cb_arg_t;
 
 /*
  * vdev_op_remap callback for metaslab_claim_impl(): claims the mapped range
  * and records the result, unless an earlier invocation already failed.
  */
 static void
 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	metaslab_claim_cb_arg_t *state = arg;
 
 	(void) inner_offset;
 
 	/* Keep the first error; don't claim anything after a failure. */
 	if (state->mcca_error != 0)
 		return;
 
 	state->mcca_error = metaslab_claim_concrete(vd, offset, size,
 	    state->mcca_txg);
 }
 
 /*
  * Claim [offset, offset + size) on vd in txg.  A vdev without a remap
  * operation is claimed directly.  Otherwise every range the vdev maps to
  * is claimed first and, if that succeeded, the range on vd itself.
  * Returns 0 or the first error encountered.
  */
 int
 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
 {
 	if (vd->vdev_ops->vdev_op_remap == NULL)
 		return (metaslab_claim_concrete(vd, offset, size, txg));
 
 	/*
 	 * Claiming on indirect vdevs is reserved for zdb(8), which uses it
 	 * to detect leaks of mapped space (space not accounted for in the
 	 * obsolete counts, spacemap, or bpobj).
 	 */
 	ASSERT(!spa_writeable(vd->vdev_spa));
 
 	metaslab_claim_cb_arg_t state;
 	state.mcca_txg = txg;
 	state.mcca_error = 0;
 
 	vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_claim_impl_cb,
 	    &state);
 
 	if (state.mcca_error != 0)
 		return (state.mcca_error);
 
 	state.mcca_error = metaslab_claim_concrete(vd, offset, size, txg);
 
 	return (state.mcca_error);
 }
 
 /*
  * Intent log support: when the pool is opened after a crash, the SPA has to
  * be told about blocks the intent log allocated for immediate write, which
  * the SPA still considers free because the last transaction group never
  * committed.
  *
  * Claims a single DVA in txg.  Returns ENXIO when the DVA's top-level vdev
  * can not be found, otherwise the result of metaslab_claim_impl().
  */
 static int
 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 {
 	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
 
 	if (vd == NULL)
 		return (SET_ERROR(ENXIO));
 
 	ASSERT(DVA_IS_VALID(dva));
 
 	/* A gang DVA is claimed with the vdev's gang header size. */
 	uint64_t asize = DVA_GET_GANG(dva) ?
 	    vdev_gang_header_asize(vd) : DVA_GET_ASIZE(dva);
 
 	return (metaslab_claim_impl(vd, DVA_GET_OFFSET(dva), asize, txg));
 }
 
 /*
  * Allocate ndvas DVAs of psize bytes for bp from the given class and stamp
  * bp with birth txg.  If any DVA can not be allocated, the DVAs obtained so
  * far are released again and the error is returned.  Returns ENOSPC when
  * the selected allocator of the class has no rotor, i.e. no vdevs.
  */
 int
 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
     zio_alloc_list_t *zal, zio_t *zio, int allocator)
 {
 	dva_t *dva = bp->blk_dva;
 	dva_t *hintdva = NULL;
 
 	if (hintbp != NULL)
 		hintdva = hintbp->blk_dva;
 
 	ASSERT0(BP_GET_LOGICAL_BIRTH(bp));
 	ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
 
 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 
 	/* no vdevs in this class */
 	if (mc->mc_allocator[allocator].mca_rotor == NULL) {
 		spa_config_exit(spa, SCL_ALLOC, FTAG);
 		return (SET_ERROR(ENOSPC));
 	}
 
 	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
 	ASSERT(BP_GET_NDVAS(bp) == 0);
 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
 	ASSERT3P(zal, !=, NULL);
 
 	for (int d = 0; d < ndvas; d++) {
 		int error = metaslab_alloc_dva(spa, mc, psize, dva, d,
 		    hintdva, txg, flags, zal, allocator);
 
 		if (error == 0) {
 			/*
 			 * Account for the new dva in the metaslab
 			 * group's queue depth.
 			 */
 			metaslab_group_alloc_increment(spa,
 			    DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
 			continue;
 		}
 
 		/* Roll back the DVAs allocated so far, newest first. */
 		while (--d >= 0) {
 			metaslab_unalloc_dva(spa, &dva[d], txg);
 			metaslab_group_alloc_decrement(spa,
 			    DVA_GET_VDEV(&dva[d]), zio, flags,
 			    allocator, B_FALSE);
 			memset(&dva[d], 0, sizeof (dva_t));
 		}
 		spa_config_exit(spa, SCL_ALLOC, FTAG);
 		return (error);
 	}
 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
 
 	spa_config_exit(spa, SCL_ALLOC, FTAG);
 
 	BP_SET_BIRTH(bp, txg, 0);
 
 	return (0);
 }
 
 /*
  * Release all DVAs of bp.  With now set, each allocation is undone in txg
  * via metaslab_unalloc_dva(); otherwise each DVA is freed through
  * metaslab_free_dva(), with the checkpoint flag computed below.
  */
 void
 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
 {
 	int dva_count = BP_GET_NDVAS(bp);
 	boolean_t checkpoint = B_FALSE;
 
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa));
 
 	/*
 	 * When the pool has a checkpoint, freed blocks that belong to the
 	 * checkpoint must not be reused until the checkpoint is discarded
 	 * or we rewind to it.
 	 *
 	 * The checkpoint flag travels down the metaslab_free code path and
 	 * is set whenever a block has to be added to the checkpoint's
 	 * accounting, i.e. for blocks that existed when the checkpoint was
 	 * taken and are therefore referenced by the checkpointed uberblock.
 	 *
 	 * Nothing is checkpointed while the syncing txg is still
 	 * <= spa_checkpoint_txg; those frees sync normally as they will be
 	 * referenced by the checkpointed uberblock.
 	 */
 	if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg &&
 	    spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
 		/*
 		 * A block that is part of the checkpoint can not have
 		 * been created in the current txg.
 		 */
 		ASSERT(!now);
 		ASSERT3U(spa_syncing_txg(spa), ==, txg);
 		checkpoint = B_TRUE;
 	}
 
 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
 
 	for (int i = 0; i < dva_count; i++) {
 		const dva_t *dva = &bp->blk_dva[i];
 
 		if (!now) {
 			ASSERT3U(txg, ==, spa_syncing_txg(spa));
 			metaslab_free_dva(spa, dva, checkpoint);
 		} else {
 			metaslab_unalloc_dva(spa, dva, txg);
 		}
 	}
 
 	spa_config_exit(spa, SCL_FREE, FTAG);
 }
 
 /*
  * Claim every DVA of bp in txg.  With txg == 0 nothing is modified and the
  * call only reports whether all DVAs could be claimed.  Returns 0 or the
  * first error encountered.
  */
 int
 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
 {
 	int dva_count = BP_GET_NDVAS(bp);
 	int error = 0;
 
 	ASSERT(!BP_IS_HOLE(bp));
 
 	/*
 	 * For a real claim, do a dry run first so that all DVAs are known
 	 * to be claimable and no partial failure has to be unwound below.
 	 */
 	if (txg != 0) {
 		error = metaslab_claim(spa, bp, 0);
 		if (error != 0)
 			return (error);
 	}
 
 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 
 	/* Stop at the first DVA that can not be claimed. */
 	for (int i = 0; error == 0 && i < dva_count; i++)
 		error = metaslab_claim_dva(spa, &bp->blk_dva[i], txg);
 
 	spa_config_exit(spa, SCL_ALLOC, FTAG);
 
 	ASSERT(error == 0 || txg == 0);
 
 	return (error);
 }
 
 /*
  * vdev_op_remap callback for metaslab_check_free_impl(): runs the check on
  * the mapped range unless the target vdev is itself an indirect vdev.
  */
 static void
 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	(void) inner, (void) arg;
 
 	if (vd->vdev_ops != &vdev_indirect_ops)
 		metaslab_check_free_impl(vd, offset, size);
 }
 
 /*
  * Verify that [offset, offset + size) on vd is not already free or on its
  * way to being freed.  Does nothing unless ZFS_DEBUG_ZIO_FREE is set in
  * zfs_flags.  For a vdev with a remap operation the check is forwarded to
  * the ranges it maps to.
  */
 static void
 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
 {
 	spa_t *spa __maybe_unused = vd->vdev_spa;
 
 	if (!(zfs_flags & ZFS_DEBUG_ZIO_FREE))
 		return;
 
 	if (vd->vdev_ops->vdev_op_remap != NULL) {
 		vd->vdev_ops->vdev_op_remap(vd, offset, size,
 		    metaslab_check_free_impl_cb, NULL);
 		return;
 	}
 
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
 	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	mutex_enter(&ms->ms_lock);
 
 	/* The allocatable tree is only checked when the ms is loaded. */
 	if (ms->ms_loaded)
 		zfs_range_tree_verify_not_present(ms->ms_allocatable, offset,
 		    size);
 
 	/*
 	 * Check every tree of the freeing pipeline.
 	 *
 	 * Checking the current allocating tree as well would seem natural,
 	 * since metaslab_unalloc_dva() exists for extents that are
 	 * allocated and freed in the same sync pass of the same txg.
 	 * However, some code (e.g. the ZIL) allocates a segment and then
 	 * frees part of it within the same txg [see zil_sync()], so the
 	 * current allocating tree is deliberately left out.
 	 */
 	zfs_range_tree_verify_not_present(ms->ms_freeing, offset, size);
 	zfs_range_tree_verify_not_present(ms->ms_checkpointing, offset, size);
 	zfs_range_tree_verify_not_present(ms->ms_freed, offset, size);
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		zfs_range_tree_verify_not_present(ms->ms_defer[t], offset,
 		    size);
 	}
 	zfs_range_tree_verify_not_present(ms->ms_trim, offset, size);
 
 	mutex_exit(&ms->ms_lock);
 }
 
 /*
  * Run metaslab_check_free_impl() on every DVA of bp.  Does nothing unless
  * ZFS_DEBUG_ZIO_FREE is set in zfs_flags.  A gang DVA is checked with the
  * vdev's gang header size instead of the asize stored in the DVA.
  */
 void
 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
 {
 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
 		return;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
 		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
 		vdev_t *vd = vdev_lookup_top(spa, vdev);
 		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
 		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
 
 		/* Check vd before its first use, not after. */
 		ASSERT3P(vd, !=, NULL);
 
 		if (DVA_GET_GANG(&bp->blk_dva[i]))
 			size = vdev_gang_header_asize(vd);
 
 		metaslab_check_free_impl(vd, offset, size);
 	}
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 }
 
 /*
  * Block on mg_ms_disabled_cv until mg_disabled_updating is clear.  The
  * caller must hold mg_ms_disabled_lock.
  */
 static void
 metaslab_group_disable_wait(metaslab_group_t *mg)
 {
 	ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
 
 	while (mg->mg_disabled_updating)
 		cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
 }
 
 /*
  * Count one more disabled metaslab in the group, first waiting on
  * mg_ms_disabled_cv for as long as the group is at max_disabled_ms.  The
  * caller must hold mg_ms_disabled_lock and have set mg_disabled_updating.
  */
 static void
 metaslab_group_disabled_increment(metaslab_group_t *mg)
 {
 	ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
 	ASSERT(mg->mg_disabled_updating);
 
 	while (mg->mg_ms_disabled >= max_disabled_ms)
 		cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
 
 	mg->mg_ms_disabled += 1;
 	ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
 }
 
 /*
  * Mark the metaslab as disabled to prevent any allocations on this metaslab.
  * We must also track how many metaslabs are currently disabled within a
  * metaslab group and limit them to prevent allocation failures from
  * occurring because all metaslabs are disabled.
  *
  * The caller must not hold msp->ms_lock; it is taken here, nested inside
  * the group's mg_ms_disabled_lock.
  */
 void
 metaslab_disable(metaslab_t *msp)
 {
 	ASSERT(!MUTEX_HELD(&msp->ms_lock));
 	metaslab_group_t *mg = msp->ms_group;
 
 	mutex_enter(&mg->mg_ms_disabled_lock);
 
 	/*
 	 * To keep an accurate count of how many threads have disabled
 	 * a specific metaslab group, we only allow one thread to mark
 	 * the metaslab group at a time. This ensures that the value of
 	 * ms_disabled will be accurate when we decide to mark a metaslab
 	 * group as disabled. To do this we force all other threads
 	 * to wait till the metaslab group's mg_disabled_updating flag is
 	 * no longer set.
 	 */
 	metaslab_group_disable_wait(mg);
 	mg->mg_disabled_updating = B_TRUE;
 	/* Only the first disable of this metaslab counts against the group. */
 	if (msp->ms_disabled == 0) {
 		metaslab_group_disabled_increment(mg);
 	}
 	mutex_enter(&msp->ms_lock);
 	msp->ms_disabled++;
 	mutex_exit(&msp->ms_lock);
 
 	/* Done updating; wake every thread waiting on the group's cv. */
 	mg->mg_disabled_updating = B_FALSE;
 	cv_broadcast(&mg->mg_ms_disabled_cv);
 	mutex_exit(&mg->mg_ms_disabled_lock);
 }
 
 /*
  * Drop one disable reference from the metaslab.  When the last reference
  * goes away the group's disabled count is decremented, waiters on
  * mg_ms_disabled_cv are woken and, if unload is set, the metaslab is
  * unloaded.  With sync set, the call first waits for a txg sync.
  */
 void
 metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	spa_t *spa = mg->mg_vd->vdev_spa;
 
 	/*
 	 * Wait for the outstanding IO to be synced to prevent newly
 	 * allocated blocks from being overwritten.  This is used by
 	 * initialize and TRIM, which modify unallocated space.
 	 */
 	if (sync)
 		txg_wait_synced(spa_get_dsl(spa), 0);
 
 	mutex_enter(&mg->mg_ms_disabled_lock);
 	mutex_enter(&msp->ms_lock);
 
 	msp->ms_disabled--;
 	if (msp->ms_disabled == 0) {
 		mg->mg_ms_disabled--;
 		cv_broadcast(&mg->mg_ms_disabled_cv);
 		if (unload)
 			metaslab_unload(msp);
 	}
 
 	mutex_exit(&msp->ms_lock);
 	mutex_exit(&mg->mg_ms_disabled_lock);
 }
 
 /*
  * Set the metaslab's in-memory ms_unflushed_dirty flag.
  */
 void
 metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty)
 {
 	ms->ms_unflushed_dirty = dirty;
 }
 
 /*
  * Write the metaslab's unflushed txg into the per-vdev object referenced by
  * the VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS entry of the vdev's top-level
  * ZAP, creating that object (and the ZAP entry) if it does not exist yet.
  * Entries are fixed-size and indexed by metaslab id.
  */
 static void
 metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
 {
 	vdev_t *vd = ms->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa_meta_objset(spa);
 	uint64_t obj = 0;
 
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 	metaslab_unflushed_phys_t phys = {
 		.msp_unflushed_txg = metaslab_unflushed_txg(ms),
 	};
 
 	int err = zap_lookup(mos, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &obj);
 	if (err != ENOENT) {
 		/* any failure other than "not there yet" is fatal */
 		VERIFY0(err);
 	} else {
 		obj = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA,
 		    SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
 		VERIFY0(zap_add(mos, vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
 		    &obj, tx));
 	}
 
 	dmu_write(mos, obj, ms->ms_id * sizeof (phys), sizeof (phys),
 	    &phys, tx);
 }
 
 /*
  * Record txg as the metaslab's unflushed txg in memory, then persist it
  * through metaslab_update_ondisk_flush_data() as part of tx.
  */
 void
 metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
 {
 	ms->ms_unflushed_txg = txg;
 	metaslab_update_ondisk_flush_data(ms, tx);
 }
 
 /*
  * Return the metaslab's in-memory ms_unflushed_dirty flag.
  */
 boolean_t
 metaslab_unflushed_dirty(metaslab_t *ms)
 {
 	return (ms->ms_unflushed_dirty);
 }
 
 /*
  * Return the metaslab's in-memory ms_unflushed_txg value.
  */
 uint64_t
 metaslab_unflushed_txg(metaslab_t *ms)
 {
 	return (ms->ms_unflushed_txg);
 }
 
 /*
  * Tunable declarations.  Each entry names a scope, a variable-name prefix,
  * the tunable itself, its type, an access mode and a description string.
  * NOTE(review): the exact registration semantics come from the
  * ZFS_MODULE_PARAM / ZFS_MODULE_PARAM_CALL macros, which are defined
  * elsewhere -- confirm there before relying on the field meanings above.
  */
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, U64, ZMOD_RW,
 	"Allocation granularity (a.k.a. stripe size)");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW,
 	"Load all metaslabs when pool is first opened");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW,
 	"Prevent metaslabs from being unloaded");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW,
 	"Preload potential metaslabs during reassessment");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_limit, UINT, ZMOD_RW,
 	"Max number of metaslabs per group to preload");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW,
 	"Delay in txgs after metaslab was last used before unloading");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, UINT, ZMOD_RW,
 	"Delay in milliseconds after metaslab was last used before unloading");
 
 ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, UINT, ZMOD_RW,
 	"Percentage of metaslab group size that should be free to make it "
 	"eligible for allocation");
 
 ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, UINT, ZMOD_RW,
 	"Percentage of metaslab group size that should be considered eligible "
 	"for allocations unless all metaslab groups within the metaslab class "
 	"have also crossed this threshold");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT,
 	ZMOD_RW,
 	"Use the fragmentation metric to prefer less fragmented metaslabs");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, UINT,
 	ZMOD_RW, "Fragmentation for metaslab to allow allocation");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW,
 	"Prefer metaslabs with lower LBAs");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW,
 	"Enable metaslab group biasing");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT,
 	ZMOD_RW, "Enable segment-based metaslab selection");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW,
 	"Segment-based metaslab selection maximum buckets before switching");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW,
 	"Blocks larger than this size are sometimes forced to be gang blocks");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW,
 	"Percentage of large blocks that will be forced to be gang blocks");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW,
 	"Max distance (bytes) to search forward before using size tree");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW,
 	"When looking in size tree, use largest segment instead of exact fit");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, U64,
 	ZMOD_RW, "How long to trust the cached max chunk size of a metaslab");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, UINT, ZMOD_RW,
 	"Percentage of memory that can be used to store metaslab range trees");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT,
 	ZMOD_RW, "Try hard to allocate before ganging");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW,
 	"Normally only consider this many of the best metaslabs in each vdev");
 
 ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_allocator,
 	param_set_active_allocator, param_get_charp, ZMOD_RW,
 	"SPA active allocator");
diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c
index 3cbd5712e1d3..8bb9a0724e61 100644
--- a/module/zfs/range_tree.c
+++ b/module/zfs/range_tree.c
@@ -1,875 +1,875 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /*
  * Copyright (c) 2013, 2019 by Delphix. All rights reserved.
  * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/dnode.h>
 #include <sys/zio.h>
 #include <sys/range_tree.h>
 
 /*
  * Range trees are tree-based data structures that can be used to
  * track free space or generally any space allocation information.
  * A range tree keeps track of individual segments and automatically
  * provides facilities such as adjacent extent merging and extent
  * splitting in response to range add/remove requests.
  *
  * A range tree starts out completely empty, with no segments in it.
  * Adding an allocation via zfs_range_tree_add to the range tree can either:
  * 1) create a new extent
  * 2) extend an adjacent extent
  * 3) merge two adjacent extents
  * Conversely, removing an allocation via zfs_range_tree_remove can:
  * 1) completely remove an extent
  * 2) shorten an extent (if the allocation was near one of its ends)
  * 3) split an extent into two extents, in effect punching a hole
  *
  * A range tree is also capable of 'bridging' gaps when adding
  * allocations. This is useful for cases when close proximity of
  * allocations is an important detail that needs to be represented
  * in the range tree. See zfs_range_tree_set_gap(). The default behavior
  * is not to bridge gaps (i.e. the maximum allowed gap size is 0).
  *
  * In order to traverse a range tree, use either the zfs_range_tree_walk()
  * or zfs_range_tree_vacate() functions.
  *
  * To obtain more accurate information on individual segment
  * operations that the range tree performs "under the hood", you can
  * specify a set of callbacks by passing a zfs_range_tree_ops_t structure
  * to the zfs_range_tree_create function. Any callbacks that are non-NULL
  * are then called at the appropriate times.
  *
  * The range tree code also supports a special variant of range trees
  * that can bridge small gaps between segments. This kind of tree is used
  * by the dsl scanning code to group I/Os into mostly sequential chunks to
  * optimize disk performance. The code here attempts to do this with as
  * little memory and computational overhead as possible. One limitation of
  * this implementation is that segments of range trees with gaps can only
  * support removing complete segments.
  */
 
 /*
  * Copy one segment from src to dest.  The number of bytes copied is the
  * size of the segment structure selected by rt->rt_type.
  */
 static inline void
 zfs_rs_copy(zfs_range_seg_t *src, zfs_range_seg_t *dest, zfs_range_tree_t *rt)
 {
 	ASSERT3U(rt->rt_type, <, ZFS_RANGE_SEG_NUM_TYPES);
 	size_t size = 0;
 	switch (rt->rt_type) {
 	case ZFS_RANGE_SEG32:
-		size = sizeof (range_seg32_t);
+		size = sizeof (zfs_range_seg32_t);
 		break;
 	case ZFS_RANGE_SEG64:
-		size = sizeof (range_seg64_t);
+		size = sizeof (zfs_range_seg64_t);
 		break;
 	case ZFS_RANGE_SEG_GAP:
-		size = sizeof (range_seg_gap_t);
+		size = sizeof (zfs_range_seg_gap_t);
 		break;
 	default:
 		/* the assertion above rules out any other type */
 		__builtin_unreachable();
 	}
 	memcpy(dest, src, size);
 }
 
 /*
  * Recompute the segment-size histogram by walking every segment in the
  * tree and verify that it matches rt->rt_histogram.  Each mismatching
  * bucket is logged with zfs_dbgmsg() before the VERIFY fires.
  */
 void
 zfs_range_tree_stat_verify(zfs_range_tree_t *rt)
 {
 	zfs_range_seg_t *rs;
 	zfs_btree_index_t where;
-	uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
+	uint64_t hist[ZFS_RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
 	int i;
 
 	for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL;
 	    rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
 		uint64_t size = zfs_rs_get_end(rs, rt) -
 		    zfs_rs_get_start(rs, rt);
 		/*
 		 * NOTE(review): unlike zfs_range_tree_stat_incr(), size is
 		 * not asserted to be non-zero here; presumably the tree
 		 * never holds empty segments -- confirm, since idx is used
 		 * as an array index below.
 		 */
 		int idx	= highbit64(size) - 1;
 
 		hist[idx]++;
 		ASSERT3U(hist[idx], !=, 0);
 	}
 
-	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+	for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) {
 		if (hist[i] != rt->rt_histogram[i]) {
 			zfs_dbgmsg("i=%d, hist=%px, hist=%llu, rt_hist=%llu",
 			    i, hist, (u_longlong_t)hist[i],
 			    (u_longlong_t)rt->rt_histogram[i]);
 		}
 		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
 	}
 }
 
 /*
  * Account for segment rs in the tree's histogram: the bucket is chosen by
  * highbit64() of the segment's length, minus one.
  */
 static void
 zfs_range_tree_stat_incr(zfs_range_tree_t *rt, zfs_range_seg_t *rs)
 {
 	uint64_t seg_len = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
 	int bucket = highbit64(seg_len) - 1;
 
 	ASSERT(seg_len != 0);
 	ASSERT3U(bucket, <,
 	    sizeof (rt->rt_histogram) / sizeof (rt->rt_histogram[0]));
 
 	rt->rt_histogram[bucket] += 1;
 	/* the counter must not have wrapped around to zero */
 	ASSERT3U(rt->rt_histogram[bucket], !=, 0);
 }
 
 /*
  * Remove segment rs from the tree's histogram: the bucket is chosen by
  * highbit64() of the segment's length, minus one.
  */
 static void
 zfs_range_tree_stat_decr(zfs_range_tree_t *rt, zfs_range_seg_t *rs)
 {
 	uint64_t seg_len = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
 	int bucket = highbit64(seg_len) - 1;
 
 	ASSERT(seg_len != 0);
 	ASSERT3U(bucket, <,
 	    sizeof (rt->rt_histogram) / sizeof (rt->rt_histogram[0]));
 
 	/* there has to be something in the bucket to take away */
 	ASSERT3U(rt->rt_histogram[bucket], !=, 0);
 	rt->rt_histogram[bucket] -= 1;
 }
 
 /*
  * Ordering function for 32-bit segments.  Returns -1 when r1 ends at or
  * before the start of r2, 1 when r1 starts at or after the end of r2, and
  * 0 otherwise.
  */
 __attribute__((always_inline)) inline
 static int
 zfs_range_tree_seg32_compare(const void *x1, const void *x2)
 {
-	const range_seg32_t *r1 = x1;
-	const range_seg32_t *r2 = x2;
+	const zfs_range_seg32_t *r1 = x1;
+	const zfs_range_seg32_t *r2 = x2;
 
 	ASSERT3U(r1->rs_start, <=, r1->rs_end);
 	ASSERT3U(r2->rs_start, <=, r2->rs_end);
 
 	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
 }
 
 /*
  * Ordering function for 64-bit segments.  Returns -1 when r1 ends at or
  * before the start of r2, 1 when r1 starts at or after the end of r2, and
  * 0 otherwise.
  */
 __attribute__((always_inline)) inline
 static int
 zfs_range_tree_seg64_compare(const void *x1, const void *x2)
 {
-	const range_seg64_t *r1 = x1;
-	const range_seg64_t *r2 = x2;
+	const zfs_range_seg64_t *r1 = x1;
+	const zfs_range_seg64_t *r2 = x2;
 
 	ASSERT3U(r1->rs_start, <=, r1->rs_end);
 	ASSERT3U(r2->rs_start, <=, r2->rs_end);
 
 	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
 }
 
 /*
  * Ordering function for gap segments.  Returns -1 when r1 ends at or
  * before the start of r2, 1 when r1 starts at or after the end of r2, and
  * 0 otherwise.
  */
 __attribute__((always_inline)) inline
 static int
 zfs_range_tree_seg_gap_compare(const void *x1, const void *x2)
 {
-	const range_seg_gap_t *r1 = x1;
-	const range_seg_gap_t *r2 = x2;
+	const zfs_range_seg_gap_t *r1 = x1;
+	const zfs_range_seg_gap_t *r2 = x2;
 
 	ASSERT3U(r1->rs_start, <=, r1->rs_end);
 	ASSERT3U(r2->rs_start, <=, r2->rs_end);
 
 	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
 }
 
 /*
  * One find-in-buffer function per segment type, each paired with the
  * matching compare function above.  zfs_range_tree_create_gap() passes
  * these to zfs_btree_create().
  */
-ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg32_find_in_buf, range_seg32_t,
+ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg32_find_in_buf, zfs_range_seg32_t,
     zfs_range_tree_seg32_compare)
 
-ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, range_seg64_t,
+ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, zfs_range_seg64_t,
     zfs_range_tree_seg64_compare)
 
-ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf, range_seg_gap_t,
-    zfs_range_tree_seg_gap_compare)
+ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf,
+    zfs_range_seg_gap_t, zfs_range_tree_seg_gap_compare)
 
 /*
  * Allocate and initialize an empty range tree.
  *
  * ops   - optional callback table; its rtop_create hook, if set, is called
  *         with the new tree and arg before returning
  * type  - segment representation; selects the element size, compare
  *         function and find-in-buffer function of the underlying btree
  * arg   - opaque pointer stored in the tree and passed to the callbacks
  * start, shift, gap - stored in the tree as rt_start, rt_shift and rt_gap
  *
  * Panics on an unknown segment type.
  */
 zfs_range_tree_t *
 zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
     zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
     uint64_t gap)
 {
 	zfs_range_tree_t *rt = kmem_zalloc(sizeof (zfs_range_tree_t), KM_SLEEP);
 
 	ASSERT3U(shift, <, 64);
 	/* valid types are strictly below the NUM_TYPES sentinel */
 	ASSERT3U(type, <, ZFS_RANGE_SEG_NUM_TYPES);
 	size_t size;
 	int (*compare) (const void *, const void *);
 	bt_find_in_buf_f bt_find;
 	switch (type) {
 	case ZFS_RANGE_SEG32:
-		size = sizeof (range_seg32_t);
+		size = sizeof (zfs_range_seg32_t);
 		compare = zfs_range_tree_seg32_compare;
 		bt_find = zfs_range_tree_seg32_find_in_buf;
 		break;
 	case ZFS_RANGE_SEG64:
-		size = sizeof (range_seg64_t);
+		size = sizeof (zfs_range_seg64_t);
 		compare = zfs_range_tree_seg64_compare;
 		bt_find = zfs_range_tree_seg64_find_in_buf;
 		break;
 	case ZFS_RANGE_SEG_GAP:
-		size = sizeof (range_seg_gap_t);
+		size = sizeof (zfs_range_seg_gap_t);
 		compare = zfs_range_tree_seg_gap_compare;
 		bt_find = zfs_range_tree_seg_gap_find_in_buf;
 		break;
 	default:
 		panic("Invalid range seg type %d", type);
 	}
 	zfs_btree_create(&rt->rt_root, compare, bt_find, size);
 
 	rt->rt_ops = ops;
 	rt->rt_gap = gap;
 	rt->rt_arg = arg;
 	rt->rt_type = type;
 	rt->rt_start = start;
 	rt->rt_shift = shift;
 
 	/* Give the owner a chance to set up its own per-tree state. */
 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
 		rt->rt_ops->rtop_create(rt, rt->rt_arg);
 
 	return (rt);
 }
 
 /*
  * Create a range tree with a gap of 0; all other arguments are forwarded
  * unchanged to zfs_range_tree_create_gap().
  */
 zfs_range_tree_t *
 zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
     zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift)
 {
 	return (zfs_range_tree_create_gap(ops, type, arg, start, shift, 0));
 }
 
 /*
  * Tear down a range tree.  The tree must hold no space (rt_space is
  * VERIFY'd to be 0).  rtop_destroy, if provided, runs before the btree is
  * destroyed and the tree structure itself is freed.
  */
 void
 zfs_range_tree_destroy(zfs_range_tree_t *rt)
 {
 	VERIFY0(rt->rt_space);
 
 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
 		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
 
 	zfs_btree_destroy(&rt->rt_root);
 	kmem_free(rt, sizeof (*rt));
 }
 
 /*
  * Change the fill of segment 'rs' by 'delta', which may be negative.
  *
  * zfs_panic_recover() is called when the change would bring the fill to or
  * below zero, or above the segment's length (end - start).  There is no
  * early return after either report, so if zfs_panic_recover() returns the
  * adjustment is still applied.  The update is bracketed by the rtop_remove
  * and rtop_add callbacks when they are provided.
  */
 void
 zfs_range_tree_adjust_fill(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
     int64_t delta)
 {
 	if (delta < 0 && delta * -1 >= zfs_rs_get_fill(rs, rt)) {
 		zfs_panic_recover("zfs: attempting to decrease fill to or "
 		    "below 0; probable double remove in segment [%llx:%llx]",
 		    (longlong_t)zfs_rs_get_start(rs, rt),
 		    (longlong_t)zfs_rs_get_end(rs, rt));
 	}
 	if (zfs_rs_get_fill(rs, rt) + delta > zfs_rs_get_end(rs, rt) -
 	    zfs_rs_get_start(rs, rt)) {
 		zfs_panic_recover("zfs: attempting to increase fill beyond "
 		    "max; probable double add in segment [%llx:%llx]",
 		    (longlong_t)zfs_rs_get_start(rs, rt),
 		    (longlong_t)zfs_rs_get_end(rs, rt));
 	}
 
 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
 		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
 	zfs_rs_set_fill(rs, rt, zfs_rs_get_fill(rs, rt) + delta);
 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
 		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
 }
 
 /*
  * Insert the range [start, start + size) carrying 'fill' into the range tree
  * passed as 'arg'.
  *
  * If the range overlaps an existing segment, a tree without gap support
  * reports it through zfs_panic_recover() and gives up; a gap tree either
  * bumps the fill of a segment that already covers the range, or removes that
  * segment and re-adds the union of both.  Otherwise the range is merged with
  * the previous and/or next segment when they lie within rt_gap of it
  * (touching, when the gap is 0), or inserted as a new segment.  rt_space
  * grows by 'size' plus any gap that was bridged by a merge.
  */
 static void
 zfs_range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
 {
 	zfs_range_tree_t *rt = arg;
 	zfs_btree_index_t where;
 	zfs_range_seg_t *rs_before, *rs_after, *rs;
-	range_seg_max_t tmp, rsearch;
+	zfs_range_seg_max_t tmp, rsearch;
 	uint64_t end = start + size, gap = rt->rt_gap;
 	uint64_t bridge_size = 0;
 	boolean_t merge_before, merge_after;
 
 	ASSERT3U(size, !=, 0);
 	ASSERT3U(fill, <=, size);
 	ASSERT3U(start + size, >, start);
 
 	zfs_rs_set_start(&rsearch, rt, start);
 	zfs_rs_set_end(&rsearch, rt, end);
 	rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
 
 	/*
 	 * If this is a gap-supporting range tree, it is possible that we
 	 * are inserting into an existing segment. In this case simply
 	 * bump the fill count and call the remove / add callbacks. If the
 	 * new range will extend an existing segment, we remove the
 	 * existing one, apply the new extent to it and re-insert it using
 	 * the normal code paths.
 	 */
 	if (rs != NULL) {
 		if (gap == 0) {
 			zfs_panic_recover("zfs: adding existent segment to "
 			    "range tree (offset=%llx size=%llx)",
 			    (longlong_t)start, (longlong_t)size);
 			return;
 		}
 		uint64_t rstart = zfs_rs_get_start(rs, rt);
 		uint64_t rend = zfs_rs_get_end(rs, rt);
 		if (rstart <= start && rend >= end) {
 			zfs_range_tree_adjust_fill(rt, rs, fill);
 			return;
 		}
 
 		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
 			rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
 
 		zfs_range_tree_stat_decr(rt, rs);
 		rt->rt_space -= rend - rstart;
 
 		fill += zfs_rs_get_fill(rs, rt);
 		start = MIN(start, rstart);
 		end = MAX(end, rend);
 		size = end - start;
 
 		zfs_btree_remove(&rt->rt_root, rs);
 		zfs_range_tree_add_impl(rt, start, size, fill);
 		return;
 	}
 
 	ASSERT3P(rs, ==, NULL);
 
 	/*
 	 * Determine whether or not we will have to merge with our neighbors.
 	 * If gap != 0, we might need to merge with our neighbors even if we
 	 * aren't directly touching.
 	 *
 	 * NOTE(review): start - gap and end + gap are unsigned arithmetic;
 	 * confirm that gap trees are never used with start < gap or with
 	 * end + gap overflowing.
 	 */
 	zfs_btree_index_t where_before, where_after;
 	rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before);
 	rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after);
 
 	merge_before = (rs_before != NULL && zfs_rs_get_end(rs_before, rt) >=
 	    start - gap);
 	merge_after = (rs_after != NULL && zfs_rs_get_start(rs_after, rt) <=
 	    end + gap);
 
 	/* Space bridged to a merged neighbor is added to rt_space below. */
 	if (merge_before && gap != 0)
 		bridge_size += start - zfs_rs_get_end(rs_before, rt);
 	if (merge_after && gap != 0)
 		bridge_size += zfs_rs_get_start(rs_after, rt) - end;
 
 	if (merge_before && merge_after) {
 		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
 			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
 			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
 		}
 
 		zfs_range_tree_stat_decr(rt, rs_before);
 		zfs_range_tree_stat_decr(rt, rs_after);
 
 		/*
 		 * rs_after survives: rs_before is removed from the btree and
 		 * rs_after is stretched back to rs_before's start.
 		 */
 		zfs_rs_copy(rs_after, &tmp, rt);
 		uint64_t before_start = zfs_rs_get_start_raw(rs_before, rt);
 		uint64_t before_fill = zfs_rs_get_fill(rs_before, rt);
 		uint64_t after_fill = zfs_rs_get_fill(rs_after, rt);
 		zfs_btree_remove_idx(&rt->rt_root, &where_before);
 
 		/*
 		 * We have to re-find the node because our old reference is
 		 * invalid as soon as we do any mutating btree operations.
 		 */
 		rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after);
 		ASSERT3P(rs_after, !=, NULL);
 		zfs_rs_set_start_raw(rs_after, rt, before_start);
 		zfs_rs_set_fill(rs_after, rt, after_fill + before_fill + fill);
 		rs = rs_after;
 	} else if (merge_before) {
 		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
 			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
 
 		zfs_range_tree_stat_decr(rt, rs_before);
 
 		uint64_t before_fill = zfs_rs_get_fill(rs_before, rt);
 		zfs_rs_set_end(rs_before, rt, end);
 		zfs_rs_set_fill(rs_before, rt, before_fill + fill);
 		rs = rs_before;
 	} else if (merge_after) {
 		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
 			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
 
 		zfs_range_tree_stat_decr(rt, rs_after);
 
 		uint64_t after_fill = zfs_rs_get_fill(rs_after, rt);
 		zfs_rs_set_start(rs_after, rt, start);
 		zfs_rs_set_fill(rs_after, rt, after_fill + fill);
 		rs = rs_after;
 	} else {
 		/* No neighbor to merge with: insert a brand-new segment. */
 		rs = &tmp;
 
 		zfs_rs_set_start(rs, rt, start);
 		zfs_rs_set_end(rs, rt, end);
 		zfs_rs_set_fill(rs, rt, fill);
 		zfs_btree_add_idx(&rt->rt_root, rs, &where);
 	}
 
 	if (gap != 0) {
 		ASSERT3U(zfs_rs_get_fill(rs, rt), <=, zfs_rs_get_end(rs, rt) -
 		    zfs_rs_get_start(rs, rt));
 	} else {
 		ASSERT3U(zfs_rs_get_fill(rs, rt), ==, zfs_rs_get_end(rs, rt) -
 		    zfs_rs_get_start(rs, rt));
 	}
 
 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
 		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
 
 	zfs_range_tree_stat_incr(rt, rs);
 	rt->rt_space += size + bridge_size;
 }
 
 /*
  * Add the fully-filled range [start, start + size) to the range tree passed
  * as 'arg' (the fill handed to zfs_range_tree_add_impl() equals 'size').
  */
 void
 zfs_range_tree_add(void *arg, uint64_t start, uint64_t size)
 {
 	zfs_range_tree_add_impl(arg, start, size, size);
 }
 
 /*
  * Remove [start, start + size) from rt.
  *
  * The range must lie within a single existing segment: if nothing overlaps
  * it, zfs_panic_recover() is called and nothing is removed; partial coverage
  * trips the VERIFYs below.
  *
  * On a gap tree (rt_gap != 0) only whole segments are removed.  With do_fill
  * set, 'size' is treated as an amount of fill: if it equals the segment's
  * fill the whole segment is removed, otherwise only the fill is reduced.
  * Without do_fill the range must match the segment exactly.
  *
  * On other trees the segment may be trimmed on either side or split in two.
  */
 static void
 zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
     boolean_t do_fill)
 {
 	zfs_btree_index_t where;
 	zfs_range_seg_t *rs;
-	range_seg_max_t rsearch, rs_tmp;
+	zfs_range_seg_max_t rsearch, rs_tmp;
 	uint64_t end = start + size;
 	boolean_t left_over, right_over;
 
 	VERIFY3U(size, !=, 0);
 	VERIFY3U(size, <=, rt->rt_space);
 	if (rt->rt_type == ZFS_RANGE_SEG64)
 		ASSERT3U(start + size, >, start);
 
 	zfs_rs_set_start(&rsearch, rt, start);
 	zfs_rs_set_end(&rsearch, rt, end);
 	rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
 
 	/* Make sure we completely overlap with someone */
 	if (rs == NULL) {
 		zfs_panic_recover("zfs: removing nonexistent segment from "
 		    "range tree (offset=%llx size=%llx)",
 		    (longlong_t)start, (longlong_t)size);
 		return;
 	}
 
 	/*
 	 * Range trees with gap support must only remove complete segments
 	 * from the tree. This allows us to maintain accurate fill accounting
 	 * and to ensure that bridged sections are not leaked. If we need to
 	 * remove less than the full segment, we can only adjust the fill count.
 	 */
 	if (rt->rt_gap != 0) {
 		if (do_fill) {
 			if (zfs_rs_get_fill(rs, rt) == size) {
 				start = zfs_rs_get_start(rs, rt);
 				end = zfs_rs_get_end(rs, rt);
 				size = end - start;
 			} else {
 				zfs_range_tree_adjust_fill(rt, rs, -size);
 				return;
 			}
 		} else if (zfs_rs_get_start(rs, rt) != start ||
 		    zfs_rs_get_end(rs, rt) != end) {
 			zfs_panic_recover("zfs: freeing partial segment of "
 			    "gap tree (offset=%llx size=%llx) of "
 			    "(offset=%llx size=%llx)",
 			    (longlong_t)start, (longlong_t)size,
 			    (longlong_t)zfs_rs_get_start(rs, rt),
 			    (longlong_t)zfs_rs_get_end(rs, rt) -
 			    zfs_rs_get_start(rs, rt));
 			return;
 		}
 	}
 
 	VERIFY3U(zfs_rs_get_start(rs, rt), <=, start);
 	VERIFY3U(zfs_rs_get_end(rs, rt), >=, end);
 
 	/* Which side(s) of the segment survive the removal. */
 	left_over = (zfs_rs_get_start(rs, rt) != start);
 	right_over = (zfs_rs_get_end(rs, rt) != end);
 
 	zfs_range_tree_stat_decr(rt, rs);
 
 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
 		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
 
 	if (left_over && right_over) {
 		/*
 		 * Split: rs keeps the left part, newseg becomes the right
 		 * part and is inserted after it.
 		 */
-		range_seg_max_t newseg;
+		zfs_range_seg_max_t newseg;
 		zfs_rs_set_start(&newseg, rt, end);
 		zfs_rs_set_end_raw(&newseg, rt, zfs_rs_get_end_raw(rs, rt));
 		zfs_rs_set_fill(&newseg, rt, zfs_rs_get_end(rs, rt) - end);
 		zfs_range_tree_stat_incr(rt, &newseg);
 
 		// This modifies the buffer already inside the range tree
 		zfs_rs_set_end(rs, rt, start);
 
 		zfs_rs_copy(rs, &rs_tmp, rt);
 		if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL)
 			zfs_btree_add_idx(&rt->rt_root, &newseg, &where);
 		else
 			zfs_btree_add(&rt->rt_root, &newseg);
 
 		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
 			rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg);
 	} else if (left_over) {
 		// This modifies the buffer already inside the range tree
 		zfs_rs_set_end(rs, rt, start);
 		zfs_rs_copy(rs, &rs_tmp, rt);
 	} else if (right_over) {
 		// This modifies the buffer already inside the range tree
 		zfs_rs_set_start(rs, rt, end);
 		zfs_rs_copy(rs, &rs_tmp, rt);
 	} else {
 		/* The whole segment is removed. */
 		zfs_btree_remove_idx(&rt->rt_root, &where);
 		rs = NULL;
 	}
 
 	if (rs != NULL) {
 		/*
 		 * The fill of the leftover segment will always be equal to
 		 * the size, since we do not support removing partial segments
 		 * of range trees with gaps.
 		 *
 		 * NOTE(review): the doubled zfs_zfs_ prefix on the setter
 		 * below must match its declaration, which is not visible
 		 * here -- confirm.
 		 */
 		zfs_zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) -
 		    zfs_rs_get_start_raw(rs, rt));
 		zfs_range_tree_stat_incr(rt, &rs_tmp);
 
 		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
 			rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg);
 	}
 
 	rt->rt_space -= size;
 }
 
 /*
  * Remove [start, start + size) from the range tree passed as 'arg', without
  * fill-based semantics (do_fill is B_FALSE).
  */
 void
 zfs_range_tree_remove(void *arg, uint64_t start, uint64_t size)
 {
 	zfs_range_tree_remove_impl(arg, start, size, B_FALSE);
 }
 
 /*
  * Remove [start, start + size) from rt with do_fill set to B_TRUE; see
  * zfs_range_tree_remove_impl() for how this differs on gap trees.
  */
 void
 zfs_range_tree_remove_fill(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
 {
 	zfs_range_tree_remove_impl(rt, start, size, B_TRUE);
 }
 
 /*
  * Rewrite segment 'rs' in place so that it covers
  * [newstart, newstart + newsize).  The stat counters and the
  * rtop_remove / rtop_add callbacks are updated around the change, and
  * rt_space is adjusted by the difference between the new and old sizes.
  * The segment's fill is not modified here.
  */
 void
 zfs_range_tree_resize_segment(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
     uint64_t newstart, uint64_t newsize)
 {
 	int64_t delta = newsize - (zfs_rs_get_end(rs, rt) -
 	    zfs_rs_get_start(rs, rt));
 
 	zfs_range_tree_stat_decr(rt, rs);
 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
 		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
 
 	zfs_rs_set_start(rs, rt, newstart);
 	zfs_rs_set_end(rs, rt, newstart + newsize);
 
 	zfs_range_tree_stat_incr(rt, rs);
 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
 		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
 
 	rt->rt_space += delta;
 }
 
 /*
  * Look up a segment that compares equal to [start, start + size) in the
  * btree; per the segment comparators that means one overlapping the range,
  * not necessarily containing it.  'size' must be nonzero.  Returns the
  * result of zfs_btree_find() (NULL is treated as "not found" by callers).
  */
 static zfs_range_seg_t *
 zfs_range_tree_find_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
 {
-	range_seg_max_t rsearch;
+	zfs_range_seg_max_t rsearch;
 	uint64_t end = start + size;
 
 	VERIFY(size != 0);
 
 	zfs_rs_set_start(&rsearch, rt, start);
 	zfs_rs_set_end(&rsearch, rt, end);
 	return (zfs_btree_find(&rt->rt_root, &rsearch, NULL));
 }
 
 /*
  * Return the segment that fully contains [start, start + size), or NULL when
  * no segment overlaps the range or the overlapping one does not cover all of
  * it.
  */
 zfs_range_seg_t *
 zfs_range_tree_find(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
 {
 	if (rt->rt_type == ZFS_RANGE_SEG64)
 		ASSERT3U(start + size, >, start);
 
 	zfs_range_seg_t *rs = zfs_range_tree_find_impl(rt, start, size);
 	if (rs != NULL && zfs_rs_get_start(rs, rt) <= start &&
 	    zfs_rs_get_end(rs, rt) >= start + size) {
 		return (rs);
 	}
 	return (NULL);
 }
 
 /*
  * Panic if a segment fully containing [off, off + size) exists in rt.  A
  * segment that only partially overlaps the range does not trigger the panic,
  * because the lookup goes through zfs_range_tree_find().
  */
 void
 zfs_range_tree_verify_not_present(zfs_range_tree_t *rt, uint64_t off,
     uint64_t size)
 {
 	zfs_range_seg_t *rs = zfs_range_tree_find(rt, off, size);
 	if (rs != NULL)
 		panic("segment already in tree; rs=%p", (void *)rs);
 }
 
 /*
  * Report whether some segment of rt fully contains [start, start + size).
  */
 boolean_t
 zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
 {
 	return (zfs_range_tree_find(rt, start, size) != NULL);
 }
 
 /*
  * Returns the first subset of the given range which overlaps with the range
  * tree. Returns true if there is a segment in the range, and false if there
  * isn't.
  *
  * On success the overlap is reported through *ostart and *osize; on failure
  * neither output is written.
  */
 boolean_t
 zfs_range_tree_find_in(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
     uint64_t *ostart, uint64_t *osize)
 {
 	if (rt->rt_type == ZFS_RANGE_SEG64)
 		ASSERT3U(start + size, >, start);
 
 	/* Search key: begins at 'start', raw end is raw start + 1. */
-	range_seg_max_t rsearch;
+	zfs_range_seg_max_t rsearch;
 	zfs_rs_set_start(&rsearch, rt, start);
 	zfs_rs_set_end_raw(&rsearch, rt, zfs_rs_get_start_raw(&rsearch, rt) +
 	    1);
 
 	zfs_btree_index_t where;
 	zfs_range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
 	if (rs != NULL) {
 		/* 'start' itself is inside a segment. */
 		*ostart = start;
 		*osize = MIN(size, zfs_rs_get_end(rs, rt) - start);
 		return (B_TRUE);
 	}
 
 	/*
 	 * Otherwise consider the next segment after the search position.
 	 *
 	 * NOTE(review): a segment starting exactly at start + size passes
 	 * this check and yields B_TRUE with *osize == 0 -- confirm that
 	 * this is intended.
 	 */
 	rs = zfs_btree_next(&rt->rt_root, &where, &where);
 	if (rs == NULL || zfs_rs_get_start(rs, rt) > start + size)
 		return (B_FALSE);
 
 	*ostart = zfs_rs_get_start(rs, rt);
 	*osize = MIN(start + size, zfs_rs_get_end(rs, rt)) -
 	    zfs_rs_get_start(rs, rt);
 	return (B_TRUE);
 }
 
 /*
  * Ensure that this range is not in the tree, regardless of whether
  * it is currently in the tree.
  *
  * A size of 0 is a no-op.  Otherwise, as long as some segment overlaps
  * [start, start + size), the overlapping part of that segment is removed.
  */
 void
 zfs_range_tree_clear(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
 {
 	zfs_range_seg_t *rs;
 
 	if (size == 0)
 		return;
 
 	if (rt->rt_type == ZFS_RANGE_SEG64)
 		ASSERT3U(start + size, >, start);
 
 	while ((rs = zfs_range_tree_find_impl(rt, start, size)) != NULL) {
 		/* Clamp the removal to the part of rs inside the range. */
 		uint64_t free_start = MAX(zfs_rs_get_start(rs, rt), start);
 		uint64_t free_end = MIN(zfs_rs_get_end(rs, rt), start + size);
 		zfs_range_tree_remove(rt, free_start, free_end - free_start);
 	}
 }
 
 /*
  * Exchange the two tree pointers.  The tree initially referenced by *rtdst
  * is asserted to be empty (no space and no btree nodes).
  */
 void
 zfs_range_tree_swap(zfs_range_tree_t **rtsrc, zfs_range_tree_t **rtdst)
 {
 	zfs_range_tree_t *rt;
 
 	ASSERT0(zfs_range_tree_space(*rtdst));
 	ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root));
 
 	rt = *rtsrc;
 	*rtsrc = *rtdst;
 	*rtdst = rt;
 }
 
 /*
  * Empty the range tree.  rtop_vacate, if provided, is called first.  When
  * 'func' is non-NULL it is invoked as func(arg, start, size) for each
  * segment handed back while the btree nodes are destroyed; otherwise the
  * btree is simply cleared.  Finally the histogram and rt_space are reset to
  * zero.
  */
 void
 zfs_range_tree_vacate(zfs_range_tree_t *rt, zfs_range_tree_func_t *func,
     void *arg)
 {
 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
 		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
 
 	if (func != NULL) {
 		zfs_range_seg_t *rs;
 		zfs_btree_index_t *cookie = NULL;
 
 		while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) !=
 		    NULL) {
 			func(arg, zfs_rs_get_start(rs, rt),
 			    zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt));
 		}
 	} else {
 		zfs_btree_clear(&rt->rt_root);
 	}
 
 	memset(rt->rt_histogram, 0, sizeof (rt->rt_histogram));
 	rt->rt_space = 0;
 }
 
 /*
  * Call func(arg, start, size) for every segment in rt, iterating from the
  * first btree element onward.  The tree itself is not modified here.
  */
 void
 zfs_range_tree_walk(zfs_range_tree_t *rt, zfs_range_tree_func_t *func,
     void *arg)
 {
 	zfs_btree_index_t where;
 	for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where);
 	    rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
 		func(arg, zfs_rs_get_start(rs, rt), zfs_rs_get_end(rs, rt) -
 		    zfs_rs_get_start(rs, rt));
 	}
 }
 
 /*
  * Return the first segment of rt as reported by zfs_btree_first().
  */
 zfs_range_seg_t *
 zfs_range_tree_first(zfs_range_tree_t *rt)
 {
 	return (zfs_btree_first(&rt->rt_root, NULL));
 }
 
 /*
  * Return the space currently accounted to rt (its rt_space counter).
  */
 uint64_t
 zfs_range_tree_space(zfs_range_tree_t *rt)
 {
 	return (rt->rt_space);
 }
 
 /*
  * Return the number of btree nodes (segments) in rt.  A NULL tree is
  * tolerated and reported as 0.
  */
 uint64_t
 zfs_range_tree_numsegs(zfs_range_tree_t *rt)
 {
 	return ((rt == NULL) ? 0 : zfs_btree_numnodes(&rt->rt_root));
 }
 
 /*
  * Report whether rt accounts for no space.  Unlike
  * zfs_range_tree_numsegs(), rt must not be NULL.
  */
 boolean_t
 zfs_range_tree_is_empty(zfs_range_tree_t *rt)
 {
 	ASSERT(rt != NULL);
 	return (zfs_range_tree_space(rt) == 0);
 }
 
 /*
  * Remove any overlapping ranges between the given segment [start, end)
  * from removefrom. Add non-overlapping leftovers to addto.
  *
  * The walk begins at the segment of removefrom containing 'start' (or the
  * next one after it) and advances 'start' past each overlap as it goes.
  */
 void
 zfs_range_tree_remove_xor_add_segment(uint64_t start, uint64_t end,
     zfs_range_tree_t *removefrom, zfs_range_tree_t *addto)
 {
 	zfs_btree_index_t where;
-	range_seg_max_t starting_rs;
+	zfs_range_seg_max_t starting_rs;
 	zfs_rs_set_start(&starting_rs, removefrom, start);
 	zfs_rs_set_end_raw(&starting_rs, removefrom,
 	    zfs_rs_get_start_raw(&starting_rs, removefrom) + 1);
 
 	zfs_range_seg_t *curr = zfs_btree_find(&removefrom->rt_root,
 	    &starting_rs, &where);
 
 	if (curr == NULL)
 		curr = zfs_btree_next(&removefrom->rt_root, &where, &where);
 
 	zfs_range_seg_t *next;
 	for (; curr != NULL; curr = next) {
 		if (start == end)
 			return;
 		VERIFY3U(start, <, end);
 
 		/* there is no overlap */
 		if (end <= zfs_rs_get_start(curr, removefrom)) {
 			zfs_range_tree_add(addto, start, end - start);
 			return;
 		}
 
 		uint64_t overlap_start = MAX(zfs_rs_get_start(curr, removefrom),
 		    start);
 		uint64_t overlap_end = MIN(zfs_rs_get_end(curr, removefrom),
 		    end);
 		uint64_t overlap_size = overlap_end - overlap_start;
 		ASSERT3S(overlap_size, >, 0);
 		/* Keep a copy of curr; the removal below changes the tree. */
-		range_seg_max_t rs;
+		zfs_range_seg_max_t rs;
 		zfs_rs_copy(curr, &rs, removefrom);
 
 		zfs_range_tree_remove(removefrom, overlap_start, overlap_size);
 
 		/* The part before the overlap was not in removefrom. */
 		if (start < overlap_start)
 			zfs_range_tree_add(addto, start, overlap_start - start);
 
 		start = overlap_end;
 		next = zfs_btree_find(&removefrom->rt_root, &rs, &where);
 		/*
 		 * If we find something here, we only removed part of the
 		 * curr segment. Either there's some left at the end
 		 * because we've reached the end of the range we're removing,
 		 * or there's some left at the start because we started
 		 * partway through the range.  Either way, we continue with
 		 * the loop. If it's the former, we'll return at the start of
 		 * the loop, and if it's the latter we'll see if there is more
 		 * area to process.
 		 */
 		if (next != NULL) {
 			ASSERT(start == end || start == zfs_rs_get_end(&rs,
 			    removefrom));
 		}
 
 		next = zfs_btree_next(&removefrom->rt_root, &where, &where);
 	}
 	VERIFY3P(curr, ==, NULL);
 
 	/* Whatever remains lies past the last segment of removefrom. */
 	if (start != end) {
 		VERIFY3U(start, <, end);
 		zfs_range_tree_add(addto, start, end - start);
 	} else {
 		VERIFY3U(start, ==, end);
 	}
 }
 
 /*
  * For each entry in rt, if it exists in removefrom, remove it
  * from removefrom. Otherwise, add it to addto.
  *
  * Each segment of rt is processed independently through
  * zfs_range_tree_remove_xor_add_segment(); rt itself is only read.
  */
 void
 zfs_range_tree_remove_xor_add(zfs_range_tree_t *rt,
     zfs_range_tree_t *removefrom, zfs_range_tree_t *addto)
 {
 	zfs_btree_index_t where;
 	for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs;
 	    rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
 		zfs_range_tree_remove_xor_add_segment(zfs_rs_get_start(rs, rt),
 		    zfs_rs_get_end(rs, rt), removefrom, addto);
 	}
 }
 
 /*
  * Return the start of the first segment in rt, or 0 when the tree has no
  * segments.
  */
 uint64_t
 zfs_range_tree_min(zfs_range_tree_t *rt)
 {
 	zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL);
 	return (rs != NULL ? zfs_rs_get_start(rs, rt) : 0);
 }
 
 /*
  * Return the end of the last segment in rt, or 0 when the tree has no
  * segments.
  */
 uint64_t
 zfs_range_tree_max(zfs_range_tree_t *rt)
 {
 	zfs_range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL);
 	return (rs != NULL ? zfs_rs_get_end(rs, rt) : 0);
 }
 
 /*
  * Return the distance from the start of the first segment to the end of the
  * last one (0 for a tree with no segments).
  */
 uint64_t
 zfs_range_tree_span(zfs_range_tree_t *rt)
 {
 	return (zfs_range_tree_max(rt) - zfs_range_tree_min(rt));
 }
diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c
index e9e03e05c86a..36e15b8d73af 100644
--- a/module/zfs/space_map.c
+++ b/module/zfs/space_map.c
@@ -1,1111 +1,1111 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /*
  * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/dnode.h>
 #include <sys/dsl_pool.h>
 #include <sys/zio.h>
 #include <sys/space_map.h>
 #include <sys/zfeature.h>
 
 /*
  * Note on space map block size:
  *
  * The data for a given space map can be kept on blocks of any size.
  * Larger blocks entail fewer I/O operations, but they also cause the
  * DMU to keep more data in-core, and also to waste more I/O bandwidth
  * when only a few blocks have changed since the last transaction group.
  */
 
 /*
  * Enabled whenever we want to stress test the use of double-word
  * space map entries.
  */
 boolean_t zfs_force_some_double_word_sm_entries = B_FALSE;
 
 /*
  * Override the default indirect block size of 128K, instead use 16K for
  * spacemaps (2^14 bytes).  This dramatically reduces write inflation since
  * appending to a spacemap typically has to write one data block (4KB) and one
  * or two indirect blocks (16K-32K, rather than 128K).
  */
 int space_map_ibs = 14;
 
 /*
  * Report whether space map word 'e' carries the debug prefix.
  */
 boolean_t
 sm_entry_is_debug(uint64_t e)
 {
 	return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX);
 }
 
 /*
  * Report whether space map word 'e' is a single-word entry, i.e. its prefix
  * is neither the debug prefix nor the two-word (SM2) prefix.
  */
 boolean_t
 sm_entry_is_single_word(uint64_t e)
 {
 	uint8_t prefix = SM_PREFIX_DECODE(e);
 	return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX);
 }
 
 /*
  * Report whether space map word 'e' carries the two-word (SM2) prefix, i.e.
  * is the first word of a double-word entry.
  */
 boolean_t
 sm_entry_is_double_word(uint64_t e)
 {
 	return (SM_PREFIX_DECODE(e) == SM2_PREFIX);
 }
 
 /*
  * Iterate through the space map, invoking the callback on each (non-debug)
  * space map entry. Stop after reading 'end' bytes of the space map.
  *
  * Each entry handed to the callback also carries the TXG and sync pass of
  * the most recent debug entry with a nonzero TXG seen so far (0 if none).
  *
  * Returns 0 on success, the error from dmu_buf_hold() if a block cannot be
  * held, or the first nonzero value returned by the callback, which also
  * stops the iteration.
  */
 int
 space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
 {
 	uint64_t blksz = sm->sm_blksz;
 
 	ASSERT3U(blksz, !=, 0);
 	ASSERT3U(end, <=, space_map_length(sm));
 	ASSERT0(P2PHASE(end, sizeof (uint64_t)));
 
 	dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end,
 	    ZIO_PRIORITY_SYNC_READ);
 
 	int error = 0;
 	uint64_t txg = 0, sync_pass = 0;
 	for (uint64_t block_base = 0; block_base < end && error == 0;
 	    block_base += blksz) {
 		dmu_buf_t *db;
 		error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
 		    block_base, FTAG, &db, DMU_READ_PREFETCH);
 		if (error != 0)
 			return (error);
 
 		/* The last block may be only partially covered by 'end'. */
 		uint64_t *block_start = db->db_data;
 		uint64_t block_length = MIN(end - block_base, blksz);
 		uint64_t *block_end = block_start +
 		    (block_length / sizeof (uint64_t));
 
 		VERIFY0(P2PHASE(block_length, sizeof (uint64_t)));
 		VERIFY3U(block_length, !=, 0);
 		ASSERT3U(blksz, ==, db->db_size);
 
 		for (uint64_t *block_cursor = block_start;
 		    block_cursor < block_end && error == 0; block_cursor++) {
 			uint64_t e = *block_cursor;
 
 			if (sm_entry_is_debug(e)) {
 				/*
 				 * Debug entries are only needed to record the
 				 * current TXG and sync pass if available.
 				 *
 				 * Note though that sometimes there can be
 				 * debug entries that are used as padding
 				 * at the end of space map blocks in-order
 				 * to not split a double-word entry in the
 				 * middle between two blocks. These entries
 				 * have their TXG field set to 0 and we
 				 * skip them without recording the TXG.
 				 * [see comment in space_map_write_seg()]
 				 */
 				uint64_t e_txg = SM_DEBUG_TXG_DECODE(e);
 				if (e_txg != 0) {
 					txg = e_txg;
 					sync_pass = SM_DEBUG_SYNCPASS_DECODE(e);
 				} else {
 					ASSERT0(SM_DEBUG_SYNCPASS_DECODE(e));
 				}
 				continue;
 			}
 
 			uint64_t raw_offset, raw_run, vdev_id;
 			maptype_t type;
 			if (sm_entry_is_single_word(e)) {
 				type = SM_TYPE_DECODE(e);
 				vdev_id = SM_NO_VDEVID;
 				raw_offset = SM_OFFSET_DECODE(e);
 				raw_run = SM_RUN_DECODE(e);
 			} else {
 				/* it is a two-word entry */
 				ASSERT(sm_entry_is_double_word(e));
 				raw_run = SM2_RUN_DECODE(e);
 				vdev_id = SM2_VDEV_DECODE(e);
 
 				/*
 				 * move on to the second word
 				 *
 				 * NOTE(review): the word is read before the
 				 * bounds check, and the check also accepts
 				 * block_cursor == block_end -- confirm.
 				 */
 				block_cursor++;
 				e = *block_cursor;
 				VERIFY3P(block_cursor, <=, block_end);
 
 				type = SM2_TYPE_DECODE(e);
 				raw_offset = SM2_OFFSET_DECODE(e);
 			}
 
 			/* Scale the raw values by sm_shift; offset is relative to sm_start. */
 			uint64_t entry_offset = (raw_offset << sm->sm_shift) +
 			    sm->sm_start;
 			uint64_t entry_run = raw_run << sm->sm_shift;
 
 			VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
 			VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
 			ASSERT3U(entry_offset, >=, sm->sm_start);
 			ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size);
 			ASSERT3U(entry_run, <=, sm->sm_size);
 			ASSERT3U(entry_offset + entry_run, <=,
 			    sm->sm_start + sm->sm_size);
 
 			space_map_entry_t sme = {
 			    .sme_type = type,
 			    .sme_vdev = vdev_id,
 			    .sme_offset = entry_offset,
 			    .sme_run = entry_run,
 			    .sme_txg = txg,
 			    .sme_sync_pass = sync_pass
 			};
 			error = callback(&sme, arg);
 		}
 		dmu_buf_rele(db, FTAG);
 	}
 	return (error);
 }
 
 /*
  * Reads the entries from the last block of the space map into
  * buf in reverse order. Populates nwords with number of words
  * in the last block.
  *
  * Refer to block comment within space_map_incremental_destroy()
  * to understand why this function is needed.
  *
  * 'bufsz' is the size of buf in bytes and is asserted to be at least the
  * size of the block that was read.  Returns 0 on success or the error from
  * dmu_buf_hold(), in which case buf and *nwords are left untouched.
  */
 static int
 space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
     uint64_t bufsz, uint64_t *nwords)
 {
 	int error = 0;
 	dmu_buf_t *db;
 
 	/*
 	 * Find the offset of the last word in the space map and use
 	 * that to read the last block of the space map with
 	 * dmu_buf_hold().
 	 */
 	uint64_t last_word_offset =
 	    sm->sm_phys->smp_length - sizeof (uint64_t);
 	error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
 	    FTAG, &db, DMU_READ_NO_PREFETCH);
 	if (error != 0)
 		return (error);
 
 	ASSERT3U(sm->sm_object, ==, db->db_object);
 	ASSERT3U(sm->sm_blksz, ==, db->db_size);
 	ASSERT3U(bufsz, >=, db->db_size);
 	ASSERT(nwords != NULL);
 
 	/* Only the words up to smp_length belong to the space map. */
 	uint64_t *words = db->db_data;
 	*nwords =
 	    (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
 
 	ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));
 
 	/* i walks the block forward while j fills buf from the back. */
 	uint64_t n = *nwords;
 	uint64_t j = n - 1;
 	for (uint64_t i = 0; i < n; i++) {
 		uint64_t entry = words[i];
 		if (sm_entry_is_double_word(entry)) {
 			/*
 			 * Since we are populating the buffer backwards
 			 * we have to be extra careful and add the two
 			 * words of the double-word entry in the right
 			 * order.
 			 */
 			ASSERT3U(j, >, 0);
 			buf[j - 1] = entry;
 
 			i++;
 			ASSERT3U(i, <, n);
 			entry = words[i];
 			buf[j] = entry;
 			j -= 2;
 		} else {
 			ASSERT(sm_entry_is_debug(entry) ||
 			    sm_entry_is_single_word(entry));
 			buf[j] = entry;
 			j--;
 		}
 	}
 
 	/*
 	 * Assert that we wrote backwards all the
 	 * way to the beginning of the buffer.
 	 *
 	 * j is unsigned, so stepping below index 0 wraps around; the check
 	 * below presumably reads that value as a signed -1.
 	 */
 	ASSERT3S(j, ==, -1);
 
 	dmu_buf_rele(db, FTAG);
 	return (error);
 }
 
 /*
  * Note: This function performs destructive actions - specifically
  * it deletes entries from the end of the space map. Thus, callers
  * should ensure that they are holding the appropriate locks for
  * the space map that they provide.
  *
  * 'callback' is invoked with 'arg' for every non-debug entry, last entry
  * first.  An entry is only dropped from the space map (smp_length and
  * smp_alloc updated) after the callback returned 0 for it.  The entries
  * passed to the callback do not carry a TXG or sync pass (those fields are
  * left zero).
  *
  * Returns 0 once the space map is empty, otherwise the first error from
  * reading the last block or from the callback.
  */
 int
 space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
     dmu_tx_t *tx)
 {
 	uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
 	uint64_t *buf = zio_buf_alloc(bufsz);
 
 	dmu_buf_will_dirty(sm->sm_dbuf, tx);
 
 	/*
 	 * Ideally we would want to iterate from the beginning of the
 	 * space map to the end in incremental steps. The issue with this
 	 * approach is that we don't have any field on-disk that points
 	 * us where to start between each step. We could try zeroing out
 	 * entries that we've destroyed, but this doesn't work either as
 	 * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]).
 	 *
 	 * As a result, we destroy its entries incrementally starting from
 	 * the end after applying the callback to each of them.
 	 *
 	 * The problem with this approach is that we cannot literally
 	 * iterate through the words in the space map backwards as we
 	 * can't distinguish two-word space map entries from their second
 	 * word. Thus we do the following:
 	 *
 	 * 1] We get all the entries from the last block of the space map
 	 *    and put them into a buffer in reverse order. This way the
 	 *    last entry comes first in the buffer, the second to last is
 	 *    second, etc.
 	 * 2] We iterate through the entries in the buffer and we apply
 	 *    the callback to each one. As we move from entry to entry we
 	 *    decrease the size of the space map, deleting effectively
 	 *    each entry.
 	 * 3] If there are no more entries in the space map or the callback
 	 *    returns a value other than 0, we stop iterating over the
 	 *    space map. If there are entries remaining and the callback
 	 *    returned 0, we go back to step [1].
 	 */
 	int error = 0;
 	while (space_map_length(sm) > 0 && error == 0) {
 		uint64_t nwords = 0;
 		error = space_map_reversed_last_block_entries(sm, buf, bufsz,
 		    &nwords);
 		if (error != 0)
 			break;
 
 		ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t));
 
 		for (uint64_t i = 0; i < nwords; i++) {
 			uint64_t e = buf[i];
 
 			/* Debug entries are dropped without a callback. */
 			if (sm_entry_is_debug(e)) {
 				sm->sm_phys->smp_length -= sizeof (uint64_t);
 				continue;
 			}
 
 			int words = 1;
 			uint64_t raw_offset, raw_run, vdev_id;
 			maptype_t type;
 			if (sm_entry_is_single_word(e)) {
 				type = SM_TYPE_DECODE(e);
 				vdev_id = SM_NO_VDEVID;
 				raw_offset = SM_OFFSET_DECODE(e);
 				raw_run = SM_RUN_DECODE(e);
 			} else {
 				ASSERT(sm_entry_is_double_word(e));
 				words = 2;
 
 				raw_run = SM2_RUN_DECODE(e);
 				vdev_id = SM2_VDEV_DECODE(e);
 
 				/*
 				 * move to the second word
 				 *
 				 * NOTE(review): buf[i] is read before the
 				 * check, the check also accepts i == nwords,
 				 * and ASSERT3P is applied to an integer
 				 * index -- confirm.
 				 */
 				i++;
 				e = buf[i];
 
 				ASSERT3P(i, <=, nwords);
 
 				type = SM2_TYPE_DECODE(e);
 				raw_offset = SM2_OFFSET_DECODE(e);
 			}
 
 			uint64_t entry_offset =
 			    (raw_offset << sm->sm_shift) + sm->sm_start;
 			uint64_t entry_run = raw_run << sm->sm_shift;
 
 			VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
 			VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
 			VERIFY3U(entry_offset, >=, sm->sm_start);
 			VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size);
 			VERIFY3U(entry_run, <=, sm->sm_size);
 			VERIFY3U(entry_offset + entry_run, <=,
 			    sm->sm_start + sm->sm_size);
 
 			space_map_entry_t sme = {
 			    .sme_type = type,
 			    .sme_vdev = vdev_id,
 			    .sme_offset = entry_offset,
 			    .sme_run = entry_run
 			};
 			error = callback(&sme, arg);
 			if (error != 0)
 				break;
 
 			/* Undo the entry's effect on the allocated total. */
 			if (type == SM_ALLOC)
 				sm->sm_phys->smp_alloc -= entry_run;
 			else
 				sm->sm_phys->smp_alloc += entry_run;
 			sm->sm_phys->smp_length -= words * sizeof (uint64_t);
 		}
 	}
 
 	if (space_map_length(sm) == 0) {
 		ASSERT0(error);
 		ASSERT0(space_map_allocated(sm));
 	}
 
 	zio_buf_free(buf, bufsz);
 	return (error);
 }
 
 /*
  * Argument bundle that space_map_load_length() passes, through the opaque
  * callback argument of space_map_iterate(), to space_map_load_callback().
  */
 typedef struct space_map_load_arg {
 	/* space map being loaded; its sm_size bounds the tree's space */
 	space_map_t	*smla_sm;
 	/* range tree that entries are added to / removed from */
 	zfs_range_tree_t	*smla_rt;
 	/* entries of this type are added, all others are removed */
 	maptype_t	smla_type;
 } space_map_load_arg_t;
 
 /*
  * space_map_iterate() callback used while loading: an entry whose type
  * matches smla_type is added to the range tree, any other entry is
  * removed from it. Always returns 0, so iteration is never cut short.
  */
 static int
 space_map_load_callback(space_map_entry_t *sme, void *arg)
 {
 	space_map_load_arg_t *smla = arg;
 
 	/* entries of the other type take their range back out of the tree */
 	if (sme->sme_type != smla->smla_type) {
 		zfs_range_tree_remove(smla->smla_rt, sme->sme_offset,
 		    sme->sme_run);
 		return (0);
 	}
 
 	/* the tree may never hold more space than the space map spans */
 	VERIFY3U(zfs_range_tree_space(smla->smla_rt) + sme->sme_run, <=,
 	    smla->smla_sm->sm_size);
 	zfs_range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run);
 
 	return (0);
 }
 
 /*
  * Same job as space_map_load(), except that only the first 'length'
  * bytes of the space map are read into the range tree.
  *
  * The range tree must be empty on entry. If the iteration fails the
  * tree is vacated again and the error is returned.
  */
 int
 space_map_load_length(space_map_t *sm, zfs_range_tree_t *rt, maptype_t maptype,
     uint64_t length)
 {
 	space_map_load_arg_t smla = {
 	    .smla_sm = sm,
 	    .smla_rt = rt,
 	    .smla_type = maptype
 	};
 	int error;
 
 	VERIFY0(zfs_range_tree_space(rt));
 
 	/*
 	 * When loading free space, seed the tree with the whole range of
 	 * the space map; the callback then removes non-matching entries.
 	 */
 	if (maptype == SM_FREE)
 		zfs_range_tree_add(rt, sm->sm_start, sm->sm_size);
 
 	error = space_map_iterate(sm, length, space_map_load_callback, &smla);
 	if (error == 0)
 		return (0);
 
 	zfs_range_tree_vacate(rt, NULL, NULL);
 	return (error);
 }
 
 /*
  * Load the entire on-disk space map into the specified range tree.
  * Segments of maptype are added to the range tree, segments of any other
  * type are removed from it.
  */
 int
 space_map_load(space_map_t *sm, zfs_range_tree_t *rt, maptype_t maptype)
 {
 	uint64_t whole_length = space_map_length(sm);
 
 	return (space_map_load_length(sm, rt, maptype, whole_length));
 }
 
 /*
  * Zero the space map's histogram. Nothing is touched unless the bonus
  * buffer is exactly the size of a space_map_phys_t.
  */
 void
 space_map_histogram_clear(space_map_t *sm)
 {
 	if (sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) {
 		memset(sm->sm_phys->smp_histogram, 0,
 		    sizeof (sm->sm_phys->smp_histogram));
 	}
 }
 
 /*
  * Returns B_TRUE when every range tree histogram bucket below sm_shift
  * is empty, i.e. the in-core range tree holds no ranges smaller than
  * the space map's sm_shift size; B_FALSE otherwise.
  */
 boolean_t
 space_map_histogram_verify(space_map_t *sm, zfs_range_tree_t *rt)
 {
 	int bucket = 0;
 
 	while (bucket < sm->sm_shift) {
 		if (rt->rt_histogram[bucket] != 0)
 			return (B_FALSE);
 		bucket++;
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * Fold the histogram of range tree 'rt' into the histogram stored in the
  * space map's sm_phys. Asserts syncing context and verifies that the
  * space map has an object. Returns without doing anything when the bonus
  * buffer is not exactly the size of a space_map_phys_t.
  */
 void
 space_map_histogram_add(space_map_t *sm, zfs_range_tree_t *rt, dmu_tx_t *tx)
 {
 	/* index of the space map bucket that receives the current range */
 	int idx = 0;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	VERIFY3U(space_map_object(sm), !=, 0);
 
 	if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
 		return;
 
 	dmu_buf_will_dirty(sm->sm_dbuf, tx);
 
 	ASSERT(space_map_histogram_verify(sm, rt));
 	/*
 	 * Transfer the content of the range tree histogram to the space
 	 * map histogram. The space map histogram contains 32 buckets ranging
 	 * between 2^sm_shift to 2^(32+sm_shift-1). The range tree,
 	 * however, can represent ranges from 2^0 to 2^63. Since the space
 	 * map only cares about allocatable blocks (minimum of sm_shift) we
 	 * can safely ignore all ranges in the range tree smaller than sm_shift.
 	 */
-	for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+	for (int i = sm->sm_shift; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) {
 
 		/*
 		 * Since the largest histogram bucket in the space map is
 		 * 2^(32+sm_shift-1), we need to normalize the values in
 		 * the range tree for any bucket larger than that size. For
 		 * example given an sm_shift of 9, ranges larger than 2^40
 		 * would get normalized as if they were 1TB ranges. Assume
 		 * the range tree had a count of 5 in the 2^44 (16TB) bucket,
 		 * the calculation below would normalize this to 5 * 2^4 (16).
 		 */
 		ASSERT3U(i, >=, idx + sm->sm_shift);
 		/*
 		 * While idx still tracks i the shift amount is zero; once idx
 		 * is pinned to the last bucket the count is scaled up.
 		 */
 		sm->sm_phys->smp_histogram[idx] +=
 		    rt->rt_histogram[i] << (i - idx - sm->sm_shift);
 
 		/*
 		 * Increment the space map's index as long as we haven't
 		 * reached the maximum bucket size. Accumulate all ranges
 		 * larger than the max bucket size into the last bucket.
 		 */
 		if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
 			ASSERT3U(idx + sm->sm_shift, ==, i);
 			idx++;
 			ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
 		}
 	}
 }
 
 /*
  * Append one debug entry to the space map object. The entry encodes the
  * map type being written, the current sync pass and the txg of 'tx'.
  * smp_length grows by the size of that entry.
  */
 static void
 space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
 {
 	uint64_t dentry;
 
 	dmu_buf_will_dirty(sm->sm_dbuf, tx);
 
 	/* assemble the debug word one field at a time */
 	dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX);
 	dentry |= SM_DEBUG_ACTION_ENCODE(maptype);
 	dentry |= SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa));
 	dentry |= SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
 
 	/* the entry lands right after the current end of the space map */
 	dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length,
 	    sizeof (dentry), &dentry, tx);
 
 	sm->sm_phys->smp_length += sizeof (dentry);
 }
 
 /*
  * Writes one or more entries given a segment.
  *
  * The segment [rstart, rend) is encoded as entries that are 'words'
  * 64-bit words wide (1 or 2). A segment longer than the maximum run of
  * that entry width is split into several consecutive entries.
  * smp_length is advanced for every word written, padding included.
  *
  * Note: The function may release the dbuf from the pointer initially
  * passed to it, and return a different dbuf. Also, the space map's
  * dbuf must be dirty for the changes in sm_phys to take effect.
  */
 static void
 space_map_write_seg(space_map_t *sm, uint64_t rstart, uint64_t rend,
     maptype_t maptype, uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp,
     const void *tag, dmu_tx_t *tx)
 {
 	ASSERT3U(words, !=, 0);
 	ASSERT3U(words, <=, 2);
 
 	/* ensure the vdev_id can be represented by the space map */
 	ASSERT3U(vdev_id, <=, SM_NO_VDEVID);
 
 	/*
 	 * if this is a single word entry, ensure that no vdev was
 	 * specified.
 	 */
 	IMPLY(words == 1, vdev_id == SM_NO_VDEVID);
 
 	dmu_buf_t *db = *dbp;
 	ASSERT3U(db->db_size, ==, sm->sm_blksz);
 
 	/*
 	 * Word-granular view of the current block: block_cursor points at
 	 * the word that corresponds to the current end of the space map.
 	 */
 	uint64_t *block_base = db->db_data;
 	uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
 	uint64_t *block_cursor = block_base +
 	    (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
 
 	ASSERT3P(block_cursor, <=, block_end);
 
 	/* size and start are expressed in units of 1 << sm_shift */
 	uint64_t size = (rend - rstart) >> sm->sm_shift;
 	uint64_t start = (rstart - sm->sm_start) >> sm->sm_shift;
 	/* longest run a single entry of the chosen width can encode */
 	uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX;
 
 	ASSERT3U(rstart, >=, sm->sm_start);
 	ASSERT3U(rstart, <, sm->sm_start + sm->sm_size);
 	ASSERT3U(rend - rstart, <=, sm->sm_size);
 	ASSERT3U(rend, <=, sm->sm_start + sm->sm_size);
 
 	while (size != 0) {
 		ASSERT3P(block_cursor, <=, block_end);
 
 		/*
 		 * If we are at the end of this block, flush it and start
 		 * writing again from the beginning.
 		 */
 		if (block_cursor == block_end) {
 			dmu_buf_rele(db, tag);
 
 			/* hold the block that contains the next word to write */
 			uint64_t next_word_offset = sm->sm_phys->smp_length;
 			VERIFY0(dmu_buf_hold(sm->sm_os,
 			    space_map_object(sm), next_word_offset,
 			    tag, &db, DMU_READ_PREFETCH));
 			dmu_buf_will_dirty(db, tx);
 
 			/* update caller's dbuf */
 			*dbp = db;
 
 			ASSERT3U(db->db_size, ==, sm->sm_blksz);
 
 			block_base = db->db_data;
 			block_cursor = block_base;
 			block_end = block_base +
 			    (db->db_size / sizeof (uint64_t));
 		}
 
 		/*
 		 * If we are writing a two-word entry and we only have one
 		 * word left on this block, just pad it with an empty debug
 		 * entry and write the two-word entry in the next block.
 		 */
 		uint64_t *next_entry = block_cursor + 1;
 		if (next_entry == block_end && words > 1) {
 			ASSERT3U(words, ==, 2);
 			*block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
 			    SM_DEBUG_ACTION_ENCODE(0) |
 			    SM_DEBUG_SYNCPASS_ENCODE(0) |
 			    SM_DEBUG_TXG_ENCODE(0);
 			block_cursor++;
 			sm->sm_phys->smp_length += sizeof (uint64_t);
 			ASSERT3P(block_cursor, ==, block_end);
 			/* loop around so the end-of-block case switches blocks */
 			continue;
 		}
 
 		/* emit at most run_max units now; the rest goes to later entries */
 		uint64_t run_len = MIN(size, run_max);
 		switch (words) {
 		case 1:
 			*block_cursor = SM_OFFSET_ENCODE(start) |
 			    SM_TYPE_ENCODE(maptype) |
 			    SM_RUN_ENCODE(run_len);
 			block_cursor++;
 			break;
 		case 2:
 			/* write the first word of the entry */
 			*block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) |
 			    SM2_RUN_ENCODE(run_len) |
 			    SM2_VDEV_ENCODE(vdev_id);
 			block_cursor++;
 
 			/* move on to the second word of the entry */
 			ASSERT3P(block_cursor, <, block_end);
 			*block_cursor = SM2_TYPE_ENCODE(maptype) |
 			    SM2_OFFSET_ENCODE(start);
 			block_cursor++;
 			break;
 		default:
 			panic("%d-word space map entries are not supported",
 			    words);
 			break;
 		}
 		sm->sm_phys->smp_length += words * sizeof (uint64_t);
 
 		/* advance past the part of the segment that was just written */
 		start += run_len;
 		size -= run_len;
 	}
 	ASSERT0(size);
 
 }
 
 /*
  * Appends an intro debug entry followed by one or more entries for every
  * segment of the range tree 'rt' to the space map object.
  *
  * Note: The space map's dbuf must be dirty for the changes in sm_phys to
  * take effect.
  */
 static void
 space_map_write_impl(space_map_t *sm, zfs_range_tree_t *rt, maptype_t maptype,
     uint64_t vdev_id, dmu_tx_t *tx)
 {
 	spa_t *spa = tx->tx_pool->dp_spa;
 	dmu_buf_t *db;
 
 	space_map_write_intro_debug(sm, maptype, tx);
 
 #ifdef ZFS_DEBUG
 	/*
 	 * We do this right after we write the intro debug entry
 	 * because the estimate does not take it into account.
 	 *
 	 * NOTE(review): the estimate is requested with SM_NO_VDEVID rather
 	 * than the vdev_id passed to this function - confirm that this is
 	 * intended.
 	 */
 	uint64_t initial_objsize = sm->sm_phys->smp_length;
 	uint64_t estimated_growth =
 	    space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
 	uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
 #endif
 
 	/*
 	 * Find the offset right after the last word in the space map
 	 * and use that to get a hold of the last block, so we can
 	 * start appending to it.
 	 */
 	uint64_t next_word_offset = sm->sm_phys->smp_length;
 	VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
 	    next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
 	ASSERT3U(db->db_size, ==, sm->sm_blksz);
 
 	dmu_buf_will_dirty(db, tx);
 
 	/* walk every segment stored in the range tree's btree */
 	zfs_btree_t *t = &rt->rt_root;
 	zfs_btree_index_t where;
 	for (zfs_range_seg_t *rs = zfs_btree_first(t, &where); rs != NULL;
 	    rs = zfs_btree_next(t, &where, &where)) {
 		/*
 		 * offset and length are in sm_shift units; they are only used
 		 * below to choose the entry width.
 		 */
 		uint64_t offset = (zfs_rs_get_start(rs, rt) - sm->sm_start) >>
 		    sm->sm_shift;
 		uint64_t length = (zfs_rs_get_end(rs, rt) -
 		    zfs_rs_get_start(rs, rt)) >> sm->sm_shift;
 		uint8_t words = 1;
 
 		/*
 		 * We only write two-word entries when both of the following
 		 * are true:
 		 *
 		 * [1] The feature is enabled.
 		 * [2] The offset or run is too big for a single-word entry,
 		 *	or the vdev_id is set (meaning not equal to
 		 *	SM_NO_VDEVID).
 		 *
 		 * Note that for purposes of testing we've added the case that
 		 * we write two-word entries occasionally when the feature is
 		 * enabled and zfs_force_some_double_word_sm_entries has been
 		 * set.
 		 */
 		if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) &&
 		    (offset >= (1ULL << SM_OFFSET_BITS) ||
 		    length > SM_RUN_MAX ||
 		    vdev_id != SM_NO_VDEVID ||
 		    (zfs_force_some_double_word_sm_entries &&
 		    random_in_range(100) == 0)))
 			words = 2;
 
 		/* db may be swapped for another block's dbuf by this call */
 		space_map_write_seg(sm, zfs_rs_get_start(rs, rt),
 		    zfs_rs_get_end(rs, rt), maptype, vdev_id, words, &db,
 		    FTAG, tx);
 	}
 
 	dmu_buf_rele(db, FTAG);
 
 #ifdef ZFS_DEBUG
 	/*
 	 * We expect our estimation to be based on the worst case
 	 * scenario [see comment in space_map_estimate_optimal_size()].
 	 * Therefore we expect the actual objsize to be equal or less
 	 * than whatever we estimated it to be.
 	 */
 	ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length);
 #endif
 }
 
 /*
  * Append the segments of range tree 'rt' to the space map as entries of
  * type 'maptype' and adjust smp_alloc by the tree's space. An empty
  * range tree only refreshes smp_object.
  *
  * Note: This function manipulates the state of the given space map but
  * does not hold any locks implicitly. Thus the caller is responsible
  * for synchronizing writes to the space map.
  */
 void
 space_map_write(space_map_t *sm, zfs_range_tree_t *rt, maptype_t maptype,
     uint64_t vdev_id, dmu_tx_t *tx)
 {
 	ASSERT(dsl_pool_sync_context(dmu_objset_pool(sm->sm_os)));
 	VERIFY3U(space_map_object(sm), !=, 0);
 
 	dmu_buf_will_dirty(sm->sm_dbuf, tx);
 
 	/*
 	 * The in-core space map already carries the object number, so
 	 * this on-disk copy is redundant; it is still kept up to date
 	 * for backwards compatibility.
 	 */
 	sm->sm_phys->smp_object = sm->sm_object;
 
 	if (zfs_range_tree_is_empty(rt)) {
 		VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object);
 		return;
 	}
 
 	/* snapshot the tree's accounting before anything is written */
 	uint64_t rt_space = zfs_range_tree_space(rt);
 	uint64_t nodes = zfs_btree_numnodes(&rt->rt_root);
 
 	if (maptype == SM_ALLOC) {
 		sm->sm_phys->smp_alloc += rt_space;
 	} else {
 		sm->sm_phys->smp_alloc -= rt_space;
 	}
 
 	space_map_write_impl(sm, rt, maptype, vdev_id, tx);
 
 	/*
 	 * The range tree must look exactly as it did before the write;
 	 * a difference means it was modified while being written out.
 	 */
 	VERIFY3U(nodes, ==, zfs_btree_numnodes(&rt->rt_root));
 	VERIFY3U(zfs_range_tree_space(rt), ==, rt_space);
 }
 
 /*
  * Hold the bonus buffer of the space map's object (tagged with 'sm') and
  * derive sm_blksz and sm_phys from it. Returns 0 on success or the error
  * from dmu_bonus_hold(), in which case nothing else is modified here.
  */
 static int
 space_map_open_impl(space_map_t *sm)
 {
 	u_longlong_t blocks;
 	int error;
 
 	error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf);
 	if (error == 0) {
 		dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks);
 		sm->sm_phys = sm->sm_dbuf->db_data;
 	}
 
 	return (error);
 }
 
 /*
  * Allocate an in-core space map for 'object' in 'os', covering 'size'
  * bytes from 'start' with the given shift, and open its bonus buffer.
  *
  * On success *smp (which must be NULL on entry) is set to the new space
  * map and 0 is returned. On failure the space map is closed again, *smp
  * is left untouched and the error is returned.
  */
 int
 space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
     uint64_t start, uint64_t size, uint8_t shift)
 {
 	ASSERT(*smp == NULL);
 	ASSERT(os != NULL);
 	ASSERT(object != 0);
 
 	space_map_t *sm = kmem_alloc(sizeof (space_map_t), KM_SLEEP);
 
 	/* identity of the backing object */
 	sm->sm_os = os;
 	sm->sm_object = object;
 
 	/* range described by the space map */
 	sm->sm_start = start;
 	sm->sm_size = size;
 	sm->sm_shift = shift;
 
 	/* filled in by space_map_open_impl() */
 	sm->sm_blksz = 0;
 	sm->sm_dbuf = NULL;
 	sm->sm_phys = NULL;
 
 	int error = space_map_open_impl(sm);
 	if (error == 0) {
 		*smp = sm;
 		return (0);
 	}
 
 	space_map_close(sm);
 	return (error);
 }
 
 /*
  * Release the space map's bonus buffer hold, if it has one, and free the
  * in-core structure. A NULL space map is accepted and ignored.
  */
 void
 space_map_close(space_map_t *sm)
 {
 	if (sm != NULL) {
 		if (sm->sm_dbuf != NULL) {
 			dmu_buf_rele(sm->sm_dbuf, sm);
 			sm->sm_dbuf = NULL;
 		}
 		sm->sm_phys = NULL;
 
 		kmem_free(sm, sizeof (*sm));
 	}
 }
 
 /*
  * Empty the space map: afterwards smp_length and smp_alloc are both 0.
  * Depending on the object's current bonus and block sizes the object is
  * either freed and reallocated with 'blocksize', or its contents are
  * freed in place. Asserts sync context and a syncing tx.
  */
 void
 space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx)
 {
 	objset_t *os = sm->sm_os;
 	spa_t *spa = dmu_objset_spa(os);
 	dmu_object_info_t doi;
 
 	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
 	ASSERT(dmu_tx_is_syncing(tx));
 	VERIFY3U(dmu_tx_get_txg(tx), <=, spa_final_dirty_txg(spa));
 
 	dmu_object_info_from_db(sm->sm_dbuf, &doi);
 
 	/*
 	 * If the space map has the wrong bonus size (because
 	 * SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or
 	 * the wrong block size (because space_map_blksz has changed),
 	 * free and re-allocate its object with the updated sizes.
 	 *
 	 * Otherwise, just truncate the current object.
 	 */
 	if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
 	    doi.doi_bonus_size != sizeof (space_map_phys_t)) ||
 	    doi.doi_data_block_size != blocksize ||
 	    doi.doi_metadata_block_size != 1 << space_map_ibs) {
 		zfs_dbgmsg("txg %llu, spa %s, sm %px, reallocating "
 		    "object[%llu]: old bonus %llu, old blocksz %u",
 		    (u_longlong_t)dmu_tx_get_txg(tx), spa_name(spa), sm,
 		    (u_longlong_t)sm->sm_object,
 		    (u_longlong_t)doi.doi_bonus_size,
 		    doi.doi_data_block_size);
 
 		/*
 		 * Free the old object and drop the hold on its bonus buffer
 		 * before a new object is allocated and opened in its place.
 		 */
 		space_map_free(sm, tx);
 		dmu_buf_rele(sm->sm_dbuf, sm);
 
 		sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx);
 		VERIFY0(space_map_open_impl(sm));
 	} else {
 		/* free all of the object's data, from offset 0 to the end */
 		VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));
 
 		/*
 		 * If the spacemap is reallocated, its histogram
 		 * will be reset.  Do the same in the common case so that
 		 * bugs related to the uncommon case do not go unnoticed.
 		 *
 		 * NOTE(review): this memset is not guarded by a bonus size
 		 * check the way space_map_histogram_clear() is - confirm
 		 * the bonus buffer always has room for smp_histogram here.
 		 */
 		memset(sm->sm_phys->smp_histogram, 0,
 		    sizeof (sm->sm_phys->smp_histogram));
 	}
 
 	dmu_buf_will_dirty(sm->sm_dbuf, tx);
 	sm->sm_phys->smp_length = 0;
 	sm->sm_phys->smp_alloc = 0;
 }
 
 /*
  * Allocate a new space map object in 'os' with the given data block size
  * and return its object number. With SPA_FEATURE_SPACEMAP_HISTOGRAM
  * enabled the feature refcount is incremented and the bonus buffer is
  * sized as a full space_map_phys_t; otherwise SPACE_MAP_SIZE_V0 is used.
  */
 uint64_t
 space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_objset_spa(os);
 	int bonuslen = SPACE_MAP_SIZE_V0;
 
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
 		spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
 		bonuslen = sizeof (space_map_phys_t);
 		ASSERT3U(bonuslen, <=, dmu_bonus_max());
 	}
 
 	return (dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize,
 	    space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx));
 }
 
 /*
  * Free space map object 'smobj' in 'os'. If the histogram feature is
  * enabled and the object's bonus size differs from SPACE_MAP_SIZE_V0,
  * the feature refcount is decremented first.
  */
 void
 space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_objset_spa(os);
 	dmu_object_info_t doi;
 
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
 		VERIFY0(dmu_object_info(os, smobj, &doi));
 
 		/* only objects with the larger bonus hold a feature ref */
 		if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) {
 			spa_feature_decr(spa,
 			    SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
 		}
 	}
 
 	VERIFY0(dmu_object_free(os, smobj, tx));
 }
 
 /*
  * Free the object backing 'sm' and reset sm_object to 0. The in-core
  * space map itself is not freed. A NULL space map is ignored.
  */
 void
 space_map_free(space_map_t *sm, dmu_tx_t *tx)
 {
 	if (sm != NULL) {
 		space_map_free_obj(sm->sm_os, space_map_object(sm), tx);
 		sm->sm_object = 0;
 	}
 }
 
 /*
  * Given a range tree, make a worst-case estimate of how much space
  * (in bytes) the tree's segments would take if they were written to
  * the given space map.
  */
 uint64_t
 space_map_estimate_optimal_size(space_map_t *sm, zfs_range_tree_t *rt,
     uint64_t vdev_id)
 {
 	spa_t *spa = dmu_objset_spa(sm->sm_os);
 	uint64_t shift = sm->sm_shift;
 	uint64_t *histogram = rt->rt_histogram;
 	uint64_t entries_for_seg = 0;
 
 	/*
 	 * In order to get a quick estimate of the optimal size that this
 	 * range tree would have on-disk as a space map, we iterate through
 	 * its histogram buckets instead of iterating through its nodes.
 	 *
 	 * Note that this is a highest-bound/worst-case estimate for the
 	 * following reasons:
 	 *
 	 * 1] We assume that we always add a debug padding for each block
 	 *    we write and we also assume that we start at the last word
 	 *    of a block attempting to write a two-word entry.
 	 * 2] Rounding up errors due to the way segments are distributed
 	 *    in the buckets of the range tree's histogram.
 	 * 3] The activation of zfs_force_some_double_word_sm_entries
 	 *    (tunable) when testing.
 	 *
 	 * = Math and Rounding Errors =
 	 *
 	 * rt_histogram[i] bucket of a range tree represents the number
 	 * of entries in [2^i, (2^(i+1))-1] of that range_tree. Given
 	 * that, we want to divide the buckets into groups: Buckets that
 	 * can be represented using a single-word entry, ones that can
 	 * be represented with a double-word entry, and ones that can
 	 * only be represented with multiple two-word entries.
 	 *
 	 * [Note that if the new encoding feature is not enabled there
 	 * are only two groups: single-word entry buckets and multiple
 	 * single-word entry buckets. The information below assumes
 	 * two-word entries enabled, but it can easily be applied when
 	 * the feature is not enabled]
 	 *
 	 * To find the highest bucket that can be represented with a
 	 * single-word entry we look at the maximum run that such entry
 	 * can have, which is 2^(SM_RUN_BITS + sm_shift) [remember that
 	 * the run of a space map entry is shifted by sm_shift, thus we
 	 * add it to the exponent]. This way, excluding the value of the
 	 * maximum run that can be represented by a single-word entry,
 	 * all runs that are smaller exist in buckets 0 to
 	 * SM_RUN_BITS + shift - 1.
 	 *
 	 * To find the highest bucket that can be represented with a
 	 * double-word entry, we follow the same approach. Finally, any
 	 * bucket higher than that are represented with multiple two-word
 	 * entries. To be more specific, if the highest bucket whose
 	 * segments can be represented with a single two-word entry is X,
 	 * then bucket X+1 will need 2 two-word entries for each of its
 	 * segments, X+2 will need 4, X+3 will need 8, ...etc.
 	 *
 	 * With all of the above we make our estimation based on bucket
 	 * groups. There is a rounding error though. As we mentioned in
 	 * the example with the one-word entry, the maximum run that can
 	 * be represented in a one-word entry 2^(SM_RUN_BITS + shift) is
 	 * not part of bucket SM_RUN_BITS + shift - 1. Thus, segments of
 	 * that length fall into the next bucket (and bucket group) where
 	 * we start counting two-word entries and this is one more reason
 	 * why the estimated size may end up being bigger than the actual
 	 * size written.
 	 */
 	uint64_t size = 0;
 	/* next histogram bucket to account for; shared by all loops below */
 	uint64_t idx = 0;
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) ||
 	    (vdev_id == SM_NO_VDEVID && sm->sm_size < SM_OFFSET_MAX)) {
 
 		/*
 		 * If we are trying to force some double word entries just
 		 * assume the worst-case of every single word entry being
 		 * written as a double word entry.
 		 */
 		uint64_t entry_size =
 		    (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) &&
 		    zfs_force_some_double_word_sm_entries) ?
 		    (2 * sizeof (uint64_t)) : sizeof (uint64_t);
 
 		uint64_t single_entry_max_bucket = SM_RUN_BITS + shift - 1;
 		for (; idx <= single_entry_max_bucket; idx++)
 			size += histogram[idx] * entry_size;
 
 		/*
 		 * Without the V2 feature the remaining buckets are counted
 		 * as multiple single-word entries. Note that this path
 		 * returns before the per-block padding is added at the end
 		 * of the function.
 		 */
 		if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)) {
-			for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) {
+			for (; idx < ZFS_RANGE_TREE_HISTOGRAM_SIZE; idx++) {
 				ASSERT3U(idx, >=, single_entry_max_bucket);
 				entries_for_seg =
 				    1ULL << (idx - single_entry_max_bucket);
 				size += histogram[idx] *
 				    entries_for_seg * entry_size;
 			}
 			return (size);
 		}
 	}
 
 	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2));
 
 	/* buckets that fit in one two-word entry per segment */
 	uint64_t double_entry_max_bucket = SM2_RUN_BITS + shift - 1;
 	for (; idx <= double_entry_max_bucket; idx++)
 		size += histogram[idx] * 2 * sizeof (uint64_t);
 
 	/* larger buckets need a power-of-two number of two-word entries */
-	for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) {
+	for (; idx < ZFS_RANGE_TREE_HISTOGRAM_SIZE; idx++) {
 		ASSERT3U(idx, >=, double_entry_max_bucket);
 		entries_for_seg = 1ULL << (idx - double_entry_max_bucket);
 		size += histogram[idx] *
 		    entries_for_seg * 2 * sizeof (uint64_t);
 	}
 
 	/*
 	 * Assume the worst case where we start with the padding at the end
 	 * of the current block and we add an extra padding entry at the end
 	 * of all subsequent blocks.
 	 */
 	size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t);
 
 	return (size);
 }
 
 /*
  * Object number backing the space map, or 0 for a NULL space map.
  */
 uint64_t
 space_map_object(space_map_t *sm)
 {
 	if (sm == NULL)
 		return (0);
 
 	return (sm->sm_object);
 }
 
 /*
  * Current value of smp_alloc, or 0 for a NULL space map.
  */
 int64_t
 space_map_allocated(space_map_t *sm)
 {
 	if (sm == NULL)
 		return (0);
 
 	return (sm->sm_phys->smp_alloc);
 }
 
 /*
  * Current value of smp_length, or 0 for a NULL space map.
  */
 uint64_t
 space_map_length(space_map_t *sm)
 {
 	if (sm == NULL)
 		return (0);
 
 	return (sm->sm_phys->smp_length);
 }
 
 /*
  * Number of sm_blksz-sized blocks needed to hold the space map's current
  * length, rounded up; 0 for a NULL space map.
  */
 uint64_t
 space_map_nblocks(space_map_t *sm)
 {
 	return (sm == NULL ? 0 :
 	    DIV_ROUND_UP(space_map_length(sm), sm->sm_blksz));
 }
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 40fd75b83639..74e36c0300f0 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -1,6588 +1,6588 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2021 by Delphix. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2016 Toomas Soome <tsoome@me.com>
  * Copyright 2017 Joyent, Inc.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2019, Datto Inc. All rights reserved.
  * Copyright (c) 2021, Klara Inc.
  * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/bpobj.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_dir.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_rebuild.h>
 #include <sys/vdev_draid.h>
 #include <sys/uberblock_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/space_map.h>
 #include <sys/space_reftree.h>
 #include <sys/zio.h>
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 #include <sys/arc.h>
 #include <sys/zil.h>
 #include <sys/dsl_scan.h>
 #include <sys/vdev_raidz.h>
 #include <sys/abd.h>
 #include <sys/vdev_initialize.h>
 #include <sys/vdev_trim.h>
 #include <sys/vdev_raidz.h>
 #include <sys/zvol.h>
 #include <sys/zfs_ratelimit.h>
 #include "zfs_prop.h"
 
 /*
  * One metaslab from each (normal-class) vdev is used by the ZIL.  These are
  * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are
  * part of the spa_embedded_log_class.  The metaslab with the most free space
  * in each vdev is selected for this purpose when the pool is opened (or a
  * vdev is added).  See vdev_metaslab_init().
  *
  * Log blocks can be allocated from the following locations.  Each one is tried
  * in order until the allocation succeeds:
  * 1. dedicated log vdevs, aka "slog" (spa_log_class)
  * 2. embedded slog metaslabs (spa_embedded_log_class)
  * 3. other metaslabs in normal vdevs (spa_normal_class)
  *
  * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer
  * than this number of metaslabs in the vdev.  This ensures that we don't set
  * aside an unreasonable amount of space for the ZIL.  If set to less than
  * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced
  * (by more than 1<<spa_slop_shift) due to the embedded slog metaslab.
  */
 static uint_t zfs_embedded_slog_min_ms = 64;
 
 /* default target for number of metaslabs per top-level vdev */
 static uint_t zfs_vdev_default_ms_count = 200;
 
 /* minimum number of metaslabs per top-level vdev */
 static uint_t zfs_vdev_min_ms_count = 16;
 
 /* practical upper limit of total metaslabs per top-level vdev */
 static uint_t zfs_vdev_ms_count_limit = 1ULL << 17;
 
 /* lower limit for metaslab size (512M) */
 static uint_t zfs_vdev_default_ms_shift = 29;
 
 /* upper limit for metaslab size (16G) */
 static uint_t zfs_vdev_max_ms_shift = 34;
 
 /*
  * NOTE(review): presumably makes vdev validation be skipped when set to
  * a non-zero value - confirm against the code that reads it.
  */
 int vdev_validate_skip = B_FALSE;
 
 /*
  * Since the DTL space map of a vdev is not expected to have a lot of
  * entries, we default its block size to 4K.
  */
 int zfs_vdev_dtl_sm_blksz = (1 << 12);
 
 /*
  * Rate limit slow IO (delay) events to this many per second.
  */
 static unsigned int zfs_slow_io_events_per_second = 20;
 
 /*
  * Rate limit deadman "hung IO" events to this many per second.
  */
 static unsigned int zfs_deadman_events_per_second = 1;
 
 /*
  * Rate limit direct write IO verify failures to this many per second.
  */
 static unsigned int zfs_dio_write_verify_events_per_second = 20;
 
 /*
  * Rate limit checksum events after this many checksum errors per second.
  */
 static unsigned int zfs_checksum_events_per_second = 20;
 
 /*
  * Ignore errors during scrub/resilver.  Allows to work around resilver
  * upon import when there are pool errors.
  */
 static int zfs_scan_ignore_errors = 0;
 
 /*
  * vdev-wide space maps that have lots of entries written to them at
  * the end of each transaction can benefit from a higher I/O bandwidth
  * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
  */
 int zfs_vdev_standard_sm_blksz = (1 << 17);
 
 /*
  * Tunable parameter for debugging or performance analysis. Setting this
  * will cause pool corruption on power loss if a volatile out-of-order
  * write cache is enabled.
  */
 int zfs_nocacheflush = 0;
 
 /*
  * Maximum and minimum ashift values that can be automatically set based on
  * vdev's physical ashift (disk's physical sector size).  While ASHIFT_MAX
  * is higher than the maximum value, it is intentionally limited here to not
  * excessively impact pool space efficiency.  Higher ashift values may still
  * be forced by vdev logical ashift or by user via ashift property, but won't
  * be set automatically as a performance optimization.
  */
 uint_t zfs_vdev_max_auto_ashift = 14;
 uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
 
 /*
  * VDEV checksum verification for Direct I/O writes. This is necessary for
  * Linux, because anonymous pages can not be placed under write protection
  * during Direct I/O writes.
  */
 #if !defined(__FreeBSD__)
 uint_t zfs_vdev_direct_write_verify = 1;
 #else
 uint_t zfs_vdev_direct_write_verify = 0;
 #endif
 
 /*
  * Format a printf-style message (truncated to a 256-byte buffer) and log
  * it through zfs_dbgmsg(), prefixed with the vdev's type and either its
  * path or, when it has no path, its id and guid.
  */
 void
 vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
 {
 	char buf[256];
 	va_list adx;
 
 	va_start(adx, fmt);
 	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
 	va_end(adx);
 
 	/* no path: identify the vdev by id and guid instead */
 	if (vd->vdev_path == NULL) {
 		zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
 		    vd->vdev_ops->vdev_op_type,
 		    (u_longlong_t)vd->vdev_id,
 		    (u_longlong_t)vd->vdev_guid, buf);
 		return;
 	}
 
 	zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
 	    vd->vdev_path, buf);
 }
 
 /*
  * Log a one-line description of 'vd' through zfs_dbgmsg(), indented by
  * 'indent' columns, then recurse into each child with the indentation
  * increased by two.
  *
  * Hole and missing vdevs are logged with only their id and type and are
  * not recursed into.
  */
 void
 vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
 {
 	char state[20];
 
 	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
 		zfs_dbgmsg("%*svdev %llu: %s", indent, "",
 		    (u_longlong_t)vd->vdev_id,
 		    vd->vdev_ops->vdev_op_type);
 		return;
 	}
 
 	/* translate the vdev state into a short human-readable string */
 	switch (vd->vdev_state) {
 	case VDEV_STATE_UNKNOWN:
 		(void) snprintf(state, sizeof (state), "unknown");
 		break;
 	case VDEV_STATE_CLOSED:
 		(void) snprintf(state, sizeof (state), "closed");
 		break;
 	case VDEV_STATE_OFFLINE:
 		(void) snprintf(state, sizeof (state), "offline");
 		break;
 	case VDEV_STATE_REMOVED:
 		(void) snprintf(state, sizeof (state), "removed");
 		break;
 	case VDEV_STATE_CANT_OPEN:
 		(void) snprintf(state, sizeof (state), "can't open");
 		break;
 	case VDEV_STATE_FAULTED:
 		(void) snprintf(state, sizeof (state), "faulted");
 		break;
 	case VDEV_STATE_DEGRADED:
 		(void) snprintf(state, sizeof (state), "degraded");
 		break;
 	case VDEV_STATE_HEALTHY:
 		(void) snprintf(state, sizeof (state), "healthy");
 		break;
 	default:
 		(void) snprintf(state, sizeof (state), "<state %u>",
 		    (uint_t)vd->vdev_state);
 		break;
 	}
 
 	/*
 	 * The vdev id is printed at full width, the same way as in the
 	 * hole/missing branch above; narrowing it to int for a %u
 	 * conversion would truncate large ids.
 	 */
 	zfs_dbgmsg("%*svdev %llu: %s%s, guid: %llu, path: %s, %s", indent,
 	    "", (u_longlong_t)vd->vdev_id, vd->vdev_ops->vdev_op_type,
 	    vd->vdev_islog ? " (log)" : "",
 	    (u_longlong_t)vd->vdev_guid,
 	    vd->vdev_path ? vd->vdev_path : "N/A", state);
 
 	for (uint64_t i = 0; i < vd->vdev_children; i++)
 		vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
 }
 
 /*
  * Virtual device management.
  */
 
 /*
  * Table of the known vdev ops vectors. vdev_getops() scans it in order
  * and relies on the trailing NULL entry to find the end.
  */
 static vdev_ops_t *const vdev_ops_table[] = {
 	&vdev_root_ops,
 	&vdev_raidz_ops,
 	&vdev_draid_ops,
 	&vdev_draid_spare_ops,
 	&vdev_mirror_ops,
 	&vdev_replacing_ops,
 	&vdev_spare_ops,
 	&vdev_disk_ops,
 	&vdev_file_ops,
 	&vdev_missing_ops,
 	&vdev_hole_ops,
 	&vdev_indirect_ops,
 	NULL
 };
 
 /*
  * Look up the ops vector whose vdev_op_type equals 'type'. Returns NULL
  * when no entry of vdev_ops_table matches.
  */
 static vdev_ops_t *
 vdev_getops(const char *type)
 {
 	for (vdev_ops_t *const *opspp = vdev_ops_table; *opspp != NULL;
 	    opspp++) {
 		vdev_ops_t *ops = *opspp;
 
 		if (strcmp(ops->vdev_op_type, type) == 0)
 			return (ops);
 	}
 
 	return (NULL);
 }
 
 /*
  * Pick the metaslab group of 'vd' that corresponds to metaslab class
  * 'mc'. A vdev may belong to two different metaslab classes: when 'mc'
  * is the pool's embedded log class and the vdev has an embedded log
  * group (vdev_log_mg is non-NULL), that group is returned. In every
  * other case, including dedicated slog devices, the primary group
  * vdev_mg is returned.
  */
 metaslab_group_t *
 vdev_get_mg(vdev_t *vd, metaslab_class_t *mc)
 {
 	metaslab_group_t *mg = vd->vdev_mg;
 
 	if (mc == spa_embedded_log_class(vd->vdev_spa) &&
 	    vd->vdev_log_mg != NULL) {
 		mg = vd->vdev_log_mg;
 	}
 
 	return (mg);
 }
 
 /*
  * Default translation of a logical range to a physical one: the start
  * and end of logical_rs are copied unchanged into physical_rs. 'vd' and
  * 'remain_rs' are not used.
  */
 void
-vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
-    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
+vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
+    zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs)
 {
 	/* explicitly mark the unused parameters */
 	(void) vd, (void) remain_rs;
 
 	physical_rs->rs_start = logical_rs->rs_start;
 	physical_rs->rs_end = logical_rs->rs_end;
 }
 
 /*
  * Map an allocation bias string to its enumerated value. The string
  * comes either from the per-vdev zap or from zpool(8). Anything that is
  * not one of the known bias names yields VDEV_BIAS_NONE.
  */
 static vdev_alloc_bias_t
 vdev_derive_alloc_bias(const char *bias)
 {
 	if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
 		return (VDEV_BIAS_LOG);
 
 	if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
 		return (VDEV_BIAS_SPECIAL);
 
 	if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
 		return (VDEV_BIAS_DEDUP);
 
 	return (VDEV_BIAS_NONE);
 }
 
 /*
  * Default asize function, used by anything other than RAID-Z: start
  * from psize rounded up to the top-level vdev's ashift, then raise the
  * result to the largest asize reported for any child.
  */
 uint64_t
 vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
 {
 	uint64_t largest = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
 
 	for (int i = 0; i < vd->vdev_children; i++) {
 		uint64_t child_asize =
 		    vdev_psize_to_asize_txg(vd->vdev_child[i], psize, txg);
 
 		if (child_asize > largest)
 			largest = child_asize;
 	}
 
 	return (largest);
 }
 
 /*
  * Default min_asize op: report the value stored in vd->vdev_min_asize.
  */
 uint64_t
 vdev_default_min_asize(vdev_t *vd)
 {
 	uint64_t min_asize = vd->vdev_min_asize;
 
 	return (min_asize);
 }
 
 /*
  * Get the minimum allocatable size. We define the allocatable size as
  * the vdev's asize rounded to the nearest metaslab. This allows us to
  * replace or attach devices which don't have the same physical size but
  * can still satisfy the same number of allocations.
  */
 uint64_t
 vdev_get_min_asize(vdev_t *vd)
 {
 	vdev_t *parent = vd->vdev_parent;
 
 	/* No parent (inactive spare or cache): our own asize is the answer. */
 	if (parent == NULL)
 		return (vd->vdev_asize);
 
 	/* Anything below the top level defers to its parent's min_asize op. */
 	if (vd != vd->vdev_top)
 		return (parent->vdev_ops->vdev_op_min_asize(parent));
 
 	/* Top-level vdev: asize aligned to the metaslab size. */
 	return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift,
 	    uint64_t));
 }
 
 /*
  * Recompute and store vdev_min_asize for 'vd', then do the same for
  * every vdev beneath it.
  */
 void
 vdev_set_min_asize(vdev_t *vd)
 {
 	int i = 0;
 
 	vd->vdev_min_asize = vdev_get_min_asize(vd);
 
 	while (i < vd->vdev_children) {
 		vdev_set_min_asize(vd->vdev_child[i]);
 		i++;
 	}
 }
 
 /*
  * Get the minimal allocation size for the top-level vdev: the result of
  * the ops vector's min_alloc callback when one is provided, otherwise
  * 1 << vdev_ashift.
  */
 uint64_t
 vdev_get_min_alloc(vdev_t *vd)
 {
 	if (vd->vdev_ops->vdev_op_min_alloc == NULL)
 		return (1ULL << vd->vdev_ashift);
 
 	return (vd->vdev_ops->vdev_op_min_alloc(vd));
 }
 
 /*
  * Get the parity level for a top-level vdev: the result of the ops
  * vector's nparity callback, or 0 when the vdev type provides none.
  */
 uint64_t
 vdev_get_nparity(vdev_t *vd)
 {
 	if (vd->vdev_ops->vdev_op_nparity == NULL)
 		return (0);
 
 	return (vd->vdev_ops->vdev_op_nparity(vd));
 }
 
 /*
  * Read numeric vdev property 'prop' from the vdev's zap in the MOS.
  * The zap object used is the first non-zero of vdev_root_zap,
  * vdev_top_zap and vdev_leaf_zap; when all are zero EINVAL is returned
  * and *value is left untouched.  If the lookup fails with ENOENT, *value
  * is set to the property's default and ENOENT is still returned.
  */
 static int
 vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value)
 {
 	objset_t *mos = vd->vdev_spa->spa_meta_objset;
 	uint64_t zapobj;
 	int rc;
 
 	if (vd->vdev_root_zap != 0)
 		zapobj = vd->vdev_root_zap;
 	else if (vd->vdev_top_zap != 0)
 		zapobj = vd->vdev_top_zap;
 	else if (vd->vdev_leaf_zap != 0)
 		zapobj = vd->vdev_leaf_zap;
 	else
 		return (EINVAL);
 
 	rc = zap_lookup(mos, zapobj, vdev_prop_to_name(prop),
 	    sizeof (uint64_t), 1, value);
 	if (rc == ENOENT)
 		*value = vdev_prop_default_numeric(prop);
 
 	return (rc);
 }
 
 /*
  * Get the number of data disks for a top-level vdev: the result of the
  * ops vector's ndisks callback, or 1 when the vdev type provides none.
  */
 uint64_t
 vdev_get_ndisks(vdev_t *vd)
 {
 	if (vd->vdev_ops->vdev_op_ndisks == NULL)
 		return (1);
 
 	return (vd->vdev_ops->vdev_op_ndisks(vd));
 }
 
 /*
  * Return the child of the pool's root vdev at index 'vdev', or NULL
  * when the index is not below the root's child count.  Asserts that
  * some SCL_ALL config lock is held and that the selected slot is
  * populated.
  */
 vdev_t *
 vdev_lookup_top(spa_t *spa, uint64_t vdev)
 {
 	vdev_t *root = spa->spa_root_vdev;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	if (vdev >= root->vdev_children)
 		return (NULL);
 
 	ASSERT(root->vdev_child[vdev] != NULL);
 	return (root->vdev_child[vdev]);
 }
 
 /*
  * Depth-first search of the tree rooted at 'vd' for the vdev whose
  * vdev_guid equals 'guid'.  Returns the first match, or NULL if the
  * subtree contains none.
  */
 vdev_t *
 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
 {
 	if (vd->vdev_guid == guid)
 		return (vd);
 
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_t *found = vdev_lookup_by_guid(vd->vdev_child[i], guid);
 
 		if (found != NULL)
 			return (found);
 	}
 
 	return (NULL);
 }
 
 /*
  * Count the leaf vdevs in the tree rooted at 'vd'.  A vdev whose ops
  * are flagged vdev_op_leaf counts as one; any other vdev contributes the
  * sum over its children.
  */
 static int
 vdev_count_leaves_impl(vdev_t *vd)
 {
 	int leaves = 0;
 	int i;
 
 	if (vd->vdev_ops->vdev_op_leaf)
 		return (1);
 
 	for (i = 0; i < vd->vdev_children; i++) {
 		leaves += vdev_count_leaves_impl(vd->vdev_child[i]);
 	}
 
 	return (leaves);
 }
 
 /*
  * Count the leaf vdevs under the pool's root vdev while holding the
  * SCL_VDEV config lock as reader.
  */
 int
 vdev_count_leaves(spa_t *spa)
 {
 	int leaves;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	leaves = vdev_count_leaves_impl(spa->spa_root_vdev);
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	return (leaves);
 }
 
 /*
  * Attach 'cvd' to parent 'pvd' in child slot cvd->vdev_id.
  *
  * The caller must hold SCL_ALL as writer and 'cvd' must not have a
  * parent yet.  With a NULL 'pvd' only cvd->vdev_parent is (re)set and the
  * function returns.  Otherwise the parent's child array is reallocated
  * to cover the slot, cvd->vdev_top is taken from the parent (or set to
  * 'cvd' itself when the parent has none), cvd's guid sum is added to
  * every ancestor, and a leaf vdev is put on the pool's leaf list.
  */
 void
 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
 {
 	size_t oldsize, newsize;
 	uint64_t id = cvd->vdev_id;
 	vdev_t **newchild;
 
 	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 	ASSERT(cvd->vdev_parent == NULL);
 
 	cvd->vdev_parent = pvd;
 
 	if (pvd == NULL)
 		return;
 
 	/* The target slot must be either new or an existing hole. */
 	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
 
 	oldsize = pvd->vdev_children * sizeof (vdev_t *);
 	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
 	newsize = pvd->vdev_children * sizeof (vdev_t *);
 
 	/*
 	 * Allocate zeroed memory: when 'id' lies beyond the old child
 	 * count, the slots between the old end and 'id' are not covered by
 	 * the copy below and must read as NULL, because the child array is
 	 * scanned for NULL holes elsewhere.
 	 */
 	newchild = kmem_zalloc(newsize, KM_SLEEP);
 	if (pvd->vdev_child != NULL) {
 		memcpy(newchild, pvd->vdev_child, oldsize);
 		kmem_free(pvd->vdev_child, oldsize);
 	}
 
 	pvd->vdev_child = newchild;
 	pvd->vdev_child[id] = cvd;
 
 	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
 	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
 
 	/*
 	 * Walk up all ancestors to update guid sum.
 	 */
 	for (; pvd != NULL; pvd = pvd->vdev_parent)
 		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
 
 	if (cvd->vdev_ops->vdev_op_leaf) {
 		list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd);
 		cvd->vdev_spa->spa_leaf_list_gen++;
 	}
 }
 
 /*
  * Detach 'cvd' from parent 'pvd'.
  *
  * Asserts that 'pvd' really is cvd's parent; a NULL 'pvd' is a no-op.
  * The child's slot is set to NULL (leaving a hole) and the child array
  * is freed only once every slot is NULL.  A leaf vdev is taken off the
  * pool's leaf list, and cvd's guid sum is subtracted from every ancestor.
  */
 void
 vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
 {
 	int c;
 	uint_t id = cvd->vdev_id;
 
 	ASSERT(cvd->vdev_parent == pvd);
 
 	if (pvd == NULL)
 		return;
 
 	ASSERT(id < pvd->vdev_children);
 	ASSERT(pvd->vdev_child[id] == cvd);
 
 	pvd->vdev_child[id] = NULL;
 	cvd->vdev_parent = NULL;
 
 	/* Find the first remaining child, if any. */
 	for (c = 0; c < pvd->vdev_children; c++)
 		if (pvd->vdev_child[c])
 			break;
 
 	/* No children left: release the array itself. */
 	if (c == pvd->vdev_children) {
 		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
 		pvd->vdev_child = NULL;
 		pvd->vdev_children = 0;
 	}
 
 	if (cvd->vdev_ops->vdev_op_leaf) {
 		spa_t *spa = cvd->vdev_spa;
 		list_remove(&spa->spa_leaf_list, cvd);
 		spa->spa_leaf_list_gen++;
 	}
 
 	/*
 	 * Walk up all ancestors to update guid sum.
 	 */
 	for (; pvd != NULL; pvd = pvd->vdev_parent)
 		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
 }
 
 /*
  * Remove any holes in the child array.
  *
  * The surviving children are copied, in order, into a freshly allocated
  * array sized for exactly that many entries, and each child's vdev_id is
  * rewritten to its new index.  If no child survives, the array pointer
  * becomes NULL.  The caller must hold SCL_ALL as writer.
  */
 void
 vdev_compact_children(vdev_t *pvd)
 {
 	int oldc = pvd->vdev_children;
 	int live = 0;
 	vdev_t **compacted = NULL;
 
 	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	if (oldc == 0)
 		return;
 
 	/* First pass: count the populated slots. */
 	for (int i = 0; i < oldc; i++) {
 		if (pvd->vdev_child[i] != NULL)
 			live++;
 	}
 
 	/* Second pass: pack them into the new array and renumber. */
 	if (live > 0) {
 		int next = 0;
 
 		compacted = kmem_zalloc(live * sizeof (vdev_t *), KM_SLEEP);
 
 		for (int i = 0; i < oldc; i++) {
 			vdev_t *cvd = pvd->vdev_child[i];
 
 			if (cvd == NULL)
 				continue;
 
 			compacted[next] = cvd;
 			cvd->vdev_id = next;
 			next++;
 		}
 	}
 
 	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
 	pvd->vdev_child = compacted;
 	pvd->vdev_children = live;
 }
 
 /*
  * Allocate and minimally initialize a vdev_t.
  *
  * spa:  pool the vdev belongs to.  If the pool has no root vdev yet, the
  *       new vdev becomes the root (ops must then be vdev_root_ops) and a
  *       load guid is generated for the pool.
  * id:   value stored in vdev_id.
  * guid: vdev guid; 0 requests a freshly generated guid, except for hole
  *       vdevs, which keep 0.
  * ops:  ops vector for the new vdev.
  *
  * Returns the new vdev in state VDEV_STATE_CLOSED with its locks,
  * condition variables, rate limiters, DTL range trees, txg lists and I/O
  * queue initialized.  The vdev is not linked to any parent here.
  */
 vdev_t *
 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 {
 	vdev_t *vd;
 	vdev_indirect_config_t *vic;
 
 	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
 	vic = &vd->vdev_indirect_config;
 
 	if (spa->spa_root_vdev == NULL) {
 		ASSERT(ops == &vdev_root_ops);
 		spa->spa_root_vdev = vd;
 		spa->spa_load_guid = spa_generate_load_guid();
 	}
 
 	if (guid == 0 && ops != &vdev_hole_ops) {
 		if (spa->spa_root_vdev == vd) {
 			/*
 			 * The root vdev's guid will also be the pool guid,
 			 * which must be unique among all pools.
 			 */
 			guid = spa_generate_guid(NULL);
 		} else {
 			/*
 			 * Any other vdev's guid must be unique within the pool.
 			 */
 			guid = spa_generate_guid(spa);
 		}
 		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
 	}
 
 	vd->vdev_spa = spa;
 	vd->vdev_id = id;
 	vd->vdev_guid = guid;
 	/* With no children yet, the guid sum is just our own guid. */
 	vd->vdev_guid_sum = guid;
 	vd->vdev_ops = ops;
 	vd->vdev_state = VDEV_STATE_CLOSED;
 	vd->vdev_ishole = (ops == &vdev_hole_ops);
 	/* UINT64_MAX marks "no previous indirect vdev". */
 	vic->vic_prev_indirect_vdev = UINT64_MAX;
 
 	rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
 	vd->vdev_obsolete_segments = zfs_range_tree_create(NULL,
 	    ZFS_RANGE_SEG64, NULL, 0, 0);
 
 	/*
 	 * Initialize rate limit structs for events.  We rate limit ZIO delay
 	 * and checksum events so that we don't overwhelm ZED with thousands
 	 * of events when a disk is acting up.
 	 */
 	zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
 	    1);
 	zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second,
 	    1);
 	zfs_ratelimit_init(&vd->vdev_dio_verify_rl,
 	    &zfs_dio_write_verify_events_per_second, 1);
 	zfs_ratelimit_init(&vd->vdev_checksum_rl,
 	    &zfs_checksum_events_per_second, 1);
 
 	/*
 	 * Default Thresholds for tuning ZED
 	 */
 	vd->vdev_checksum_n = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N);
 	vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T);
 	vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N);
 	vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T);
 	vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N);
 	vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T);
 
 	list_link_init(&vd->vdev_config_dirty_node);
 	list_link_init(&vd->vdev_state_dirty_node);
 	list_link_init(&vd->vdev_initialize_node);
 	list_link_init(&vd->vdev_leaf_node);
 	list_link_init(&vd->vdev_trim_node);
 
 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
 	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
 
 	mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
 
 	mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
 
 	/* One range tree per DTL type. */
 	for (int t = 0; t < DTL_TYPES; t++) {
 		vd->vdev_dtl[t] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
 		    NULL, 0, 0);
 	}
 
 	txg_list_create(&vd->vdev_ms_list, spa,
 	    offsetof(struct metaslab, ms_txg_node));
 	txg_list_create(&vd->vdev_dtl_list, spa,
 	    offsetof(struct vdev, vdev_dtl_node));
 	vd->vdev_stat.vs_timestamp = gethrtime();
 	vdev_queue_init(vd);
 
 	return (vd);
 }
 
 /*
  * Allocate a new vdev.  The 'alloctype' is used to control whether we are
  * creating a new vdev or loading an existing one - the behavior is slightly
  * different for each case.
  *
  * spa:       pool the vdev belongs to; SCL_ALL must be held as writer.
  * vdp:       on success, receives the new vdev.
  * nv:        config nvlist describing the vdev.
  * parent:    vdev to attach the new vdev under (passed to
  *            vdev_add_child()); a parent that itself has no parent makes
  *            the new vdev top-level.
  * id:        child index; for VDEV_ALLOC_LOAD it must match the
  *            ZPOOL_CONFIG_ID stored in 'nv'.
  * alloctype: one of the VDEV_ALLOC_* values.
  *
  * Returns 0 on success.  Returns EINVAL when the type is missing or
  * unknown, when a required id/guid is missing or mismatched, or when a
  * non-root vdev is allocated before the root exists; ENOTSUP when the
  * pool version or enabled features do not allow the requested vdev; or
  * whatever the type-specific vdev_op_init() callback returns.  All
  * failure returns happen before the vdev_t is allocated.
  */
 int
 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
     int alloctype)
 {
 	vdev_ops_t *ops;
 	const char *type;
 	uint64_t guid = 0, islog;
 	vdev_t *vd;
 	vdev_indirect_config_t *vic;
 	const char *tmp = NULL;
 	int rc;
 	vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
 	/* Top-level means our parent is the root (a vdev with no parent). */
 	boolean_t top_level = (parent && !parent->vdev_parent);
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
 		return (SET_ERROR(EINVAL));
 
 	if ((ops = vdev_getops(type)) == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * If this is a load, get the vdev guid from the nvlist.
 	 * Otherwise, vdev_alloc_common() will generate one for us.
 	 */
 	if (alloctype == VDEV_ALLOC_LOAD) {
 		uint64_t label_id;
 
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
 		    label_id != id)
 			return (SET_ERROR(EINVAL));
 
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_SPARE) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * The first allocated vdev must be of type 'root'.
 	 */
 	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Determine whether we're a log vdev.
 	 */
 	islog = 0;
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
 	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
 		return (SET_ERROR(ENOTSUP));
 
 	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
 		return (SET_ERROR(ENOTSUP));
 
 	if (top_level && alloctype == VDEV_ALLOC_ADD) {
 		const char *bias;
 
 		/*
 		 * If creating a top-level vdev, check for allocation
 		 * classes input.
 		 */
 		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
 		    &bias) == 0) {
 			alloc_bias = vdev_derive_alloc_bias(bias);
 
 			/* spa_vdev_add() expects feature to be enabled */
 			if (spa->spa_load_state != SPA_LOAD_CREATE &&
 			    !spa_feature_is_enabled(spa,
 			    SPA_FEATURE_ALLOCATION_CLASSES)) {
 				return (SET_ERROR(ENOTSUP));
 			}
 		}
 
 		/* spa_vdev_add() expects feature to be enabled */
 		if (ops == &vdev_draid_ops &&
 		    spa->spa_load_state != SPA_LOAD_CREATE &&
 		    !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
 			return (SET_ERROR(ENOTSUP));
 		}
 	}
 
 	/*
 	 * Initialize the vdev specific data.  This is done before calling
 	 * vdev_alloc_common() since it may fail and this simplifies the
 	 * error reporting and cleanup code paths.
 	 */
 	void *tsd = NULL;
 	if (ops->vdev_op_init != NULL) {
 		rc = ops->vdev_op_init(spa, nv, &tsd);
 		if (rc != 0) {
 			return (rc);
 		}
 	}
 
 	/* Nothing below this point returns an error. */
 	vd = vdev_alloc_common(spa, id, guid, ops);
 	vd->vdev_tsd = tsd;
 	vd->vdev_islog = islog;
 
 	if (top_level && alloc_bias != VDEV_BIAS_NONE)
 		vd->vdev_alloc_bias = alloc_bias;
 
 	/* The vdev keeps its own copies of the strings found in 'nv'. */
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tmp) == 0)
 		vd->vdev_path = spa_strdup(tmp);
 
 	/*
 	 * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a
 	 * fault on a vdev and want it to persist across imports (like with
 	 * zpool offline -f).
 	 */
 	rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp);
 	if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
 		vd->vdev_faulted = 1;
 		vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
 	}
 
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &tmp) == 0)
 		vd->vdev_devid = spa_strdup(tmp);
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &tmp) == 0)
 		vd->vdev_physpath = spa_strdup(tmp);
 
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
 	    &tmp) == 0)
 		vd->vdev_enc_sysfs_path = spa_strdup(tmp);
 
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &tmp) == 0)
 		vd->vdev_fru = spa_strdup(tmp);
 
 	/*
 	 * Set the whole_disk property.  If it's not specified, leave the value
 	 * as -1.
 	 */
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 	    &vd->vdev_wholedisk) != 0)
 		vd->vdev_wholedisk = -1ULL;
 
 	vic = &vd->vdev_indirect_config;
 
 	/*
 	 * Indirect-vdev configuration.  The asserts check the values that
 	 * vdev_alloc_common() left in place; each lookup overwrites its
 	 * field only when the key is present in 'nv'.
 	 */
 	ASSERT0(vic->vic_mapping_object);
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
 	    &vic->vic_mapping_object);
 	ASSERT0(vic->vic_births_object);
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
 	    &vic->vic_births_object);
 	ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
 	    &vic->vic_prev_indirect_vdev);
 
 	/*
 	 * Look for the 'not present' flag.  This will only be set if the device
 	 * was not present at the time of import.
 	 */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
 	    &vd->vdev_not_present);
 
 	/*
 	 * Get the alignment requirement. Ignore pool ashift for vdev
 	 * attach case.
 	 */
 	if (alloctype != VDEV_ALLOC_ATTACH) {
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
 		    &vd->vdev_ashift);
 	} else {
 		vd->vdev_attaching = B_TRUE;
 	}
 
 	/*
 	 * Retrieve the vdev creation time.
 	 */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
 	    &vd->vdev_crtxg);
 
 	/* Only the root vdev carries the root zap object. */
 	if (vd->vdev_ops == &vdev_root_ops &&
 	    (alloctype == VDEV_ALLOC_LOAD ||
 	    alloctype == VDEV_ALLOC_SPLIT ||
 	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP,
 		    &vd->vdev_root_zap);
 	}
 
 	/*
 	 * If we're a top-level vdev, try to load the allocation parameters.
 	 */
 	if (top_level &&
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
 		    &vd->vdev_ms_array);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
 		    &vd->vdev_ms_shift);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
 		    &vd->vdev_asize);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING,
 		    &vd->vdev_noalloc);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
 		    &vd->vdev_removing);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
 		    &vd->vdev_top_zap);
 		vd->vdev_rz_expanding = nvlist_exists(nv,
 		    ZPOOL_CONFIG_RAIDZ_EXPANDING);
 	} else {
 		ASSERT0(vd->vdev_top_zap);
 	}
 
 	if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
 		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
 		    alloctype == VDEV_ALLOC_ADD ||
 		    alloctype == VDEV_ALLOC_SPLIT ||
 		    alloctype == VDEV_ALLOC_ROOTPOOL);
 		/* Note: metaslab_group_create() is now deferred */
 	}
 
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
 		(void) nvlist_lookup_uint64(nv,
 		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
 	} else {
 		ASSERT0(vd->vdev_leaf_zap);
 	}
 
 	/*
 	 * If we're a leaf vdev, try to load the DTL object and other state.
 	 */
 
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
 	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
 		if (alloctype == VDEV_ALLOC_LOAD) {
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
 			    &vd->vdev_dtl_object);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
 			    &vd->vdev_unspare);
 		}
 
 		if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 			uint64_t spare = 0;
 
 			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
 			    &spare) == 0 && spare)
 				spa_spare_add(vd);
 		}
 
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
 		    &vd->vdev_offline);
 
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
 		    &vd->vdev_resilver_txg);
 
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
 		    &vd->vdev_rebuild_txg);
 
 		if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
 			vdev_defer_resilver(vd);
 
 		/*
 		 * In general, when importing a pool we want to ignore the
 		 * persistent fault state, as the diagnosis made on another
 		 * system may not be valid in the current context.  The only
 		 * exception is if we forced a vdev to a persistently faulted
 		 * state with 'zpool offline -f'.  The persistent fault will
 		 * remain across imports until cleared.
 		 *
 		 * Local vdevs will remain in the faulted state.
 		 */
 		if (spa_load_state(spa) == SPA_LOAD_OPEN ||
 		    spa_load_state(spa) == SPA_LOAD_IMPORT) {
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
 			    &vd->vdev_faulted);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
 			    &vd->vdev_degraded);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
 			    &vd->vdev_removed);
 
 			if (vd->vdev_faulted || vd->vdev_degraded) {
 				const char *aux;
 
 				/*
 				 * Keep the faulted flag only for an
 				 * "external" aux state; otherwise clear it.
 				 */
 				vd->vdev_label_aux =
 				    VDEV_AUX_ERR_EXCEEDED;
 				if (nvlist_lookup_string(nv,
 				    ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
 				    strcmp(aux, "external") == 0)
 					vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
 				else
 					vd->vdev_faulted = 0ULL;
 			}
 		}
 	}
 
 	/*
 	 * Add ourselves to the parent's list of children.
 	 */
 	vdev_add_child(parent, vd);
 
 	*vdp = vd;
 
 	return (0);
 }
 
 /*
  * Tear down and free 'vd' together with every vdev beneath it.
  *
  * The vdev is closed first, children are freed recursively, type
  * private state is released through vdev_op_fini(), metaslab groups are
  * destroyed, the vdev is unlinked from its parent, and finally its
  * strings, range trees, locks, condition variables and rate limiters
  * are released before the structure itself is freed.  If 'vd' is the
  * pool's root vdev, spa_root_vdev is reset to NULL.
  *
  * Asserts that no initialize, trim, autotrim or rebuild thread is still
  * attached to the vdev.
  */
 void
 vdev_free(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
 	ASSERT3P(vd->vdev_trim_thread, ==, NULL);
 	ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
 	ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
 
 	/*
 	 * Scan queues are normally destroyed at the end of a scan. If the
 	 * queue exists here, that implies the vdev is being removed while
 	 * the scan is still running.
 	 */
 	if (vd->vdev_scan_io_queue != NULL) {
 		mutex_enter(&vd->vdev_scan_io_queue_lock);
 		dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
 		vd->vdev_scan_io_queue = NULL;
 		mutex_exit(&vd->vdev_scan_io_queue_lock);
 	}
 
 	/*
 	 * vdev_free() implies closing the vdev first.  This is simpler than
 	 * trying to ensure complicated semantics for all callers.
 	 */
 	vdev_close(vd);
 
 	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
 	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
 
 	/*
 	 * Free all children.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_free(vd->vdev_child[c]);
 
 	/*
 	 * Each child unlinked itself via vdev_remove_child(), which frees
 	 * the child array once it is empty and subtracts the child's guid
 	 * sum from ours.
 	 */
 	ASSERT(vd->vdev_child == NULL);
 	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
 
 	if (vd->vdev_ops->vdev_op_fini != NULL)
 		vd->vdev_ops->vdev_op_fini(vd);
 
 	/*
 	 * Discard allocation state.
 	 */
 	if (vd->vdev_mg != NULL) {
 		vdev_metaslab_fini(vd);
 		metaslab_group_destroy(vd->vdev_mg);
 		vd->vdev_mg = NULL;
 	}
 	if (vd->vdev_log_mg != NULL) {
 		ASSERT0(vd->vdev_ms_count);
 		metaslab_group_destroy(vd->vdev_log_mg);
 		vd->vdev_log_mg = NULL;
 	}
 
 	ASSERT0(vd->vdev_stat.vs_space);
 	ASSERT0(vd->vdev_stat.vs_dspace);
 	ASSERT0(vd->vdev_stat.vs_alloc);
 
 	/*
 	 * Remove this vdev from its parent's child list.
 	 */
 	vdev_remove_child(vd->vdev_parent, vd);
 
 	ASSERT(vd->vdev_parent == NULL);
 	ASSERT(!list_link_active(&vd->vdev_leaf_node));
 
 	/*
 	 * Clean up vdev structure.
 	 */
 	vdev_queue_fini(vd);
 
 	if (vd->vdev_path)
 		spa_strfree(vd->vdev_path);
 	if (vd->vdev_devid)
 		spa_strfree(vd->vdev_devid);
 	if (vd->vdev_physpath)
 		spa_strfree(vd->vdev_physpath);
 
 	if (vd->vdev_enc_sysfs_path)
 		spa_strfree(vd->vdev_enc_sysfs_path);
 
 	if (vd->vdev_fru)
 		spa_strfree(vd->vdev_fru);
 
 	if (vd->vdev_isspare)
 		spa_spare_remove(vd);
 	if (vd->vdev_isl2cache)
 		spa_l2cache_remove(vd);
 
 	txg_list_destroy(&vd->vdev_ms_list);
 	txg_list_destroy(&vd->vdev_dtl_list);
 
 	/* Close the DTL space map and empty the DTL trees under the lock. */
 	mutex_enter(&vd->vdev_dtl_lock);
 	space_map_close(vd->vdev_dtl_sm);
 	for (int t = 0; t < DTL_TYPES; t++) {
 		zfs_range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
 		zfs_range_tree_destroy(vd->vdev_dtl[t]);
 	}
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	/* The births and mapping objects are expected to exist together. */
 	EQUIV(vd->vdev_indirect_births != NULL,
 	    vd->vdev_indirect_mapping != NULL);
 	if (vd->vdev_indirect_births != NULL) {
 		vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
 		vdev_indirect_births_close(vd->vdev_indirect_births);
 	}
 
 	if (vd->vdev_obsolete_sm != NULL) {
 		ASSERT(vd->vdev_removing ||
 		    vd->vdev_ops == &vdev_indirect_ops);
 		space_map_close(vd->vdev_obsolete_sm);
 		vd->vdev_obsolete_sm = NULL;
 	}
 	zfs_range_tree_destroy(vd->vdev_obsolete_segments);
 	rw_destroy(&vd->vdev_indirect_rwlock);
 	mutex_destroy(&vd->vdev_obsolete_lock);
 
 	mutex_destroy(&vd->vdev_dtl_lock);
 	mutex_destroy(&vd->vdev_stat_lock);
 	mutex_destroy(&vd->vdev_probe_lock);
 	mutex_destroy(&vd->vdev_scan_io_queue_lock);
 
 	mutex_destroy(&vd->vdev_initialize_lock);
 	mutex_destroy(&vd->vdev_initialize_io_lock);
 	cv_destroy(&vd->vdev_initialize_io_cv);
 	cv_destroy(&vd->vdev_initialize_cv);
 
 	mutex_destroy(&vd->vdev_trim_lock);
 	mutex_destroy(&vd->vdev_autotrim_lock);
 	mutex_destroy(&vd->vdev_trim_io_lock);
 	cv_destroy(&vd->vdev_trim_cv);
 	cv_destroy(&vd->vdev_autotrim_cv);
 	cv_destroy(&vd->vdev_autotrim_kick_cv);
 	cv_destroy(&vd->vdev_trim_io_cv);
 
 	mutex_destroy(&vd->vdev_rebuild_lock);
 	cv_destroy(&vd->vdev_rebuild_cv);
 
 	zfs_ratelimit_fini(&vd->vdev_delay_rl);
 	zfs_ratelimit_fini(&vd->vdev_deadman_rl);
 	zfs_ratelimit_fini(&vd->vdev_dio_verify_rl);
 	zfs_ratelimit_fini(&vd->vdev_checksum_rl);
 
 	if (vd == spa->spa_root_vdev)
 		spa->spa_root_vdev = NULL;
 
 	kmem_free(vd, sizeof (vdev_t));
 }
 
 /*
  * Transfer top-level vdev state from svd to tvd.
  *
  * Each piece of state is copied to 'tvd' and then reset on 'svd', so
  * that afterwards only 'tvd' owns it: metaslab array/shift/count and top
  * zap, metaslab groups and metaslabs, checkpoint space map, allocation
  * bias, space statistics, removal/rebuild/indirect state, pending txg
  * list entries, dirty-list membership, deflate ratio, the log flag and
  * the scan I/O queue.  'tvd' must already be its own vdev_top.
  */
 static void
 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
 {
 	spa_t *spa = svd->vdev_spa;
 	metaslab_t *msp;
 	vdev_t *vd;
 	int t;
 
 	ASSERT(tvd == tvd->vdev_top);
 
 	tvd->vdev_ms_array = svd->vdev_ms_array;
 	tvd->vdev_ms_shift = svd->vdev_ms_shift;
 	tvd->vdev_ms_count = svd->vdev_ms_count;
 	tvd->vdev_top_zap = svd->vdev_top_zap;
 
 	svd->vdev_ms_array = 0;
 	svd->vdev_ms_shift = 0;
 	svd->vdev_ms_count = 0;
 	svd->vdev_top_zap = 0;
 
 	/* Any group tvd already has must be the very one svd holds. */
 	if (tvd->vdev_mg)
 		ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
 	if (tvd->vdev_log_mg)
 		ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg);
 	tvd->vdev_mg = svd->vdev_mg;
 	tvd->vdev_log_mg = svd->vdev_log_mg;
 	tvd->vdev_ms = svd->vdev_ms;
 
 	svd->vdev_mg = NULL;
 	svd->vdev_log_mg = NULL;
 	svd->vdev_ms = NULL;
 
 	/* Re-point the groups' back references at their new owner. */
 	if (tvd->vdev_mg != NULL)
 		tvd->vdev_mg->mg_vd = tvd;
 	if (tvd->vdev_log_mg != NULL)
 		tvd->vdev_log_mg->mg_vd = tvd;
 
 	tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
 	svd->vdev_checkpoint_sm = NULL;
 
 	tvd->vdev_alloc_bias = svd->vdev_alloc_bias;
 	svd->vdev_alloc_bias = VDEV_BIAS_NONE;
 
 	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
 	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
 	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
 
 	svd->vdev_stat.vs_alloc = 0;
 	svd->vdev_stat.vs_space = 0;
 	svd->vdev_stat.vs_dspace = 0;
 
 	/*
 	 * State which may be set on a top-level vdev that's in the
 	 * process of being removed.
 	 */
 	ASSERT0(tvd->vdev_indirect_config.vic_births_object);
 	ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
 	ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
 	ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
 	ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
 	ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
 	ASSERT0(tvd->vdev_noalloc);
 	ASSERT0(tvd->vdev_removing);
 	ASSERT0(tvd->vdev_rebuilding);
 	tvd->vdev_noalloc = svd->vdev_noalloc;
 	tvd->vdev_removing = svd->vdev_removing;
 	tvd->vdev_rebuilding = svd->vdev_rebuilding;
 	tvd->vdev_rebuild_config = svd->vdev_rebuild_config;
 	tvd->vdev_indirect_config = svd->vdev_indirect_config;
 	tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
 	tvd->vdev_indirect_births = svd->vdev_indirect_births;
 	/* The two obsolete-segment trees are exchanged, not copied. */
 	zfs_range_tree_swap(&svd->vdev_obsolete_segments,
 	    &tvd->vdev_obsolete_segments);
 	tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
 	svd->vdev_indirect_config.vic_mapping_object = 0;
 	svd->vdev_indirect_config.vic_births_object = 0;
 	svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
 	svd->vdev_indirect_mapping = NULL;
 	svd->vdev_indirect_births = NULL;
 	svd->vdev_obsolete_sm = NULL;
 	svd->vdev_noalloc = 0;
 	svd->vdev_removing = 0;
 	svd->vdev_rebuilding = 0;
 
 	/*
 	 * Move every per-txg list entry from svd to tvd, and replace svd
 	 * with tvd wherever svd sits on the pool's vdev txg list.
 	 */
 	for (t = 0; t < TXG_SIZE; t++) {
 		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
 			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
 		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
 		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
 			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
 	}
 
 	/* Carry config/state dirtiness over to tvd. */
 	if (list_link_active(&svd->vdev_config_dirty_node)) {
 		vdev_config_clean(svd);
 		vdev_config_dirty(tvd);
 	}
 
 	if (list_link_active(&svd->vdev_state_dirty_node)) {
 		vdev_state_clean(svd);
 		vdev_state_dirty(tvd);
 	}
 
 	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
 	svd->vdev_deflate_ratio = 0;
 
 	tvd->vdev_islog = svd->vdev_islog;
 	svd->vdev_islog = 0;
 
 	dsl_scan_io_queue_vdev_xfer(svd, tvd);
 }
 
 /*
  * Point vdev_top of 'vd', and of every vdev below it, at 'tvd'.
  * A NULL 'vd' is ignored.
  */
 static void
 vdev_top_update(vdev_t *tvd, vdev_t *vd)
 {
 	if (vd != NULL) {
 		vd->vdev_top = tvd;
 
 		for (int i = 0; i < vd->vdev_children; i++) {
 			vdev_top_update(tvd, vd->vdev_child[i]);
 		}
 	}
 }
 
 /*
  * Add a mirror/replacing vdev above an existing vdev.  There is no need to
  * call .vdev_op_init() since mirror/replacing vdevs do not have private state.
  *
  * The new vdev 'mvd' is created with ops 'ops', takes over cvd's id and
  * slot under cvd's former parent, and copies cvd's sizes, ashift values,
  * state and creation txg.  'cvd' becomes mvd's child.  If 'mvd' ends up
  * being a top-level vdev, cvd's top-level state is transferred to it.
  * The caller must hold SCL_ALL as writer.  Returns the new parent.
  */
 vdev_t *
 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
 {
 	spa_t *spa = cvd->vdev_spa;
 	vdev_t *pvd = cvd->vdev_parent;
 	vdev_t *mvd;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/* guid 0: let vdev_alloc_common() pick the guid. */
 	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
 
 	mvd->vdev_asize = cvd->vdev_asize;
 	mvd->vdev_min_asize = cvd->vdev_min_asize;
 	mvd->vdev_max_asize = cvd->vdev_max_asize;
 	mvd->vdev_psize = cvd->vdev_psize;
 	mvd->vdev_ashift = cvd->vdev_ashift;
 	mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
 	mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
 	mvd->vdev_state = cvd->vdev_state;
 	mvd->vdev_crtxg = cvd->vdev_crtxg;
 
 	/* Splice mvd into cvd's old slot, then hang cvd beneath mvd. */
 	vdev_remove_child(pvd, cvd);
 	vdev_add_child(pvd, mvd);
 	cvd->vdev_id = mvd->vdev_children;
 	vdev_add_child(mvd, cvd);
 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
 
 	if (mvd == mvd->vdev_top)
 		vdev_top_transfer(cvd, mvd);
 
 	return (mvd);
 }
 
 /*
  * Remove a 1-way mirror/replacing vdev from the tree.
  *
  * 'cvd' is the only child of the interior vdev 'mvd' being removed.
  * 'cvd' inherits mvd's ashift values and id, takes mvd's slot under
  * mvd's parent, and, when it becomes a top-level vdev, receives mvd's
  * top-level state.  'mvd' is freed.  The caller must hold SCL_ALL as
  * writer.
  */
 void
 vdev_remove_parent(vdev_t *cvd)
 {
 	vdev_t *mvd = cvd->vdev_parent;
 	vdev_t *pvd = mvd->vdev_parent;
 
 	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	ASSERT(mvd->vdev_children == 1);
 	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
 	    mvd->vdev_ops == &vdev_replacing_ops ||
 	    mvd->vdev_ops == &vdev_spare_ops);
 	cvd->vdev_ashift = mvd->vdev_ashift;
 	cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
 	cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;
 	/* Unlink both levels before re-attaching cvd in mvd's place. */
 	vdev_remove_child(mvd, cvd);
 	vdev_remove_child(pvd, mvd);
 
 	/*
 	 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
 	 * Otherwise, we could have detached an offline device, and when we
 	 * go to import the pool we'll think we have two top-level vdevs,
 	 * instead of a different version of the same top-level vdev.
 	 */
 	if (mvd->vdev_top == mvd) {
 		/* Adding the delta makes cvd's guid equal to mvd's. */
 		uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
 		cvd->vdev_orig_guid = cvd->vdev_guid;
 		cvd->vdev_guid += guid_delta;
 		cvd->vdev_guid_sum += guid_delta;
 
 		/*
 		 * If pool not set for autoexpand, we need to also preserve
 		 * mvd's asize to prevent automatic expansion of cvd.
 		 * Otherwise if we are adjusting the mirror by attaching and
 		 * detaching children of non-uniform sizes, the mirror could
 		 * autoexpand, unexpectedly requiring larger devices to
 		 * re-establish the mirror.
 		 */
 		if (!cvd->vdev_spa->spa_autoexpand)
 			cvd->vdev_asize = mvd->vdev_asize;
 	}
 	cvd->vdev_id = mvd->vdev_id;
 	vdev_add_child(pvd, cvd);
 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
 
 	if (cvd == cvd->vdev_top)
 		vdev_top_transfer(mvd, cvd);
 
 	ASSERT(mvd->vdev_children == 0);
 	vdev_free(mvd);
 }
 
 /*
  * Greatest common divisor of 'a' and 'b' by the Euclidean algorithm,
  * used to maintain spa_gcd_alloc.  When 'b' is 0 the result is 'a'.
  */
 static uint64_t
 vdev_gcd(uint64_t a, uint64_t b)
 {
 	uint64_t rem;
 
 	for (; b != 0; b = rem) {
 		rem = a % b;
 		a = b;
 	}
 
 	return (a);
 }
 
 /*
  * Set spa_min_alloc and spa_gcd_alloc.
  *
  * spa_min_alloc is lowered to 'min_alloc' when that is smaller.
  * spa_gcd_alloc is set to 'min_alloc' while it still holds INT_MAX, and
  * afterwards to the GCD of itself and 'min_alloc'.
  */
 static void
 vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc)
 {
 	if (spa->spa_min_alloc > min_alloc)
 		spa->spa_min_alloc = min_alloc;
 
 	spa->spa_gcd_alloc = (spa->spa_gcd_alloc == INT_MAX) ?
 	    min_alloc : vdev_gcd(min_alloc, spa->spa_gcd_alloc);
 }
 
 /*
  * Create the metaslab group(s) for a vdev if it has none yet.  The
  * allocation class is chosen from vdev_alloc_bias, which is why creation
  * is deferred until the bias is known.  Calling this on a vdev that
  * already has vdev_mg set is a no-op.
  */
 void
 vdev_metaslab_group_create(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	metaslab_class_t *mclass;

 	/* Already created: nothing to do. */
 	if (vd->vdev_mg != NULL)
 		return;

 	/* A log vdev without an explicit bias is given the log bias. */
 	if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE)
 		vd->vdev_alloc_bias = VDEV_BIAS_LOG;

 	ASSERT3U(vd->vdev_islog, ==,
 	    (vd->vdev_alloc_bias == VDEV_BIAS_LOG));

 	/* Map the bias onto a metaslab class; anything else is "normal". */
 	if (vd->vdev_alloc_bias == VDEV_BIAS_LOG)
 		mclass = spa_log_class(spa);
 	else if (vd->vdev_alloc_bias == VDEV_BIAS_SPECIAL)
 		mclass = spa_special_class(spa);
 	else if (vd->vdev_alloc_bias == VDEV_BIAS_DEDUP)
 		mclass = spa_dedup_class(spa);
 	else
 		mclass = spa_normal_class(spa);

 	vd->vdev_mg = metaslab_group_create(mclass, vd,
 	    spa->spa_alloc_count);

 	/* Non-log vdevs also get a group in the embedded log class. */
 	if (!vd->vdev_islog) {
 		vd->vdev_log_mg = metaslab_group_create(
 		    spa_embedded_log_class(spa), vd, 1);
 	}

 	/*
 	 * The spa ashift min/max only apply to the normal metaslab class.
 	 * Because the class is bound late, the ashift bounds (and the
 	 * min/gcd allocation sizes) can only be updated here.
 	 */
 	if (vd->vdev_top != vd || vd->vdev_ashift == 0 ||
 	    mclass != spa_normal_class(spa) || vd->vdev_aux != NULL)
 		return;

 	if (spa->spa_max_ashift < vd->vdev_ashift)
 		spa->spa_max_ashift = vd->vdev_ashift;
 	if (spa->spa_min_ashift > vd->vdev_ashift)
 		spa->spa_min_ashift = vd->vdev_ashift;

 	vdev_spa_set_alloc(spa, vdev_get_min_alloc(vd));
 }
 
 /*
  * Allocate (or grow) the vdev's metaslab pointer array and initialize the
  * metaslabs in the range [old count, new count), where the new count is
  * vdev_asize >> vdev_ms_shift.
  *
  * With txg == 0, each new metaslab's object number is read from
  * vdev_ms_array (when that is non-zero) and SCL_ALLOC is taken here around
  * the activation step.  With txg != 0 the caller must already hold
  * SCL_ALLOC as writer (asserted below).
  *
  * Returns 0 on success, including the case where vdev_ms_shift is 0 and
  * nothing is done; otherwise the error from dmu_read() or metaslab_init().
  * On such an error the new array stays attached to vd with the
  * not-yet-initialized slots left zeroed by vmem_zalloc().
  */
 int
 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	uint64_t oldc = vd->vdev_ms_count;
 	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
 	metaslab_t **mspp;
 	int error;
 	/* A non-zero previous count means we are growing an existing array. */
 	boolean_t expanding = (oldc != 0);
 
 	ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
 	/*
 	 * This vdev is not being allocated from yet or is a hole.
 	 */
 	if (vd->vdev_ms_shift == 0)
 		return (0);
 
 	ASSERT(!vd->vdev_ishole);
 
 	ASSERT(oldc <= newc);
 
 	mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
 
 	/* Carry the existing metaslab pointers over into the larger array. */
 	if (expanding) {
 		memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp));
 		vmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
 	}
 
 	vd->vdev_ms = mspp;
 	vd->vdev_ms_count = newc;
 
 	for (uint64_t m = oldc; m < newc; m++) {
 		uint64_t object = 0;
 		/*
 		 * vdev_ms_array may be 0 if we are creating the "fake"
 		 * metaslabs for an indirect vdev for zdb's leak detection.
 		 * See zdb_leak_init().
 		 */
 		if (txg == 0 && vd->vdev_ms_array != 0) {
 			error = dmu_read(spa->spa_meta_objset,
 			    vd->vdev_ms_array,
 			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
 			    DMU_READ_PREFETCH);
 			if (error != 0) {
 				vdev_dbgmsg(vd, "unable to read the metaslab "
 				    "array [error=%d]", error);
 				return (error);
 			}
 		}
 
 		error = metaslab_init(vd->vdev_mg, m, object, txg,
 		    &(vd->vdev_ms[m]));
 		if (error != 0) {
 			vdev_dbgmsg(vd, "metaslab_init failed [error=%d]",
 			    error);
 			return (error);
 		}
 	}
 
 	/*
 	 * Find the emptiest metaslab on the vdev and mark it for use for
 	 * embedded slog by moving it from the regular to the log metaslab
 	 * group.
 	 */
 	if (vd->vdev_mg->mg_class == spa_normal_class(spa) &&
 	    vd->vdev_ms_count > zfs_embedded_slog_min_ms &&
 	    avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) {
 		uint64_t slog_msid = 0;
 		uint64_t smallest = UINT64_MAX;
 
 		/*
 		 * Note, we only search the new metaslabs, because the old
 		 * (pre-existing) ones may be active (e.g. have non-empty
 		 * range_tree's), and we don't move them to the new
 		 * metaslab_t.
 		 */
 		for (uint64_t m = oldc; m < newc; m++) {
 			uint64_t alloc =
 			    space_map_allocated(vd->vdev_ms[m]->ms_sm);
 			if (alloc < smallest) {
 				slog_msid = m;
 				smallest = alloc;
 			}
 		}
 		metaslab_t *slog_ms = vd->vdev_ms[slog_msid];
 		/*
 		 * The metaslab was marked as dirty at the end of
 		 * metaslab_init(). Remove it from the dirty list so that we
 		 * can uninitialize and reinitialize it to the new class.
 		 */
 		if (txg != 0) {
 			(void) txg_list_remove_this(&vd->vdev_ms_list,
 			    slog_ms, txg);
 		}
 		/* Remember the space map object so the re-init reuses it. */
 		uint64_t sm_obj = space_map_object(slog_ms->ms_sm);
 		metaslab_fini(slog_ms);
 		VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg,
 		    &vd->vdev_ms[slog_msid]));
 	}
 
 	/* With txg == 0 the caller does not hold SCL_ALLOC; take it here. */
 	if (txg == 0)
 		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
 
 	/*
 	 * If the vdev is marked as non-allocating then don't
 	 * activate the metaslabs since we want to ensure that
 	 * no allocations are performed on this device.
 	 */
 	if (vd->vdev_noalloc) {
 		/* track non-allocating vdev space */
 		spa->spa_nonallocating_dspace += spa_deflate(spa) ?
 		    vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
 	} else if (!expanding) {
 		/* Groups are only activated on first-time initialization. */
 		metaslab_group_activate(vd->vdev_mg);
 		if (vd->vdev_log_mg != NULL)
 			metaslab_group_activate(vd->vdev_log_mg);
 	}
 
 	if (txg == 0)
 		spa_config_exit(spa, SCL_ALLOC, FTAG);
 
 	return (0);
 }
 
 /*
  * Tear down a vdev's metaslab state: close the checkpoint space map (if
  * open), passivate the metaslab group(s), destroy every non-NULL metaslab
  * and free the metaslab pointer array.  Both the checkpoint pointer and
  * vdev_ms are reset to NULL so that a repeated call finds nothing to do.
  */
 void
 vdev_metaslab_fini(vdev_t *vd)
 {
 	if (vd->vdev_checkpoint_sm != NULL) {
 		ASSERT(spa_feature_is_active(vd->vdev_spa,
 		    SPA_FEATURE_POOL_CHECKPOINT));
 		space_map_close(vd->vdev_checkpoint_sm);
 		/*
 		 * Even though we close the space map, we need to set its
 		 * pointer to NULL. The reason is that vdev_metaslab_fini()
 		 * may be called multiple times for certain operations
 		 * (i.e. when destroying a pool) so we need to ensure that
 		 * this clause never executes twice. This logic is similar
 		 * to the one used for the vdev_ms clause below.
 		 */
 		vd->vdev_checkpoint_sm = NULL;
 	}
 
 	if (vd->vdev_ms != NULL) {
 		metaslab_group_t *mg = vd->vdev_mg;
 
 		metaslab_group_passivate(mg);
 		if (vd->vdev_log_mg != NULL) {
 			ASSERT(!vd->vdev_islog);
 			metaslab_group_passivate(vd->vdev_log_mg);
 		}
 
 		uint64_t count = vd->vdev_ms_count;
 		for (uint64_t m = 0; m < count; m++) {
 			metaslab_t *msp = vd->vdev_ms[m];
 			/* Slots can be NULL; skip those. */
 			if (msp != NULL)
 				metaslab_fini(msp);
 		}
 		vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
 		vd->vdev_ms = NULL;
 		vd->vdev_ms_count = 0;
 
 		/* With all metaslabs gone the group histograms must be zero. */
-		for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+		for (int i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) {
 			ASSERT0(mg->mg_histogram[i]);
 			if (vd->vdev_log_mg != NULL)
 				ASSERT0(vd->vdev_log_mg->mg_histogram[i]);
 		}
 	}
 	ASSERT0(vd->vdev_ms_count);
 }
 
 /*
  * State for one probe of a vdev.  It is allocated in vdev_probe(), handed
  * as the private argument to the probe's null, read and write zios, and
  * freed by vdev_probe_done() when the null zio completes.
  */
 typedef struct vdev_probe_stats {
 	/* set when a probe read completes without error */
 	boolean_t	vps_readable;
 	/* set when a probe write completes without error */
 	boolean_t	vps_writeable;
 	/* B_TRUE when vdev_probe() was called with a non-NULL zio */
 	boolean_t	vps_zio_done_probe;
 	/* ZIO_FLAG_* bits used for every zio issued by this probe */
 	int		vps_flags;
 } vdev_probe_stats_t;
 
 /*
  * Completion callback shared by every zio of a probe; the zio type tells
  * which stage finished:
  *
  *   READ  - records readability and, if the read succeeded and the pool
  *           is writeable, issues a write of the same buffer to the same
  *           offset; otherwise frees the buffer.
  *   WRITE - records writeability and frees the buffer.
  *   NULL  - the probe's root zio: folds the results into
  *           vdev_cant_read / vdev_cant_write, sets the final io_error,
  *           clears vdev_probe_zio, propagates ENXIO to parent zios that
  *           can no longer access the vdev, and frees the stats.
  */
 static void
 vdev_probe_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	vdev_t *vd = zio->io_vd;
 	vdev_probe_stats_t *vps = zio->io_private;
 
 	ASSERT(vd->vdev_probe_zio != NULL);
 
 	if (zio->io_type == ZIO_TYPE_READ) {
 		if (zio->io_error == 0)
 			vps->vps_readable = 1;
 		if (zio->io_error == 0 && spa_writeable(spa)) {
 			/* The abd is handed on to the write; freed there. */
 			zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
 			    zio->io_offset, zio->io_size, zio->io_abd,
 			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
 			    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
 		} else {
 			abd_free(zio->io_abd);
 		}
 	} else if (zio->io_type == ZIO_TYPE_WRITE) {
 		if (zio->io_error == 0)
 			vps->vps_writeable = 1;
 		abd_free(zio->io_abd);
 	} else if (zio->io_type == ZIO_TYPE_NULL) {
 		zio_t *pio;
 		zio_link_t *zl;
 
 		/* Only ever set the cant_* flags here, never clear them. */
 		vd->vdev_cant_read |= !vps->vps_readable;
 		vd->vdev_cant_write |= !vps->vps_writeable;
 		vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u",
 		    vd->vdev_cant_read, vd->vdev_cant_write);
 
 		/*
 		 * Success requires readability, plus writeability unless the
 		 * pool itself is not writeable.
 		 */
 		if (vdev_readable(vd) &&
 		    (vdev_writeable(vd) || !spa_writeable(spa))) {
 			zio->io_error = 0;
 		} else {
 			ASSERT(zio->io_error != 0);
 			vdev_dbgmsg(vd, "failed probe");
 			(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
 			    spa, vd, NULL, NULL, 0);
 			zio->io_error = SET_ERROR(ENXIO);
 
 			/*
 			 * If this probe was initiated from zio pipeline, then
 			 * change the state in a spa_async_request. Probes that
 			 * were initiated from a vdev_open can change the state
 			 * as part of the open call.
 			 */
 			if (vps->vps_zio_done_probe) {
 				vd->vdev_fault_wanted = B_TRUE;
 				spa_async_request(spa, SPA_ASYNC_FAULT_VDEV);
 			}
 		}
 
 		/* Probe finished: allow vdev_probe() to start a new one. */
 		mutex_enter(&vd->vdev_probe_lock);
 		ASSERT(vd->vdev_probe_zio == zio);
 		vd->vdev_probe_zio = NULL;
 		mutex_exit(&vd->vdev_probe_lock);
 
 		/* Fail every parent zio that can no longer use this vdev. */
 		zl = NULL;
 		while ((pio = zio_walk_parents(zio, &zl)) != NULL)
 			if (!vdev_accessible(vd, pio))
 				pio->io_error = SET_ERROR(ENXIO);
 
 		kmem_free(vps, sizeof (*vps));
 	}
 }
 
 /*
  * Determine whether this device is accessible.
  *
  * Read and write to several known locations: the pad regions of each
  * vdev label but the first, which we leave alone in case it contains
  * a VTOC.
  *
  * vd must be a leaf vdev.  zio is the I/O that triggered the probe, or
  * NULL when the caller wants to run the probe itself.
  *
  * Return value:
  *   - NULL if zio is itself a probe I/O (ZIO_FLAG_PROBE); nothing is done.
  *   - NULL if a probe is already in flight; zio is linked to it through
  *     zio_add_child() (zio must be non-NULL in this case, see the ASSERT).
  *   - NULL if a new probe was created on behalf of a non-NULL zio; the
  *     probe is issued here with zio_nowait().
  *   - the new, not yet issued probe zio if zio is NULL; the caller is
  *     expected to issue it (vdev_open() passes it to zio_wait()).
  */
 zio_t *
 vdev_probe(vdev_t *vd, zio_t *zio)
 {
 	spa_t *spa = vd->vdev_spa;
 	/* Stays NULL unless this call creates the probe. */
 	vdev_probe_stats_t *vps = NULL;
 	zio_t *pio;
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	/*
 	 * Don't probe the probe.
 	 */
 	if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
 		return (NULL);
 
 	/*
 	 * To prevent 'probe storms' when a device fails, we create
 	 * just one probe i/o at a time.  All zios that want to probe
 	 * this vdev will become parents of the probe io.
 	 */
 	mutex_enter(&vd->vdev_probe_lock);
 
 	if ((pio = vd->vdev_probe_zio) == NULL) {
 		vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
 
 		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
 		    ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD;
 		vps->vps_zio_done_probe = (zio != NULL);
 
 		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
 			/*
 			 * vdev_cant_read and vdev_cant_write can only
 			 * transition from TRUE to FALSE when we have the
 			 * SCL_ZIO lock as writer; otherwise they can only
 			 * transition from FALSE to TRUE.  This ensures that
 			 * any zio looking at these values can assume that
 			 * failures persist for the life of the I/O.  That's
 			 * important because when a device has intermittent
 			 * connectivity problems, we want to ensure that
 			 * they're ascribed to the device (ENXIO) and not
 			 * the zio (EIO).
 			 *
 			 * Since we hold SCL_ZIO as writer here, clear both
 			 * values so the probe can reevaluate from first
 			 * principles.
 			 */
 			vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
 			vd->vdev_cant_read = B_FALSE;
 			vd->vdev_cant_write = B_FALSE;
 		}
 
 		/* Root of the probe; vdev_probe_done() runs when it ends. */
 		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
 		    vdev_probe_done, vps,
 		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
 	}
 
 	if (zio != NULL)
 		zio_add_child(zio, pio);
 
 	mutex_exit(&vd->vdev_probe_lock);
 
 	/* A probe was already in flight; we only attached to it. */
 	if (vps == NULL) {
 		ASSERT(zio != NULL);
 		return (NULL);
 	}
 
 	/* One pad-region read per label, skipping label 0. */
 	for (int l = 1; l < VDEV_LABELS; l++) {
 		zio_nowait(zio_read_phys(pio, vd,
 		    vdev_label_offset(vd->vdev_psize, l,
 		    offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE,
 		    abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
 		    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
 		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
 	}
 
 	if (zio == NULL)
 		return (pio);
 
 	zio_nowait(pio);
 	return (NULL);
 }
 
 /*
  * void *-argument wrapper around vdev_load(): loads the vdev passed in
  * arg and stores the result in its vdev_load_error field.
  */
 static void
 vdev_load_child(void *arg)
 {
 	vdev_t *child = arg;
 	int err = vdev_load(child);

 	child->vdev_load_error = err;
 }
 
 /*
  * Taskq callback that opens one child vdev.  vdev_open_thread is set to
  * the current thread for the duration of the call (vdev_open() asserts on
  * it) and the result is left in vdev_open_error.
  */
 static void
 vdev_open_child(void *arg)
 {
 	vdev_t *child = arg;

 	child->vdev_open_thread = curthread;
 	child->vdev_open_error = vdev_open(child);
 	child->vdev_open_thread = NULL;
 }
 
 /*
  * Returns B_TRUE if vd, or any vdev beneath it, is backed by a zvol.
  * The zvol check itself is only compiled in for kernel builds; the walk
  * over the children happens in every build.
  */
 static boolean_t
 vdev_uses_zvols(vdev_t *vd)
 {
 #ifdef _KERNEL
 	if (zvol_is_zvol(vd->vdev_path))
 		return (B_TRUE);
 #endif

 	int idx = 0;

 	while (idx < vd->vdev_children) {
 		if (vdev_uses_zvols(vd->vdev_child[idx]))
 			return (B_TRUE);
 		idx++;
 	}

 	return (B_FALSE);
 }
 
 /*
  * Returns B_TRUE if the passed child should be opened.
  *
  * This is the filter used by vdev_open_children(): it accepts every
  * child, so the argument is deliberately unused.
  */
 static boolean_t
 vdev_default_open_children_func(vdev_t *vd)
 {
 	(void) vd;
 	return (B_TRUE);
 }
 
 /*
  * Open the requested child vdevs.  If any of the leaf vdevs are using
  * a ZFS volume then do the opens in a single thread.  This avoids a
  * deadlock when the current thread is holding the spa_namespace_lock.
  *
  * open_func is called once per child; children for which it returns
  * B_FALSE are skipped entirely.  Each opened child's result is left in
  * its vdev_open_error field.  vd->vdev_nonrot starts as B_TRUE and is
  * ANDed with the vdev_nonrot of every child that was not skipped.
  */
 static void
 vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
 {
 	int children = vd->vdev_children;
 
 	taskq_t *tq = taskq_create("vdev_open", children, minclsyspri,
 	    children, children, TASKQ_PREPOPULATE);
 	vd->vdev_nonrot = B_TRUE;
 
 	for (int c = 0; c < children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (open_func(cvd) == B_FALSE)
 			continue;
 
 		/*
 		 * Open synchronously when no taskq is available or when
 		 * zvols are involved; otherwise hand the open to the taskq.
 		 * NOTE(review): vdev_uses_zvols(vd) is re-evaluated for the
 		 * parent on every iteration -- confirm whether it could be
 		 * computed once before the loop.
 		 */
 		if (tq == NULL || vdev_uses_zvols(vd)) {
 			cvd->vdev_open_error = vdev_open(cvd);
 		} else {
 			VERIFY(taskq_dispatch(tq, vdev_open_child,
 			    cvd, TQ_SLEEP) != TASKQID_INVALID);
 		}
 
 		/*
 		 * NOTE(review): on the taskq path this reads
 		 * cvd->vdev_nonrot before taskq_wait() below, so presumably
 		 * the dispatched open may not have run yet -- confirm that
 		 * the value read here is the intended one.
 		 */
 		vd->vdev_nonrot &= cvd->vdev_nonrot;
 	}
 
 	/* Wait for all dispatched opens before tearing the taskq down. */
 	if (tq != NULL) {
 		taskq_wait(tq);
 		taskq_destroy(tq);
 	}
 }
 
 /*
  * Open all child vdevs.
  *
  * Thin wrapper around vdev_open_children_impl() using the filter that
  * accepts every child.
  */
 void
 vdev_open_children(vdev_t *vd)
 {
 	vdev_open_children_impl(vd, vdev_default_open_children_func);
 }
 
 /*
  * Conditionally open a subset of child vdevs.
  *
  * open_func is called for each child of vd; only children for which it
  * returns something other than B_FALSE are opened.
  */
 void
 vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
 {
 	vdev_open_children_impl(vd, open_func);
 }
 
 /*
  * Compute the raidz-deflation ratio for a top-level vdev.
  *
  * The block size is fixed at 128k (1 << 17), the "typical" blocksize;
  * even though SPA_MAXBLOCKSIZE has changed, this value must not, or
  * existing bp's would be accounted for inconsistently.  The txg is fixed
  * at 0 for the same reason, because an expanded RAIDZ vdev can report a
  * different asize for different birth txg's.
  *
  * Nothing is done for vdevs that are not top-level, are holes, or have
  * no ashift yet.
  */
 static void
 vdev_set_deflate_ratio(vdev_t *vd)
 {
 	if (vd != vd->vdev_top || vd->vdev_ishole || vd->vdev_ashift == 0)
 		return;

 	vd->vdev_deflate_ratio = (1 << 17) /
 	    (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >>
 	    SPA_MINBLOCKSHIFT);
 }
 
 /*
  * Choose the best of two ashifts.  An ashift is "in range" when it is
  * strictly above the logical ashift (the absolute minimum) and no larger
  * than the administrator defined maximum, zfs_vdev_max_auto_ashift.
  *
  * If exactly one of a and b is in range, that one is returned; when both
  * or neither are in range, the larger of the two is returned.
  */
 uint64_t
 vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b)
 {
 	const int a_in_range =
 	    (a > logical && a <= zfs_vdev_max_auto_ashift);
 	const int b_in_range =
 	    (b > logical && b <= zfs_vdev_max_auto_ashift);

 	if (a_in_range && !b_in_range)
 		return (a);
 	if (b_in_range && !a_in_range)
 		return (b);

 	return (MAX(a, b));
 }
 
 /*
  * Maximize performance by inflating the configured ashift of a top-level
  * vdev towards its physical ashift, while honouring the administrator
  * defined limits and never going below the ashift already configured.
  */
 static void
 vdev_ashift_optimize(vdev_t *vd)
 {
 	ASSERT(vd == vd->vdev_top);

 	if (vd->vdev_ashift >= vd->vdev_physical_ashift ||
 	    vd->vdev_physical_ashift > zfs_vdev_max_auto_ashift) {
 		/*
 		 * Either the configured ashift already covers the physical
 		 * one, or the physical ashift lies beyond the allowed
 		 * maximum.  When logical and physical ashift are equal we
 		 * only make sure the result is not below the minimum.  In
 		 * the unusual logical > physical case the value cannot be
 		 * capped by the maximum, as that would cause failures; it
 		 * is still raised to the minimum if necessary.
 		 */
 		vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift,
 		    vd->vdev_ashift);
 		return;
 	}

 	/* The physical ashift is larger and within the allowed maximum. */
 	vd->vdev_ashift = MIN(
 	    MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
 	    MAX(zfs_vdev_min_auto_ashift,
 	    vd->vdev_physical_ashift));
 }
 
 /*
  * Prepare a virtual device for access.
  *
  * Calls the vdev's vdev_op_open method, derives psize/asize/max_asize and
  * the ashift values from what it reports, sets the vdev state, and for
  * leaf vdevs runs a probe before declaring the device usable.
  *
  * The caller must either be the thread recorded in vdev_open_thread or
  * hold all of SCL_STATE_ALL as writer, and the vdev must be in the CLOSED,
  * CANT_OPEN or OFFLINE state (both asserted).
  *
  * Returns 0 on success (also for hole and missing vdevs).  On failure the
  * vdev state is updated and one of the following is returned:
  *   ENXIO     - the vdev is faulted or offline, or osize > max_osize
  *   EOVERFLOW - the device is smaller than the minimum size
  *   EINVAL    - the allocatable size is below vdev_min_asize
  *   EDOM      - an ashift is out of range or larger than the top-level's
  *   other     - the error from vdev_op_open, from device error injection,
  *               or from the probe
  */
 int
 vdev_open(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	int error;
 	uint64_t osize = 0;
 	uint64_t max_osize = 0;
 	uint64_t asize, max_asize, psize;
 	uint64_t logical_ashift = 0;
 	uint64_t physical_ashift = 0;
 
 	ASSERT(vd->vdev_open_thread == curthread ||
 	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
 	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
 	    vd->vdev_state == VDEV_STATE_OFFLINE);
 
 	/* Start every open attempt from a clean slate. */
 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 	vd->vdev_cant_read = B_FALSE;
 	vd->vdev_cant_write = B_FALSE;
 	vd->vdev_fault_wanted = B_FALSE;
 	vd->vdev_remove_wanted = B_FALSE;
 	vd->vdev_min_asize = vdev_get_min_asize(vd);
 
 	/*
 	 * If this vdev is not removed, check its fault status.  If it's
 	 * faulted, bail out of the open.
 	 */
 	if (!vd->vdev_removed && vd->vdev_faulted) {
 		ASSERT(vd->vdev_children == 0);
 		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
 		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    vd->vdev_label_aux);
 		return (SET_ERROR(ENXIO));
 	} else if (vd->vdev_offline) {
 		ASSERT(vd->vdev_children == 0);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
 		return (SET_ERROR(ENXIO));
 	}
 
 	error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
 	    &logical_ashift, &physical_ashift);
 
 	/* Keep the device in removed state if unplugged */
 	if (error == ENOENT && vd->vdev_removed) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED,
 		    VDEV_AUX_NONE);
 		return (error);
 	}
 
 	/*
 	 * Physical volume size should never be larger than its max size, unless
 	 * the disk has shrunk while we were reading it or the device is buggy
 	 * or damaged: either way it's not safe for use, bail out of the open.
 	 */
 	if (osize > max_osize) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_OPEN_FAILED);
 		return (SET_ERROR(ENXIO));
 	}
 
 	/*
 	 * Reset the vdev_reopening flag so that we actually close
 	 * the vdev on error.
 	 */
 	vd->vdev_reopening = B_FALSE;
 	if (zio_injection_enabled && error == 0)
 		error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO));
 
 	if (error) {
 		if (vd->vdev_removed &&
 		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
 			vd->vdev_removed = B_FALSE;
 
 		if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE,
 			    vd->vdev_stat.vs_aux);
 		} else {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    vd->vdev_stat.vs_aux);
 		}
 		return (error);
 	}
 
 	vd->vdev_removed = B_FALSE;
 
 	/*
 	 * Recheck the faulted flag now that we have confirmed that
 	 * the vdev is accessible.  If we're faulted, bail.
 	 */
 	if (vd->vdev_faulted) {
 		ASSERT(vd->vdev_children == 0);
 		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
 		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    vd->vdev_label_aux);
 		return (SET_ERROR(ENXIO));
 	}
 
 	if (vd->vdev_degraded) {
 		ASSERT(vd->vdev_children == 0);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
 		    VDEV_AUX_ERR_EXCEEDED);
 	} else {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
 	}
 
 	/*
 	 * For hole or missing vdevs we just return success.
 	 */
 	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
 		return (0);
 
 	/* Any child that is not healthy degrades the parent. */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
 			    VDEV_AUX_NONE);
 			break;
 		}
 	}
 
 	/* Round both sizes down to a multiple of the label size. */
 	osize = P2ALIGN_TYPED(osize, sizeof (vdev_label_t), uint64_t);
 	max_osize = P2ALIGN_TYPED(max_osize, sizeof (vdev_label_t), uint64_t);
 
 	if (vd->vdev_children == 0) {
 		/* No children: the label areas come out of the device size. */
 		if (osize < SPA_MINDEVSIZE) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_TOO_SMALL);
 			return (SET_ERROR(EOVERFLOW));
 		}
 		psize = osize;
 		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
 		max_asize = max_osize - (VDEV_LABEL_START_SIZE +
 		    VDEV_LABEL_END_SIZE);
 	} else {
 		/* Has children: the reported size is used as asize directly. */
 		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
 		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_TOO_SMALL);
 			return (SET_ERROR(EOVERFLOW));
 		}
 		psize = 0;
 		asize = osize;
 		max_asize = max_osize;
 	}
 
 	/*
 	 * If the vdev was expanded, record this so that we can re-create the
 	 * uberblock rings in labels {2,3}, during the next sync.
 	 */
 	if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0))
 		vd->vdev_copy_uberblocks = B_TRUE;
 
 	vd->vdev_psize = psize;
 
 	/*
 	 * Make sure the allocatable size hasn't shrunk too much.
 	 */
 	if (asize < vd->vdev_min_asize) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_BAD_LABEL);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * We can always set the logical/physical ashift members since
 	 * their values are only used to calculate the vdev_ashift when
 	 * the device is first added to the config. These values should
 	 * not be used for anything else since they may change whenever
 	 * the device is reopened and we don't store them in the label.
 	 */
 	vd->vdev_physical_ashift =
 	    MAX(physical_ashift, vd->vdev_physical_ashift);
 	vd->vdev_logical_ashift = MAX(logical_ashift,
 	    vd->vdev_logical_ashift);
 
 	if (vd->vdev_asize == 0) {
 		/*
 		 * This is the first-ever open, so use the computed values.
 		 * For compatibility, a different ashift can be requested.
 		 */
 		vd->vdev_asize = asize;
 		vd->vdev_max_asize = max_asize;
 
 		/*
 		 * If the vdev_ashift was not overridden at creation time
 		 * (0) or the override value is impossible for the device,
 		 * then set it to the logical ashift and optimize the ashift.
 		 */
 		if (vd->vdev_ashift < vd->vdev_logical_ashift) {
 			vd->vdev_ashift = vd->vdev_logical_ashift;
 
 			if (vd->vdev_logical_ashift > ASHIFT_MAX) {
 				vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 				    VDEV_AUX_ASHIFT_TOO_BIG);
 				return (SET_ERROR(EDOM));
 			}
 
 			if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE)
 				vdev_ashift_optimize(vd);
 			vd->vdev_attaching = B_FALSE;
 		}
 		if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
 		    vd->vdev_ashift > ASHIFT_MAX)) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_BAD_ASHIFT);
 			return (SET_ERROR(EDOM));
 		}
 	} else {
 		/*
 		 * Make sure the alignment required hasn't increased.
 		 */
 		if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
 		    vd->vdev_ops->vdev_op_leaf) {
 			(void) zfs_ereport_post(
 			    FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
 			    spa, vd, NULL, NULL, 0);
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_BAD_LABEL);
 			return (SET_ERROR(EDOM));
 		}
 		vd->vdev_max_asize = max_asize;
 	}
 
 	/*
 	 * If all children are healthy we update asize if either:
 	 * The asize has increased, due to a device expansion caused by dynamic
 	 * LUN growth or vdev replacement, and automatic expansion is enabled;
 	 * making the additional space available.
 	 *
 	 * The asize has decreased, due to a device shrink usually caused by a
 	 * vdev replace with a smaller device. This ensures that calculations
 	 * based on max_asize and asize e.g. esize are always valid. It's safe
 	 * to do this as we've already validated that asize is greater than
 	 * vdev_min_asize.
 	 */
 	if (vd->vdev_state == VDEV_STATE_HEALTHY &&
 	    ((asize > vd->vdev_asize &&
 	    (vd->vdev_expanding || spa->spa_autoexpand)) ||
 	    (asize < vd->vdev_asize)))
 		vd->vdev_asize = asize;
 
 	vdev_set_min_asize(vd);
 
 	/*
 	 * Ensure we can issue some IO before declaring the
 	 * vdev open for business.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    VDEV_AUX_ERR_EXCEEDED);
 		return (error);
 	}
 
 	/*
 	 * Track the minimum allocation size.
 	 */
 	if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
 	    vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
 		uint64_t min_alloc = vdev_get_min_alloc(vd);
 		vdev_spa_set_alloc(spa, min_alloc);
 	}
 
 	/*
 	 * If this is a leaf vdev, assess whether a resilver is needed.
 	 * But don't do this if we are doing a reopen for a scrub, since
 	 * this would just restart the scrub we are already doing.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen)
 		dsl_scan_assess_vdev(spa->spa_dsl_pool, vd);
 
 	return (0);
 }
 
 /*
  * Taskq callback that validates one child vdev.  vdev_validate_thread is
  * set to the current thread for the duration of the call and the result
  * is left in vdev_validate_error.
  */
 static void
 vdev_validate_child(void *arg)
 {
 	vdev_t *child = arg;

 	child->vdev_validate_thread = curthread;
 	child->vdev_validate_error = vdev_validate(child);
 	child->vdev_validate_thread = NULL;
 }
 
 /*
  * Called once the vdevs are all opened, this routine validates the label
  * contents. This needs to be done before vdev_load() so that we don't
  * inadvertently do repair I/Os to the wrong device.
  *
  * This function will only return failure if one of the vdevs indicates that it
  * has since been destroyed or exported.  This is only possible if
  * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
  * will be updated but the function will return 0.
  *
  * Children are validated first (through a taskq when one is available and
  * the child does not use zvols); a non-zero vdev_validate_error on any
  * child makes this function return EBADF.  The label checks that follow
  * are only performed on readable leaf vdevs.  If vdev_validate_skip is set
  * the function returns 0 without doing anything.
  */
 int
 vdev_validate(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	taskq_t *tq = NULL;
 	nvlist_t *label;
 	uint64_t guid = 0, aux_guid = 0, top_guid;
 	uint64_t state;
 	nvlist_t *nvl;
 	uint64_t txg;
 	int children = vd->vdev_children;
 
 	if (vdev_validate_skip)
 		return (0);
 
 	if (children > 0) {
 		tq = taskq_create("vdev_validate", children, minclsyspri,
 		    children, children, TASKQ_PREPOPULATE);
 	}
 
 	for (uint64_t c = 0; c < children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (tq == NULL || vdev_uses_zvols(cvd)) {
 			vdev_validate_child(cvd);
 		} else {
 			VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd,
 			    TQ_SLEEP) != TASKQID_INVALID);
 		}
 	}
 	if (tq != NULL) {
 		taskq_wait(tq);
 		taskq_destroy(tq);
 	}
 	/* Every child error is reported to the caller as EBADF. */
 	for (int c = 0; c < children; c++) {
 		int error = vd->vdev_child[c]->vdev_validate_error;
 
 		if (error != 0)
 			return (SET_ERROR(EBADF));
 	}
 
 
 	/*
 	 * If the device has already failed, or was marked offline, don't do
 	 * any further validation.  Otherwise, label I/O will fail and we will
 	 * overwrite the previous state.
 	 */
 	if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd))
 		return (0);
 
 	/*
 	 * If we are performing an extreme rewind, we allow for a label that
 	 * was modified at a point after the current txg.
 	 * If config lock is not held do not check for the txg. spa_sync could
 	 * be updating the vdev's label before updating spa_last_synced_txg.
 	 */
 	if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 ||
 	    spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG)
 		txg = UINT64_MAX;
 	else
 		txg = spa_last_synced_txg(spa);
 
 	/* From here on, label is freed on every return path. */
 	if ((label = vdev_label_read_config(vd, txg)) == NULL) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_BAD_LABEL);
 		vdev_dbgmsg(vd, "vdev_validate: failed reading config for "
 		    "txg %llu", (u_longlong_t)txg);
 		return (0);
 	}
 
 	/*
 	 * Determine if this vdev has been split off into another
 	 * pool.  If so, then refuse to open it.
 	 */
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
 	    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_SPLIT_POOL);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool");
 		return (0);
 	}
 
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
 		    ZPOOL_CONFIG_POOL_GUID);
 		return (0);
 	}
 
 	/*
 	 * If config is not trusted then ignore the spa guid check. This is
 	 * necessary because if the machine crashed during a re-guid the new
 	 * guid might have been written to all of the vdev labels, but not the
 	 * cached config. The check will be performed again once we have the
 	 * trusted config from the MOS.
 	 */
 	if (spa->spa_trust_config && guid != spa_guid(spa)) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't "
 		    "match config (%llu != %llu)", (u_longlong_t)guid,
 		    (u_longlong_t)spa_guid(spa));
 		return (0);
 	}
 
 	/* aux_guid becomes the label's original guid, or 0 if it has none. */
 	if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
 	    != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
 	    &aux_guid) != 0)
 		aux_guid = 0;
 
 	/* guid is reused: from here on it holds the label's vdev guid. */
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
 		    ZPOOL_CONFIG_GUID);
 		return (0);
 	}
 
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid)
 	    != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
 		    ZPOOL_CONFIG_TOP_GUID);
 		return (0);
 	}
 
 	/*
 	 * If this vdev just became a top-level vdev because its sibling was
 	 * detached, it will have adopted the parent's vdev guid -- but the
 	 * label may or may not be on disk yet. Fortunately, either version
 	 * of the label will have the same top guid, so if we're a top-level
 	 * vdev, we can safely compare to that instead.
 	 * However, if the config comes from a cachefile that failed to update
 	 * after the detach, a top-level vdev will appear as a non top-level
 	 * vdev in the config. Also relax the constraints if we perform an
 	 * extreme rewind.
 	 *
 	 * If we split this vdev off instead, then we also check the
 	 * original pool's guid. We don't want to consider the vdev
 	 * corrupt if it is partway through a split operation.
 	 */
 	if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) {
 		boolean_t mismatch = B_FALSE;
 		if (spa->spa_trust_config && !spa->spa_extreme_rewind) {
 			if (vd != vd->vdev_top || vd->vdev_guid != top_guid)
 				mismatch = B_TRUE;
 		} else {
 			if (vd->vdev_guid != top_guid &&
 			    vd->vdev_top->vdev_guid != guid)
 				mismatch = B_TRUE;
 		}
 
 		if (mismatch) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			nvlist_free(label);
 			vdev_dbgmsg(vd, "vdev_validate: config guid "
 			    "doesn't match label guid");
 			vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu",
 			    (u_longlong_t)vd->vdev_guid,
 			    (u_longlong_t)vd->vdev_top->vdev_guid);
 			vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, "
 			    "aux_guid %llu", (u_longlong_t)guid,
 			    (u_longlong_t)top_guid, (u_longlong_t)aux_guid);
 			return (0);
 		}
 	}
 
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
 	    &state) != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
 		    ZPOOL_CONFIG_POOL_STATE);
 		return (0);
 	}
 
 	nvlist_free(label);
 
 	/*
 	 * If this is a verbatim import, no need to check the
 	 * state of the pool.
 	 */
 	if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
 	    spa_load_state(spa) == SPA_LOAD_OPEN &&
 	    state != POOL_STATE_ACTIVE) {
 		vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) "
 		    "for spa %s", (u_longlong_t)state, spa->spa_name);
 		return (SET_ERROR(EBADF));
 	}
 
 	/*
 	 * If we were able to open and validate a vdev that was
 	 * previously marked permanently unavailable, clear that state
 	 * now.
 	 */
 	if (vd->vdev_not_present)
 		vd->vdev_not_present = 0;
 
 	return (0);
 }
 
 /*
  * Bring the string *dvd in line with svd and log the change.
  *
  *   svd == NULL   - nothing happens; *dvd is left untouched.
  *   *dvd == NULL  - *dvd is set to a copy of svd.
  *   both present  - if they differ, *dvd is freed and replaced by a copy
  *                   of svd; identical strings are left alone.
  *
  * prefix names the field in the "changed" log message and guid identifies
  * the vdev in both messages.
  */
 static void
 vdev_update_path(const char *prefix, char *svd, char **dvd, uint64_t guid)
 {
 	/* Without a source string there is nothing to copy. */
 	if (svd == NULL)
 		return;

 	/* The destination is empty: adopt the source value. */
 	if (*dvd == NULL) {
 		*dvd = spa_strdup(svd);
 		zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
 		    (u_longlong_t)guid, *dvd);
 		return;
 	}

 	/* Both are set: only act when the value really changed. */
 	if (strcmp(svd, *dvd) == 0)
 		return;

 	zfs_dbgmsg("vdev_copy_path: vdev %llu: %s changed "
 	    "from '%s' to '%s'", (u_longlong_t)guid, prefix,
 	    *dvd, svd);
 	spa_strfree(*dvd);
 	*dvd = spa_strdup(svd);
 }
 
 /*
  * Copy the path-like attributes (vdev_path, vdev_devid, vdev_physpath and
  * vdev_enc_sysfs_path) of a single vdev from svd to dvd.
  *
  * The first three go through vdev_update_path(), which never clears a
  * destination value.  The enclosure sysfs path is handled separately
  * because it may also disappear: when svd has none, dvd's is freed and
  * set to NULL.
  */
 static void
 vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
 {
 	char *old, *new;
 
 	vdev_update_path("vdev_path", svd->vdev_path, &dvd->vdev_path,
 	    dvd->vdev_guid);
 
 	vdev_update_path("vdev_devid", svd->vdev_devid, &dvd->vdev_devid,
 	    dvd->vdev_guid);
 
 	vdev_update_path("vdev_physpath", svd->vdev_physpath,
 	    &dvd->vdev_physpath, dvd->vdev_guid);
 
 	/*
 	 * Our enclosure sysfs path may have changed between imports
 	 */
 	old = dvd->vdev_enc_sysfs_path;
 	new = svd->vdev_enc_sysfs_path;
 	if ((old != NULL && new == NULL) ||
 	    (old == NULL && new != NULL) ||
 	    ((old != NULL && new != NULL) && strcmp(new, old) != 0)) {
 		/*
 		 * Either side may be NULL here.  Passing a null pointer for
 		 * a %s conversion is undefined behavior, so substitute an
 		 * explicit placeholder for the log message.
 		 */
 		zfs_dbgmsg("vdev_copy_path: vdev %llu: vdev_enc_sysfs_path "
 		    "changed from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
 		    old != NULL ? old : "(null)",
 		    new != NULL ? new : "(null)");
 
 		if (old != NULL)
 			spa_strfree(old);
 
 		dvd->vdev_enc_sysfs_path =
 		    (new != NULL) ? spa_strdup(new) : NULL;
 	}
 }
 
 /*
  * Copy vdev paths from the tree rooted at 'svd' into the tree rooted at
  * 'dvd', walking both trees in lock step.  The two trees must have the
  * same geometry: a mismatch in vdev type, guid or child count is reported
  * as EINVAL.  Intended to copy paths from userland config into MOS config.
  *
  * Returns 0 on success or the first error hit during the walk.
  */
 int
 vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd)
 {
 	int error;
 
 	/*
 	 * Nothing is copied, and no error is raised, for a missing source,
 	 * for a pair of holes, or for an indirect destination.
 	 */
 	if (svd->vdev_ops == &vdev_missing_ops)
 		return (0);
 	if (svd->vdev_ishole && dvd->vdev_ishole)
 		return (0);
 	if (dvd->vdev_ops == &vdev_indirect_ops)
 		return (0);
 
 	if (svd->vdev_ops != dvd->vdev_ops) {
 		vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s",
 		    svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (svd->vdev_guid != dvd->vdev_guid) {
 		vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != "
 		    "%llu)", (u_longlong_t)svd->vdev_guid,
 		    (u_longlong_t)dvd->vdev_guid);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (svd->vdev_children != dvd->vdev_children) {
 		vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: "
 		    "%llu != %llu", (u_longlong_t)svd->vdev_children,
 		    (u_longlong_t)dvd->vdev_children);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* Children first; the first failure aborts the whole walk. */
 	for (uint64_t c = 0; c < svd->vdev_children; c++) {
 		error = vdev_copy_path_strict(svd->vdev_child[c],
 		    dvd->vdev_child[c]);
 		if (error != 0)
 			return (error);
 	}
 
 	if (svd->vdev_ops->vdev_op_leaf)
 		vdev_copy_path_impl(svd, dvd);
 
 	return (0);
 }
 
 /*
  * For every concrete leaf under 'dvd', look up a vdev with the same guid
  * inside the source top-level vdev 'stvd' and, if one of the same type is
  * found, copy its paths over.  'stvd' must be a top-level vdev with the
  * same id as the top-level vdev of 'dvd'.
  */
 static void
 vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd)
 {
 	vdev_t *match;
 
 	ASSERT(stvd->vdev_top == stvd);
 	ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id);
 
 	for (uint64_t c = 0; c < dvd->vdev_children; c++)
 		vdev_copy_path_search(stvd, dvd->vdev_child[c]);
 
 	if (dvd->vdev_ops->vdev_op_leaf && vdev_is_concrete(dvd)) {
 		/*
 		 * The idea here is that while a vdev can shift positions
 		 * within a top vdev (when replacing, attaching mirror, etc.)
 		 * it cannot step outside of it.
 		 */
 		match = vdev_lookup_by_guid(stvd, dvd->vdev_guid);
 
 		if (match != NULL && match->vdev_ops == dvd->vdev_ops) {
 			ASSERT(match->vdev_ops->vdev_op_leaf);
 			vdev_copy_path_impl(match, dvd);
 		}
 	}
 }
 
 /*
  * Copy vdev paths from the source root vdev 'srvd' to the destination root
  * vdev 'drvd' without requiring identical geometry.  Top-level children are
  * paired by index, up to the smaller of the two child counts, and each
  * destination leaf is matched by guid within its paired source top-level
  * vdev.  Intended to copy paths from userland config into MOS config.
  */
 void
 vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd)
 {
 	uint64_t common = MIN(srvd->vdev_children, drvd->vdev_children);
 	uint64_t c = 0;
 
 	ASSERT(srvd->vdev_ops == &vdev_root_ops);
 	ASSERT(drvd->vdev_ops == &vdev_root_ops);
 
 	while (c < common) {
 		vdev_copy_path_search(srvd->vdev_child[c],
 		    drvd->vdev_child[c]);
 		c++;
 	}
 }
 
 /*
  * Close a virtual device.
  *
  * Calls the vdev's vdev_op_close method, records the state the vdev had
  * before closing in vdev_prevstate, and then marks the vdev OFFLINE (when
  * vdev_offline is set) or CLOSED, with the aux state reset to
  * VDEV_AUX_NONE.
  */
 void
 vdev_close(vdev_t *vd)
 {
 	vdev_t *pvd;
 	spa_t *spa __maybe_unused;
 
 	/* Check the pointer before it is dereferenced for the first time. */
 	ASSERT(vd != NULL);
 
 	pvd = vd->vdev_parent;
 	spa = vd->vdev_spa;
 
 	ASSERT(vd->vdev_open_thread == curthread ||
 	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	/*
 	 * If our parent is reopening, then we are as well, unless we are
 	 * going offline.
 	 */
 	if (pvd != NULL && pvd->vdev_reopening)
 		vd->vdev_reopening = !vd->vdev_offline;
 
 	vd->vdev_ops->vdev_op_close(vd);
 
 	/*
 	 * We record the previous state before we close it, so that if we are
 	 * doing a reopen(), we don't generate FMA ereports if we notice that
 	 * it's still faulted.
 	 */
 	vd->vdev_prevstate = vd->vdev_state;
 
 	if (vd->vdev_offline)
 		vd->vdev_state = VDEV_STATE_OFFLINE;
 	else
 		vd->vdev_state = VDEV_STATE_CLOSED;
 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 }
 
 /*
  * Recursively invoke the vdev_op_hold method on every leaf below 'vd'
  * that provides one.  Does nothing while the pool state is still
  * POOL_STATE_UNINITIALIZED.
  */
 void
 vdev_hold(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(spa_is_root(spa));
 
 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
 		for (int i = 0; i < vd->vdev_children; i++)
 			vdev_hold(vd->vdev_child[i]);
 
 		if (vd->vdev_ops->vdev_op_leaf &&
 		    vd->vdev_ops->vdev_op_hold != NULL) {
 			vd->vdev_ops->vdev_op_hold(vd);
 		}
 	}
 }
 
 /*
  * Recursively invoke the vdev_op_rele method on every leaf below 'vd'
  * that provides one.
  */
 void
 vdev_rele(vdev_t *vd)
 {
 	ASSERT(spa_is_root(vd->vdev_spa));
 
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_rele(vd->vdev_child[i]);
 	}
 
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    vd->vdev_ops->vdev_op_rele != NULL) {
 		vd->vdev_ops->vdev_op_rele(vd);
 	}
 }
 
 /*
  * Reopen all interior vdevs and any unopened leaves.  We don't actually
  * reopen leaf vdevs which had previously been opened as they might deadlock
  * on the spa_config_lock.  Instead we only obtain the leaf's physical size.
  * If the leaf has never been opened then open it, as usual.
  *
  * The caller must hold all SCL_STATE_ALL config locks as writer.  The
  * sequence is: close, open, validate (aux or regular), drop a pending
  * resilver request if no resilver is needed any more, and finally
  * propagate the resulting state.
  */
 void
 vdev_reopen(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	/* set the reopening flag unless we're taking the vdev offline */
 	vd->vdev_reopening = !vd->vdev_offline;
 	vdev_close(vd);
 	/* The open result is deliberately ignored here. */
 	(void) vdev_open(vd);
 
 	/*
 	 * Call vdev_validate() here to make sure we have the same device.
 	 * Otherwise, a device with an invalid label could be successfully
 	 * opened in response to vdev_reopen().
 	 */
 	if (vd->vdev_aux) {
 		(void) vdev_validate_aux(vd);
 		/* Only a usable L2ARC device is (re)registered with L2ARC. */
 		if (vdev_readable(vd) && vdev_writeable(vd) &&
 		    vd->vdev_aux == &spa->spa_l2cache) {
 			/*
 			 * In case the vdev is present we should evict all ARC
 			 * buffers and pointers to log blocks and reclaim their
 			 * space before restoring its contents to L2ARC.
 			 */
 			if (l2arc_vdev_present(vd)) {
 				l2arc_rebuild_vdev(vd, B_TRUE);
 			} else {
 				l2arc_add_vdev(spa, vd);
 			}
 			spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
 			spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
 		}
 	} else {
 		(void) vdev_validate(vd);
 	}
 
 	/*
 	 * Recheck if resilver is still needed and cancel any
 	 * scheduled resilver if resilver is unneeded.
 	 */
 	if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) &&
 	    spa->spa_async_tasks & SPA_ASYNC_RESILVER) {
 		/* The task bit is cleared under spa_async_lock. */
 		mutex_enter(&spa->spa_async_lock);
 		spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER;
 		mutex_exit(&spa->spa_async_lock);
 	}
 
 	/*
 	 * Reassess parent vdev's health.
 	 */
 	vdev_propagate_state(vd);
 }
 
 /*
  * Open 'vd' for creation, load its DTLs and initialize its labels.
  *
  * txg         - passed through to vdev_label_init()
  * isreplacing - selects VDEV_LABEL_REPLACE instead of VDEV_LABEL_CREATE
  *
  * Returns 0 on success.  On any failure the vdev is closed again and the
  * error is returned; ENXIO is used when the open succeeded but the vdev
  * did not end up healthy.
  */
 int
 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
 {
 	int error;
 
 	/*
 	 * Normally, partial opens (e.g. of a mirror) are allowed.
 	 * For a create, however, we want to fail the request if
 	 * there are any components we can't open.
 	 */
 	error = vdev_open(vd);
 	if (error != 0) {
 		vdev_close(vd);
 		return (error);
 	}
 	if (vd->vdev_state != VDEV_STATE_HEALTHY) {
 		vdev_close(vd);
 		return (SET_ERROR(ENXIO));
 	}
 
 	/*
 	 * Recursively load DTLs and initialize all labels.
 	 */
 	error = vdev_dtl_load(vd);
 	if (error == 0) {
 		error = vdev_label_init(vd, txg, isreplacing ?
 		    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE);
 	}
 	if (error != 0) {
 		vdev_close(vd);
 		return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Choose the metaslab shift (log2 of the metaslab size) for 'vd' from its
  * allocatable size and store it in vd->vdev_ms_shift.
  *
  * There are two dimensions to the metaslab sizing calculation:
  * the size of the metaslab and the count of metaslabs per vdev.
  *
  * The default values used below are a good balance between memory
  * usage (larger metaslab size means more memory needed for loaded
  * metaslabs; more metaslabs means more memory needed for the
  * metaslab_t structs), metaslab load time (larger metaslabs take
  * longer to load), and metaslab sync time (more metaslabs means
  * more time spent syncing all of them).
  *
  * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
  * The range of the dimensions are as follows:
  *
  *	2^29 <= ms_size  <= 2^34
  *	  16 <= ms_count <= 131,072
  *
  * On the lower end of vdev sizes, we aim for metaslabs sizes of
  * at least 512MB (2^29) to minimize fragmentation effects when
  * testing with smaller devices.  However, the count constraint
  * of at least 16 metaslabs will override this minimum size goal.
  *
  * On the upper end of vdev sizes, we aim for a maximum metaslab
  * size of 16GB.  However, we will cap the total count to 2^17
  * metaslabs to keep our memory footprint in check and let the
  * metaslab size grow from there if that limit is hit.
  *
  * The net effect of applying the above constraints is summarized below.
  *
  *   vdev size       metaslab count
  *  --------------|-----------------
  *      < 8GB        ~16
  *  8GB   - 100GB   one per 512MB
  *  100GB - 3TB     ~200
  *  3TB   - 2PB     one per 16GB
  *      > 2PB       ~131,072
  *  --------------------------------
  *
  * Finally, note that all of the above calculate the initial
  * number of metaslabs. Expanding a top-level vdev will result
  * in additional metaslabs being allocated making it possible
  * to exceed the zfs_vdev_ms_count_limit.
  */
 void
 vdev_metaslab_set_size(vdev_t *vd)
 {
 	uint64_t asize = vd->vdev_asize;
 	/* Metaslab count we would get with the default shift. */
 	uint64_t nominal_count = asize >> zfs_vdev_default_ms_shift;
 	uint64_t shift;
 
 	if (nominal_count < zfs_vdev_min_ms_count) {
 		/* Too few metaslabs: shrink them to reach the minimum. */
 		shift = highbit64(asize / zfs_vdev_min_ms_count);
 	} else if (nominal_count > zfs_vdev_default_ms_count) {
 		/* Too many metaslabs: grow them toward the default count. */
 		shift = highbit64(asize / zfs_vdev_default_ms_count);
 	} else {
 		shift = zfs_vdev_default_ms_shift;
 	}
 
 	if (shift < SPA_MAXBLOCKSHIFT) {
 		shift = SPA_MAXBLOCKSHIFT;
 	} else if (shift > zfs_vdev_max_ms_shift) {
 		shift = zfs_vdev_max_ms_shift;
 		/* cap the total count to constrain memory footprint */
 		if ((asize >> shift) > zfs_vdev_ms_count_limit) {
 			shift = highbit64(asize / zfs_vdev_ms_count_limit);
 		}
 	}
 
 	vd->vdev_ms_shift = shift;
 	ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
 }
 
 /*
  * Mark the top-level vdev 'vd' dirty in 'txg'.
  *
  * flags - 0, VDD_METASLAB or VDD_DTL (asserted to be a power of two);
  *         selects the per-vdev txg list that 'arg' is added to
  * arg   - the object added to the selected per-vdev list
  *
  * The vdev itself is always added to the pool's spa_vdev_txg_list.
  */
 void
 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
 {
 	ASSERT(vd == vd->vdev_top);
 	/* indirect vdevs don't have metaslabs or dtls */
 	ASSERT(vdev_is_concrete(vd) || flags == 0);
 	ASSERT(ISP2(flags));
 	ASSERT(spa_writeable(vd->vdev_spa));
 
 	if ((flags & VDD_METASLAB) != 0) {
 		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);
 	}
 
 	if ((flags & VDD_DTL) != 0) {
 		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
 	}
 
 	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
 }
 
 /*
  * Call vdev_dirty() for every leaf below 'vd', passing the leaf as the
  * argument and the leaf's top-level vdev as the vdev being dirtied.
  */
 void
 vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
 {
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_dirty_leaves(vd->vdev_child[i], flags, txg);
 	}
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		vdev_dirty(vd->vdev_top, flags, vd, txg);
 	}
 }
 
 /*
  * DTLs.
  *
  * A vdev's DTL (dirty time log) is the set of transaction groups for which
  * the vdev has less than perfect replication.  There are four kinds of DTL:
  *
  * DTL_MISSING: txgs for which the vdev has no valid copies of the data
  *
  * DTL_PARTIAL: txgs for which data is available, but not fully replicated
  *
  * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
  *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
  *	txgs that was scrubbed.
  *
  * DTL_OUTAGE: txgs which cannot currently be read, whether due to
  *	persistent errors or just some device being offline.
  *	Unlike the other three, the DTL_OUTAGE map is not generally
  *	maintained; it's only computed when needed, typically to
  *	determine whether a device can be detached.
  *
  * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
  * either has the data or it doesn't.
  *
  * For interior vdevs such as mirror and RAID-Z the picture is more complex.
  * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
  * if any child is less than fully replicated, then so is its parent.
  * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
  * comprising only those txgs which appear in more than 'maxfaults' children;
  * those are the txgs we don't have enough replication to read.  For example,
  * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
  * thus, its DTL_MISSING consists of the set of txgs that appear in more than
  * two child DTL_MISSING maps.
  *
  * It should be clear from the above that to compute the DTLs and outage maps
  * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
  * Therefore, that is all we keep on disk.  When loading the pool, or after
  * a configuration change, we generate all other DTLs from first principles.
  */
 /*
  * Add the range starting at 'txg' and spanning 'size' to the DTL of type
  * 't' on 'vd', unless the tree already contains it.  Must not be called
  * on the root vdev.
  */
 void
 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
 	zfs_range_tree_t *dtl = vd->vdev_dtl[t];
 
 	ASSERT(t < DTL_TYPES);
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 	ASSERT(spa_writeable(vd->vdev_spa));
 
 	mutex_enter(&vd->vdev_dtl_lock);
 	if (!zfs_range_tree_contains(dtl, txg, size)) {
 		zfs_range_tree_add(dtl, txg, size);
 	}
 	mutex_exit(&vd->vdev_dtl_lock);
 }
 
 /*
  * Report whether the DTL of type 't' on 'vd' contains the range starting
  * at 'txg' and spanning 'size'.  An empty DTL never contains anything.
  * Must not be called on the root vdev.
  */
 boolean_t
 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
 	zfs_range_tree_t *dtl = vd->vdev_dtl[t];
 	boolean_t found;
 
 	ASSERT(t < DTL_TYPES);
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 
 	/*
 	 * While we are loading the pool, the DTLs have not been loaded yet.
 	 * This isn't a problem but it can result in devices being tried
 	 * which are known to not have the data.  In which case, the import
 	 * is relying on the checksum to ensure that we get the right data.
 	 * Note that while importing we are only reading the MOS, which is
 	 * always checksummed.
 	 */
 	mutex_enter(&vd->vdev_dtl_lock);
 	if (zfs_range_tree_is_empty(dtl)) {
 		found = B_FALSE;
 	} else {
 		found = zfs_range_tree_contains(dtl, txg, size);
 	}
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	return (found);
 }
 
 /*
  * Report whether the DTL of type 't' on 'vd' is empty.  The tree is
  * examined with vdev_dtl_lock held.
  */
 boolean_t
 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
 {
 	zfs_range_tree_t *dtl = vd->vdev_dtl[t];
 	boolean_t is_empty;
 
 	mutex_enter(&vd->vdev_dtl_lock);
 	is_empty = zfs_range_tree_is_empty(dtl);
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	return (is_empty);
 }
 
 /*
  * Default resilver check: a block needs resilvering when its birth txg
  * is in the vdev's DTL_PARTIAL, or when the birth txg is TXG_UNKNOWN.
  * DVAs outside the DTL range can always be skipped.  'dva' and 'psize'
  * are not used.
  */
 boolean_t
 vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
     uint64_t phys_birth)
 {
 	(void) dva;
 	(void) psize;
 
 	if (phys_birth != TXG_UNKNOWN) {
 		return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
 	}
 
 	/* TXG_UNKNOWN is set by sequential resilver. */
 	return (B_TRUE);
 }
 
 /*
  * Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
  * The decision is delegated to the vdev's vdev_op_need_resilver method;
  * leaf vdevs and vdevs without that method always answer B_TRUE.
  */
 boolean_t
 vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
     uint64_t phys_birth)
 {
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 
 	if (vd->vdev_ops->vdev_op_need_resilver != NULL &&
 	    !vd->vdev_ops->vdev_op_leaf) {
 		return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
 		    phys_birth));
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * Returns the lower bound of the DTL range: one less than the smallest
  * value stored in the leaf's DTL_MISSING tree.  The caller must hold
  * vdev_dtl_lock and the tree must not be empty.
  */
 static uint64_t
 vdev_dtl_min(vdev_t *vd)
 {
 	uint64_t lowest;
 
 	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
 	ASSERT3U(zfs_range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
 	ASSERT0(vd->vdev_children);
 
 	lowest = zfs_range_tree_min(vd->vdev_dtl[DTL_MISSING]);
 
 	return (lowest - 1);
 }
 
 /*
  * Returns the upper bound of the leaf's DTL_MISSING tree, as reported by
  * zfs_range_tree_max().  The caller must hold vdev_dtl_lock and the tree
  * must not be empty.
  */
 static uint64_t
 vdev_dtl_max(vdev_t *vd)
 {
 	uint64_t highest;
 
 	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
 	ASSERT3U(zfs_range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
 	ASSERT0(vd->vdev_children);
 
 	highest = zfs_range_tree_max(vd->vdev_dtl[DTL_MISSING]);
 
 	return (highest);
 }
 
 /*
  * Determine if a resilvering vdev should remove any DTL entries from
  * its range. If the vdev was resilvering for the entire duration of the
  * scan then it should excise that range from its DTLs. Otherwise, this
  * vdev is considered partially resilvered and should leave its DTL
  * entries intact. The comment in vdev_dtl_reassess() describes how we
  * excise the DTLs.
  *
  * 'rebuild_done' selects whether the rebuild state of the top-level vdev
  * or the pool's scan state is consulted.  Leaf vdevs only.
  */
 static boolean_t
 vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done)
 {
 	ASSERT0(vd->vdev_children);
 
 	if (vd->vdev_state < VDEV_STATE_DEGRADED ||
 	    vd->vdev_resilver_deferred)
 		return (B_FALSE);
 
 	if (zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
 		return (B_TRUE);
 
 	if (!rebuild_done) {
 		dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
 		dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys;
 
 		/* Resilver not initiated by attach */
 		if (vd->vdev_resilver_txg == 0)
 			return (B_TRUE);
 
 		/*
 		 * When a resilver is initiated the scan will assign the
 		 * scn_max_txg value to the highest txg value that exists
 		 * in all DTLs. If this device's max DTL is not part of this
 		 * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg]
 		 * then it is not eligible for excision.
 		 */
 		if (vdev_dtl_max(vd) > scn->scn_phys.scn_max_txg)
 			return (B_FALSE);
 
 		ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd));
 		ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg);
 		ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg);
 		return (B_TRUE);
 	}
 
 	vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
 	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 
 	/* Rebuild not initiated by attach */
 	if (vd->vdev_rebuild_txg == 0)
 		return (B_TRUE);
 
 	/*
 	 * When a rebuild completes without error then all missing data
 	 * up to the rebuild max txg has been reconstructed and the DTL
 	 * is eligible for excision.
 	 */
 	if (vrp->vrp_rebuild_state != VDEV_REBUILD_COMPLETE ||
 	    vdev_dtl_max(vd) > vrp->vrp_max_txg)
 		return (B_FALSE);
 
 	ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd));
 	ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg);
 	ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg);
 	return (B_TRUE);
 }
 
 /*
  * Reassess DTLs after a config change or scrub completion. If txg == 0 no
  * write operations will be issued to the pool.
  *
  * The subtree rooted at 'vd' is processed children first.  The root vdev,
  * non-concrete vdevs and aux vdevs are skipped after their children have
  * been visited.
  *
  * vd           - root of the subtree to reassess
  * txg          - txg in which changes are dirtied; 0 means dirty nothing
  * scrub_txg    - upper bound of the scanned txg range; 0 disables excision
  * scrub_done   - when set, each leaf's DTL_SCRUB is emptied
  * rebuild_done - consult the rebuild state rather than the scan state when
  *                deciding whether to excise
  * faulting     - treat children of a replacing vdev as unreadable when
  *                computing DTL_OUTAGE
  *
  * The caller must hold the spa config lock (asserted below).
  */
 static void
 vdev_dtl_reassess_impl(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
     boolean_t scrub_done, boolean_t rebuild_done, boolean_t faulting)
 {
 	spa_t *spa = vd->vdev_spa;
 	avl_tree_t reftree;
 	int minref;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	/* Children first, so that interior vdevs see up-to-date child DTLs. */
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_dtl_reassess_impl(vd->vdev_child[c], txg,
 		    scrub_txg, scrub_done, rebuild_done, faulting);
 
 	if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
 		return;
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 		vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
 		boolean_t check_excise = B_FALSE;
 		boolean_t wasempty = B_TRUE;
 
 		mutex_enter(&vd->vdev_dtl_lock);
 
 		/*
 		 * If requested, pretend the scan or rebuild completed cleanly.
 		 */
 		if (zfs_scan_ignore_errors) {
 			if (scn != NULL)
 				scn->scn_phys.scn_errors = 0;
 			if (vr != NULL)
 				vr->vr_rebuild_phys.vrp_errors = 0;
 		}
 
 		/* Log the starting state when there is something to excise. */
 		if (scrub_txg != 0 &&
 		    !zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
 			wasempty = B_FALSE;
 			zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d "
 			    "dtl:%llu/%llu errors:%llu",
 			    (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg,
 			    (u_longlong_t)scrub_txg, spa->spa_scrub_started,
 			    (u_longlong_t)vdev_dtl_min(vd),
 			    (u_longlong_t)vdev_dtl_max(vd),
 			    (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0));
 		}
 
 		/*
 		 * If we've completed a scrub/resilver or a rebuild cleanly
 		 * then determine if this vdev should remove any DTLs. We
 		 * only want to excise regions on vdevs that were available
 		 * during the entire duration of this scan.
 		 */
 		if (rebuild_done &&
 		    vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) {
 			check_excise = B_TRUE;
 		} else {
 			if (spa->spa_scrub_started ||
 			    (scn != NULL && scn->scn_phys.scn_errors == 0)) {
 				check_excise = B_TRUE;
 			}
 		}
 
 		if (scrub_txg && check_excise &&
 		    vdev_dtl_should_excise(vd, rebuild_done)) {
 			/*
 			 * We completed a scrub, resilver or rebuild up to
 			 * scrub_txg.  If we did it without rebooting, then
 			 * the scrub dtl will be valid, so excise the old
 			 * region and fold in the scrub dtl.  Otherwise,
 			 * leave the dtl as-is if there was an error.
 			 *
 			 * There's a little trick here: to excise the beginning
 			 * of the DTL_MISSING map, we put it into a reference
 			 * tree and then add a segment with refcnt -1 that
 			 * covers the range [0, scrub_txg).  This means
 			 * that each txg in that range has refcnt -1 or 0.
 			 * We then add DTL_SCRUB with a refcnt of 2, so that
 			 * entries in the range [0, scrub_txg) will have a
 			 * positive refcnt -- either 1 or 2.  We then convert
 			 * the reference tree into the new DTL_MISSING map.
 			 */
 			space_reftree_create(&reftree);
 			space_reftree_add_map(&reftree,
 			    vd->vdev_dtl[DTL_MISSING], 1);
 			space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
 			space_reftree_add_map(&reftree,
 			    vd->vdev_dtl[DTL_SCRUB], 2);
 			space_reftree_generate_map(&reftree,
 			    vd->vdev_dtl[DTL_MISSING], 1);
 			space_reftree_destroy(&reftree);
 
 			if (!zfs_range_tree_is_empty(
 			    vd->vdev_dtl[DTL_MISSING])) {
 				zfs_dbgmsg("update DTL_MISSING:%llu/%llu",
 				    (u_longlong_t)vdev_dtl_min(vd),
 				    (u_longlong_t)vdev_dtl_max(vd));
 			} else if (!wasempty) {
 				zfs_dbgmsg("DTL_MISSING is now empty");
 			}
 		}
 		/* For a leaf, DTL_PARTIAL is rebuilt as a copy of DTL_MISSING. */
 		zfs_range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
 		zfs_range_tree_walk(vd->vdev_dtl[DTL_MISSING],
 		    zfs_range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
 		if (scrub_done)
 			zfs_range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL,
 			    NULL);
 		/* DTL_OUTAGE is recomputed from scratch below. */
 		zfs_range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
 
 		/*
 		 * For the faulting case, treat members of a replacing vdev
 		 * as if they are not available. It's more likely than not that
 		 * a vdev in a replacing vdev could encounter read errors so
 		 * treat it as not being able to contribute.
 		 */
 		if (!vdev_readable(vd) ||
 		    (faulting && vd->vdev_parent != NULL &&
 		    vd->vdev_parent->vdev_ops == &vdev_replacing_ops)) {
 			/* Unreadable: the whole txg range counts as an outage. */
 			zfs_range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
 		} else {
 			zfs_range_tree_walk(vd->vdev_dtl[DTL_MISSING],
 			    zfs_range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
 		}
 
 		/*
 		 * If the vdev was resilvering or rebuilding and no longer
 		 * has any DTLs then reset the appropriate flag and dirty
 		 * the top level so that we persist the change.
 		 */
 		if (txg != 0 &&
 		    zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
 		    zfs_range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
 			if (vd->vdev_rebuild_txg != 0) {
 				vd->vdev_rebuild_txg = 0;
 				vdev_config_dirty(vd->vdev_top);
 			} else if (vd->vdev_resilver_txg != 0) {
 				vd->vdev_resilver_txg = 0;
 				vdev_config_dirty(vd->vdev_top);
 			}
 		}
 
 		mutex_exit(&vd->vdev_dtl_lock);
 
 		/* Dirtying is done after vdev_dtl_lock has been dropped. */
 		if (txg != 0)
 			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
 	} else {
 		/*
 		 * Interior vdev: derive each DTL from the children's DTLs,
 		 * keeping the ranges referenced by at least 'minref' children.
 		 */
 		mutex_enter(&vd->vdev_dtl_lock);
 		for (int t = 0; t < DTL_TYPES; t++) {
 			/* account for child's outage in parent's missing map */
 			int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
 			if (t == DTL_SCRUB) {
 				/* leaf vdevs only */
 				continue;
 			}
 			if (t == DTL_PARTIAL) {
 				/* i.e. non-zero */
 				minref = 1;
 			} else if (vdev_get_nparity(vd) != 0) {
 				/* RAIDZ, DRAID */
 				minref = vdev_get_nparity(vd) + 1;
 			} else {
 				/* any kind of mirror */
 				minref = vd->vdev_children;
 			}
 			space_reftree_create(&reftree);
 			for (int c = 0; c < vd->vdev_children; c++) {
 				vdev_t *cvd = vd->vdev_child[c];
 				/* Child lock is taken inside the parent's. */
 				mutex_enter(&cvd->vdev_dtl_lock);
 				space_reftree_add_map(&reftree,
 				    cvd->vdev_dtl[s], 1);
 				mutex_exit(&cvd->vdev_dtl_lock);
 			}
 			space_reftree_generate_map(&reftree,
 			    vd->vdev_dtl[t], minref);
 			space_reftree_destroy(&reftree);
 		}
 		mutex_exit(&vd->vdev_dtl_lock);
 	}
 
 	/* Notify raidz whenever a vdev under a raidz top-level is reassessed. */
 	if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) {
 		raidz_dtl_reassessed(vd);
 	}
 }
 
 /*
  * Public entry point for DTL reassessment: identical to
  * vdev_dtl_reassess_impl() with 'faulting' set to B_FALSE.
  *
  * vd           - root of the subtree to reassess
  * txg          - txg in which changes are dirtied; 0 means dirty nothing
  * scrub_txg    - upper bound of the scanned txg range, or 0
  * scrub_done   - a scrub has completed
  * rebuild_done - a rebuild has completed
  */
 void
 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
     boolean_t scrub_done, boolean_t rebuild_done)
 {
 	/*
 	 * ISO C forbids "return <expression>" in a function returning void,
 	 * so call the implementation as a plain statement.
 	 */
 	vdev_dtl_reassess_impl(vd, txg, scrub_txg, scrub_done,
 	    rebuild_done, B_FALSE);
 }
 
 /*
  * Walk 'vd' and all of its descendants and post a kobj event for each
  * vdev whose ops provide vdev_op_kobj_evt_post and whose vdev_kobj_flag
  * is not yet set.  The flag is set before the event is posted so that
  * each vdev posts at most once until the flag is cleared again.
  * NOTE(review): the original comment says spares are excluded; nothing
  * in this function filters them -- confirm against the callers.
  */
 void
 vdev_post_kobj_evt(vdev_t *vd)
 {
 	if (vd->vdev_ops->vdev_op_kobj_evt_post != NULL) {
 		if (vd->vdev_kobj_flag == B_FALSE) {
 			vd->vdev_kobj_flag = B_TRUE;
 			vd->vdev_ops->vdev_op_kobj_evt_post(vd);
 		}
 	}
 
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_post_kobj_evt(vd->vdev_child[i]);
 	}
 }
 
 /*
  * Walk 'vd' and all of its descendants and reset vdev_kobj_flag on each.
  * NOTE(review): the original comment says spares are excluded; nothing
  * in this function filters them -- confirm against the callers.
  */
 void
 vdev_clear_kobj_evt(vdev_t *vd)
 {
 	vd->vdev_kobj_flag = B_FALSE;
 
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_clear_kobj_evt(vd->vdev_child[i]);
 	}
 }
 
 /*
  * Load the on-disk DTL of every leaf below 'vd' into its in-core
  * DTL_MISSING tree.
  *
  * For a leaf with a DTL object, the space map is opened and its
  * allocations are merged into DTL_MISSING.  For any other vdev the
  * children are loaded in order, stopping at the first failure.
  *
  * Returns 0 on success or the first error encountered.
  */
 int
 vdev_dtl_load(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	zfs_range_tree_t *rt;
 	int error = 0;
 
 	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_dtl_object == 0) {
 		for (int c = 0; c < vd->vdev_children && error == 0; c++)
 			error = vdev_dtl_load(vd->vdev_child[c]);
 
 		return (error);
 	}
 
 	ASSERT(vdev_is_concrete(vd));
 
 	/*
 	 * If the dtl cannot be sync'd there is no need to open it.
 	 */
 	if (spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps)
 		return (0);
 
 	error = space_map_open(&vd->vdev_dtl_sm, mos,
 	    vd->vdev_dtl_object, 0, -1ULL, 0);
 	if (error != 0)
 		return (error);
 	ASSERT(vd->vdev_dtl_sm != NULL);
 
 	/* Load into a scratch tree first, then merge under the DTL lock. */
 	rt = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
 	error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC);
 	if (error == 0) {
 		mutex_enter(&vd->vdev_dtl_lock);
 		zfs_range_tree_walk(rt, zfs_range_tree_add,
 		    vd->vdev_dtl[DTL_MISSING]);
 		mutex_exit(&vd->vdev_dtl_lock);
 	}
 
 	zfs_range_tree_vacate(rt, NULL, NULL);
 	zfs_range_tree_destroy(rt);
 
 	return (error);
 }
 
 /*
  * Record the allocation bias of the top-level vdev 'vd' in its top-level
  * ZAP under VDEV_TOP_ZAP_ALLOCATION_BIAS.  The bias must not be
  * VDEV_BIAS_NONE.  For special and dedup vdevs the pool's allocation
  * classes are activated as well.
  */
 static void
 vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
 	const char *string;
 
 	ASSERT(alloc_bias != VDEV_BIAS_NONE);
 
 	switch (alloc_bias) {
 	case VDEV_BIAS_LOG:
 		string = VDEV_ALLOC_BIAS_LOG;
 		break;
 	case VDEV_BIAS_SPECIAL:
 		string = VDEV_ALLOC_BIAS_SPECIAL;
 		break;
 	case VDEV_BIAS_DEDUP:
 		string = VDEV_ALLOC_BIAS_DEDUP;
 		break;
 	default:
 		string = NULL;
 		break;
 	}
 
 	ASSERT(string != NULL);
 	/* Stored as a byte array that includes the terminating NUL. */
 	VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS,
 	    1, strlen(string) + 1, string, tx));
 
 	if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) {
 		spa_activate_allocation_classes(spa, tx);
 	}
 }
 
 /*
  * Destroy the ZAP object 'zapobj' in the MOS and remove it from the
  * pool's spa_all_vdev_zaps set.  Both steps are VERIFY'd to succeed.
  */
 void
 vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
 {
 	spa_t *pool = vd->vdev_spa;
 
 	VERIFY0(zap_destroy(pool->spa_meta_objset, zapobj, tx));
 	VERIFY0(zap_remove_int(pool->spa_meta_objset,
 	    pool->spa_all_vdev_zaps, zapobj, tx));
 }
 
 /*
  * Create a new ZAP object in the MOS and add it to the pool's
  * spa_all_vdev_zaps set.  Returns the object number of the new ZAP.
  */
 uint64_t
 vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx)
 {
 	spa_t *pool = vd->vdev_spa;
 	uint64_t obj;
 
 	obj = zap_create(pool->spa_meta_objset, DMU_OTN_ZAP_METADATA,
 	    DMU_OT_NONE, 0, tx);
 	ASSERT(obj != 0);
 
 	VERIFY0(zap_add_int(pool->spa_meta_objset, pool->spa_all_vdev_zaps,
 	    obj, tx));
 
 	return (obj);
 }
 
 /*
  * Recursively create any missing per-vdev ZAPs for 'vd' and its
  * descendants:
  *
  *  - the root vdev gets a root ZAP when SPA_FEATURE_AVZ_V2 is enabled,
  *    activating the feature first if it is not active yet;
  *  - other vdevs, except holes, missing vdevs and vdevs whose top-level
  *    vdev is being removed, get a leaf ZAP (leaves) and/or a top-level ZAP
  *    (top-level vdevs).  A new top-level ZAP also records the allocation
  *    bias when one is set.
  */
 void
 vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
 {
 	if (vd->vdev_ops == &vdev_root_ops) {
 		if (vd->vdev_root_zap == 0 &&
 		    spa_feature_is_enabled(vd->vdev_spa,
 		    SPA_FEATURE_AVZ_V2)) {
 			if (!spa_feature_is_active(vd->vdev_spa,
 			    SPA_FEATURE_AVZ_V2)) {
 				spa_feature_incr(vd->vdev_spa,
 				    SPA_FEATURE_AVZ_V2, tx);
 			}
 			vd->vdev_root_zap = vdev_create_link_zap(vd, tx);
 		}
 	} else if (vd->vdev_ops != &vdev_hole_ops &&
 	    vd->vdev_ops != &vdev_missing_ops &&
 	    !vd->vdev_top->vdev_removing) {
 		if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0)
 			vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
 
 		if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
 			vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
 			if (vd->vdev_alloc_bias != VDEV_BIAS_NONE)
 				vdev_zap_allocation_data(vd, tx);
 		}
 	}
 
 	for (uint64_t c = 0; c < vd->vdev_children; c++)
 		vdev_construct_zaps(vd->vdev_child[c], tx);
 }
 
 /*
  * Write the in-core DTL_MISSING tree of the leaf vdev 'vd' to its DTL
  * space map in transaction group 'txg'.
  *
  * If the leaf has been detached, or its top-level vdev is being removed,
  * the space map is freed instead (and possibly the leaf ZAP, see below).
  * If the leaf has no space map yet, one is allocated first; in that case
  * the top-level vdev's config is dirtied so the new object number is
  * recorded.
  */
 static void
 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	zfs_range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
 	objset_t *mos = spa->spa_meta_objset;
 	zfs_range_tree_t *rtsync;
 	dmu_tx_t *tx;
 	/*
 	 * Object number before any change, compared again at the end.
 	 * NOTE(review): vdev_dtl_sm may be NULL at this point (see the
 	 * allocation below) -- presumably space_map_object() tolerates
 	 * that; confirm.
 	 */
 	uint64_t object = space_map_object(vd->vdev_dtl_sm);
 
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 
 	if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
 		/* The leaf is going away: free its DTL space map. */
 		mutex_enter(&vd->vdev_dtl_lock);
 		space_map_free(vd->vdev_dtl_sm, tx);
 		space_map_close(vd->vdev_dtl_sm);
 		vd->vdev_dtl_sm = NULL;
 		mutex_exit(&vd->vdev_dtl_lock);
 
 		/*
 		 * We only destroy the leaf ZAP for detached leaves or for
 		 * removed log devices. Removed data devices handle leaf ZAP
 		 * cleanup later, once cancellation is no longer possible.
 		 */
 		if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
 		    vd->vdev_top->vdev_islog)) {
 			vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
 			vd->vdev_leaf_zap = 0;
 		}
 
 		dmu_tx_commit(tx);
 		return;
 	}
 
 	/* First sync for this leaf: allocate and open a new space map. */
 	if (vd->vdev_dtl_sm == NULL) {
 		uint64_t new_object;
 
 		new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx);
 		VERIFY3U(new_object, !=, 0);
 
 		VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
 		    0, -1ULL, 0));
 		ASSERT(vd->vdev_dtl_sm != NULL);
 	}
 
 	rtsync = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
 
 	/* Copy the DTL under the lock; the write below runs without it. */
 	mutex_enter(&vd->vdev_dtl_lock);
 	zfs_range_tree_walk(rt, zfs_range_tree_add, rtsync);
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	/* Replace the on-disk contents with the copied tree. */
 	space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx);
 	space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx);
 	zfs_range_tree_vacate(rtsync, NULL, NULL);
 
 	zfs_range_tree_destroy(rtsync);
 
 	/*
 	 * If the object for the space map has changed then dirty
 	 * the top level so that we update the config.
 	 */
 	if (object != space_map_object(vd->vdev_dtl_sm)) {
 		vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, "
 		    "new object %llu", (u_longlong_t)txg, spa_name(spa),
 		    (u_longlong_t)object,
 		    (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
 		vdev_config_dirty(vd->vdev_top);
 	}
 
 	dmu_tx_commit(tx);
 }
 
 /*
  * Determine whether the specified vdev can be
  * - offlined
  * - detached
  * - removed
  * - faulted
  * without losing data.
  *
  * Returns B_TRUE when the vdev is still required, i.e. when taking it
  * away would leave an outage in its top-level vdev.  The root vdev and
  * top-level vdevs are always reported as required.  The caller must hold
  * all SCL_STATE_ALL config locks as writer.
  */
 boolean_t
 vdev_dtl_required(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *top = vd->vdev_top;
 	uint8_t saved_cant_read = vd->vdev_cant_read;
 	boolean_t faulting = vd->vdev_state == VDEV_STATE_FAULTED;
 	boolean_t required;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	if (vd == spa->spa_root_vdev || vd == top) {
 		return (B_TRUE);
 	}
 
 	/*
 	 * Temporarily mark the device as unreadable, and then determine
 	 * whether this results in any DTL outages in the top-level vdev.
 	 * If not, we can safely offline/detach/remove the device.
 	 */
 	vd->vdev_cant_read = B_TRUE;
 	vdev_dtl_reassess_impl(top, 0, 0, B_FALSE, B_FALSE, faulting);
 	required = !vdev_dtl_empty(top, DTL_OUTAGE);
 
 	/* Put the real readability back and recompute the DTLs once more. */
 	vd->vdev_cant_read = saved_cant_read;
 	vdev_dtl_reassess_impl(top, 0, 0, B_FALSE, B_FALSE, faulting);
 
 	/* Fault injection may still declare the device required. */
 	if (!required && zio_injection_enabled) {
 		required = !!zio_handle_device_injection(vd, NULL,
 		    SET_ERROR(ECHILD));
 	}
 
 	return (required);
 }
 
 /*
  * Determine if resilver is needed, and if so the txg range.
  *
  * A leaf needs resilvering when its DTL_MISSING tree is non-empty and the
  * leaf is writeable.  An interior vdev needs it when any child does, in
  * which case the range spans from the smallest child minimum to the
  * largest child maximum.
  *
  * minp and maxp are optional out-parameters.  Each one is written only
  * when it is non-NULL and B_TRUE is returned.
  */
 boolean_t
 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
 {
 	boolean_t needed = B_FALSE;
 	uint64_t thismin = UINT64_MAX;
 	uint64_t thismax = 0;
 
 	if (vd->vdev_children == 0) {
 		/* Leaf: consult our own DTL under the DTL lock. */
 		mutex_enter(&vd->vdev_dtl_lock);
 		if (!zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
 		    vdev_writeable(vd)) {
 
 			thismin = vdev_dtl_min(vd);
 			thismax = vdev_dtl_max(vd);
 			needed = B_TRUE;
 		}
 		mutex_exit(&vd->vdev_dtl_lock);
 	} else {
 		/* Interior vdev: merge the ranges reported by the children. */
 		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 			uint64_t cmin, cmax;
 
 			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
 				thismin = MIN(thismin, cmin);
 				thismax = MAX(thismax, cmax);
 				needed = B_TRUE;
 			}
 		}
 	}
 
 	/*
 	 * Guard each out-parameter on its own so that a caller passing
 	 * only one of them does not trigger a NULL dereference.
 	 */
 	if (needed) {
 		if (minp != NULL)
 			*minp = thismin;
 		if (maxp != NULL)
 			*maxp = thismax;
 	}
 	return (needed);
 }
 
 /*
  * Look up the checkpoint space map object recorded in the vdev's top-level
  * ZAP.  On success (return value 0) *sm_obj holds the object number, or
  * zero when the vdev has no top-level ZAP or the ZAP has no such entry.
  * Any lookup error other than ENOENT is passed back to the caller.
  */
 int
 vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj)
 {
 	int err;
 
 	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
 
 	if (vd->vdev_top_zap != 0) {
 		err = zap_lookup(spa_meta_objset(vd->vdev_spa),
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
 		    sizeof (uint64_t), 1, sm_obj);
 		if (err != ENOENT)
 			return (err);
 	}
 
 	/* No ZAP, or no checkpoint entry in it: report "none". */
 	*sm_obj = 0;
 	return (0);
 }
 
 /*
  * Load the persistent state of a vdev and, recursively, of all of its
  * children.
  *
  * Returns 0 on success or the first non-zero error encountered.  Several
  * of the failure paths also move the vdev to VDEV_STATE_CANT_OPEN with
  * VDEV_AUX_CORRUPT_DATA before returning; a few only log and return.
  */
 int
 vdev_load(vdev_t *vd)
 {
 	int children = vd->vdev_children;
 	int error = 0;
 	taskq_t *tq = NULL;
 
 	/*
 	 * It's only worthwhile to use the taskq for the root vdev, because the
 	 * slow part is metaslab_init, and that only happens for top-level
 	 * vdevs.
 	 */
 	if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) {
 		tq = taskq_create("vdev_load", children, minclsyspri,
 		    children, children, TASKQ_PREPOPULATE);
 	}
 
 	/*
 	 * Recursively load all children.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		/*
 		 * Without a taskq, or for children where vdev_uses_zvols()
 		 * is true, load inline and record the result ourselves.
 		 * Otherwise hand the child to vdev_load_child() on the taskq
 		 * (presumably it stores the result in vdev_load_error --
 		 * confirm against its definition).
 		 */
 		if (tq == NULL || vdev_uses_zvols(cvd)) {
 			cvd->vdev_load_error = vdev_load(cvd);
 		} else {
 			VERIFY(taskq_dispatch(tq, vdev_load_child,
 			    cvd, TQ_SLEEP) != TASKQID_INVALID);
 		}
 	}
 
 	/* Wait for every dispatched child load before inspecting results. */
 	if (tq != NULL) {
 		taskq_wait(tq);
 		taskq_destroy(tq);
 	}
 
 	/* Fail with the first child error, in child order. */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		/* NOTE(review): shadows the function-scope 'error'. */
 		int error = vd->vdev_child[c]->vdev_load_error;
 
 		if (error != 0)
 			return (error);
 	}
 
 	vdev_set_deflate_ratio(vd);
 
 	if (vd->vdev_ops == &vdev_raidz_ops) {
 		error = vdev_raidz_load(vd);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * On spa_load path, grab the allocation bias from our zap
 	 */
 	if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
 		spa_t *spa = vd->vdev_spa;
 		char bias_str[64];
 
 		error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
 		    bias_str);
 		if (error == 0) {
 			ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
 			vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
 		} else if (error != ENOENT) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) "
 			    "failed [error=%d]",
 			    (u_longlong_t)vd->vdev_top_zap, error);
 			return (error);
 		}
 	}
 
 	/*
 	 * Load the failfast property from the top-level zap.  A missing
 	 * entry selects the property default; any other lookup failure is
 	 * logged but does not fail the load.
 	 */
 	if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
 		spa_t *spa = vd->vdev_spa;
 		uint64_t failfast;
 
 		error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
 		    vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast),
 		    1, &failfast);
 		if (error == 0) {
 			vd->vdev_failfast = failfast & 1;
 		} else if (error == ENOENT) {
 			vd->vdev_failfast = vdev_prop_default_numeric(
 			    VDEV_PROP_FAILFAST);
 		} else {
 			vdev_dbgmsg(vd,
 			    "vdev_load: zap_lookup(top_zap=%llu) "
 			    "failed [error=%d]",
 			    (u_longlong_t)vd->vdev_top_zap, error);
 		}
 	}
 
 	/*
 	 * Load any rebuild state from the top-level vdev zap.
 	 */
 	if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
 		error = vdev_rebuild_load(vd);
 		if (error && error != ENOTSUP) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load "
 			    "failed [error=%d]", error);
 			return (error);
 		}
 	}
 
 	/*
 	 * Load the checksum, I/O and slow-I/O N/T properties.  Failures
 	 * other than ENOENT are only logged.  'zapobj' is used solely for
 	 * those log messages.
 	 */
 	if (vd->vdev_top_zap != 0 || vd->vdev_leaf_zap != 0) {
 		uint64_t zapobj;
 
 		if (vd->vdev_top_zap != 0)
 			zapobj = vd->vdev_top_zap;
 		else
 			zapobj = vd->vdev_leaf_zap;
 
 		error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_N,
 		    &vd->vdev_checksum_n);
 		if (error && error != ENOENT)
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
 			    "failed [error=%d]", (u_longlong_t)zapobj, error);
 
 		error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_T,
 		    &vd->vdev_checksum_t);
 		if (error && error != ENOENT)
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
 			    "failed [error=%d]", (u_longlong_t)zapobj, error);
 
 		error = vdev_prop_get_int(vd, VDEV_PROP_IO_N,
 		    &vd->vdev_io_n);
 		if (error && error != ENOENT)
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
 			    "failed [error=%d]", (u_longlong_t)zapobj, error);
 
 		error = vdev_prop_get_int(vd, VDEV_PROP_IO_T,
 		    &vd->vdev_io_t);
 		if (error && error != ENOENT)
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
 			    "failed [error=%d]", (u_longlong_t)zapobj, error);
 
 		error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N,
 		    &vd->vdev_slow_io_n);
 		if (error && error != ENOENT)
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
 			    "failed [error=%d]", (u_longlong_t)zapobj, error);
 
 		error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T,
 		    &vd->vdev_slow_io_t);
 		if (error && error != ENOENT)
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
 			    "failed [error=%d]", (u_longlong_t)zapobj, error);
 	}
 
 	/*
 	 * If this is a top-level vdev, initialize its metaslabs.
 	 */
 	if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
 		vdev_metaslab_group_create(vd);
 
 		if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
 			    "asize=%llu", (u_longlong_t)vd->vdev_ashift,
 			    (u_longlong_t)vd->vdev_asize);
 			return (SET_ERROR(ENXIO));
 		}
 
 		error = vdev_metaslab_init(vd, 0);
 		if (error != 0) {
 			vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
 			    "[error=%d]", error);
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			return (error);
 		}
 
 		/*
 		 * Open the checkpoint space map, if one is recorded.
 		 * NOTE(review): unlike the failures above, the two error
 		 * returns below do not call vdev_set_state() -- confirm
 		 * that this is intentional.
 		 */
 		uint64_t checkpoint_sm_obj;
 		error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj);
 		if (error == 0 && checkpoint_sm_obj != 0) {
 			objset_t *mos = spa_meta_objset(vd->vdev_spa);
 			ASSERT(vd->vdev_asize != 0);
 			ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
 
 			error = space_map_open(&vd->vdev_checkpoint_sm,
 			    mos, checkpoint_sm_obj, 0, vd->vdev_asize,
 			    vd->vdev_ashift);
 			if (error != 0) {
 				vdev_dbgmsg(vd, "vdev_load: space_map_open "
 				    "failed for checkpoint spacemap (obj %llu) "
 				    "[error=%d]",
 				    (u_longlong_t)checkpoint_sm_obj, error);
 				return (error);
 			}
 			ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
 
 			/*
 			 * Since the checkpoint_sm contains free entries
 			 * exclusively we can use space_map_allocated() to
 			 * indicate the cumulative checkpointed space that
 			 * has been freed.
 			 */
 			vd->vdev_stat.vs_checkpoint_space =
 			    -space_map_allocated(vd->vdev_checkpoint_sm);
 			vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
 			    vd->vdev_stat.vs_checkpoint_space;
 		} else if (error != 0) {
 			vdev_dbgmsg(vd, "vdev_load: failed to retrieve "
 			    "checkpoint space map object from vdev ZAP "
 			    "[error=%d]", error);
 			return (error);
 		}
 	}
 
 	/*
 	 * If this is a leaf vdev, load its DTL.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
 		    "[error=%d]", error);
 		return (error);
 	}
 
 	/*
 	 * Open the obsolete space map, if this vdev has one.  Failing to
 	 * look the object up is logged and returned without changing the
 	 * vdev state; failing to open it also marks the vdev CANT_OPEN.
 	 */
 	uint64_t obsolete_sm_object;
 	error = vdev_obsolete_sm_object(vd, &obsolete_sm_object);
 	if (error == 0 && obsolete_sm_object != 0) {
 		objset_t *mos = vd->vdev_spa->spa_meta_objset;
 		ASSERT(vd->vdev_asize != 0);
 		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
 
 		if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
 		    obsolete_sm_object, 0, vd->vdev_asize, 0))) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
 			    "obsolete spacemap (obj %llu) [error=%d]",
 			    (u_longlong_t)obsolete_sm_object, error);
 			return (error);
 		}
 	} else if (error != 0) {
 		vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete "
 		    "space map object from vdev ZAP [error=%d]", error);
 		return (error);
 	}
 
 	return (0);
 }
 
 /*
  * The special vdev case is used for hot spares and l2cache devices.  Its
  * sole purpose is to set the vdev state for the associated vdev.  To do
  * this, we make sure that we can open the underlying device, then try to
  * read the label, and make sure that the label is sane and that it hasn't
  * been repurposed to another pool.
  *
  * Returns 0 when the vdev is not readable or its label checks out, and -1
  * (after marking the vdev CANT_OPEN / CORRUPT_DATA) when the label cannot
  * be read or fails validation.
  */
 int
 vdev_validate_aux(vdev_t *vd)
 {
 	nvlist_t *label;
 	uint64_t version, guid, state;
 	int sane;
 
 	if (!vdev_readable(vd))
 		return (0);
 
 	label = vdev_label_read_config(vd, -1ULL);
 	if (label == NULL) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		return (-1);
 	}
 
 	/*
 	 * The label must carry a supported version, our own guid and a
 	 * pool state.  We don't actually check the pool state here: if the
 	 * device is in fact in use by another pool, we update this fact on
 	 * the fly when requested.
 	 */
 	sane =
 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) == 0 &&
 	    SPA_VERSION_IS_SUPPORTED(version) &&
 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0 &&
 	    guid == vd->vdev_guid &&
 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) == 0;
 
 	if (!sane) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		return (-1);
 	}
 
 	nvlist_free(label);
 	return (0);
 }
 
 /*
  * Free the object referenced by the VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS
  * entry of the vdev's top-level ZAP and remove that entry.  Does nothing
  * when the vdev has no top-level ZAP or the entry does not exist; any
  * other failure trips a VERIFY.
  */
 static void
 vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx)
 {
 	objset_t *mos = spa_meta_objset(vd->vdev_spa);
 	uint64_t txgs_obj = 0;
 	int error;
 
 	if (vd->vdev_top_zap == 0)
 		return;
 
 	error = zap_lookup(mos, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
 	    &txgs_obj);
 	if (error == ENOENT)
 		return;
 	VERIFY0(error);
 
 	/* Free the object first, then drop the ZAP entry that named it. */
 	VERIFY0(dmu_object_free(mos, txgs_obj, tx));
 	VERIFY0(zap_remove(mos, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx));
 }
 
 /*
  * Free the objects used to store this vdev's spacemaps, and the array
  * that points to them.  Also destroys the metaslab flush data and clears
  * vdev_ms_array.  A vdev without a metaslab array is left untouched.
  */
 void
 vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx)
 {
 	objset_t *mos;
 	uint64_t *objs;
 	uint64_t nobjs;
 	size_t nbytes;
 
 	if (vd->vdev_ms_array == 0)
 		return;
 
 	/* Read the whole array of space map object numbers in one go. */
 	mos = vd->vdev_spa->spa_meta_objset;
 	nobjs = vd->vdev_asize >> vd->vdev_ms_shift;
 	nbytes = nobjs * sizeof (uint64_t);
 	objs = kmem_alloc(nbytes, KM_SLEEP);
 	VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0,
 	    nbytes, objs, 0));
 
 	/* A zero slot means that entry has no space map object. */
 	for (uint64_t i = 0; i < nobjs; i++) {
 		if (objs[i] != 0)
 			space_map_free_obj(mos, objs[i], tx);
 	}
 
 	kmem_free(objs, nbytes);
 	VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
 	vdev_destroy_ms_flush_data(vd, tx);
 	vd->vdev_ms_array = 0;
 }
 
 /*
  * Destroy the on-disk metadata of a log top-level vdev: its space maps
  * and, when present, its top-level ZAP.  'txg' must be the currently
  * syncing txg; the work is done in its own assigned transaction.
  */
 static void
 vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	dmu_tx_t *tx;
 
 	ASSERT(vd->vdev_islog);
 	ASSERT(vd == vd->vdev_top);
 	ASSERT3U(txg, ==, spa_syncing_txg(spa));
 
 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 	vdev_destroy_spacemaps(vd, tx);
 
 	const uint64_t top_zap = vd->vdev_top_zap;
 	if (top_zap != 0) {
 		vdev_destroy_unlink_zap(vd, top_zap, tx);
 		vd->vdev_top_zap = 0;
 	}
 
 	dmu_tx_commit(tx);
 }
 
 /*
  * Finish syncing 'txg' for a concrete vdev: run metaslab_sync_done() on
  * every metaslab queued on the TXG_CLEAN(txg) list and, if that list was
  * non-empty on entry, reassess the vdev's metaslab group(s).
  */
 void
 vdev_sync_done(vdev_t *vd, uint64_t txg)
 {
 	const uint64_t clean_txg = TXG_CLEAN(txg);
 	boolean_t had_work = !txg_list_empty(&vd->vdev_ms_list, clean_txg);
 
 	ASSERT(vdev_is_concrete(vd));
 
 	for (metaslab_t *msp = txg_list_remove(&vd->vdev_ms_list, clean_txg);
 	    msp != NULL;
 	    msp = txg_list_remove(&vd->vdev_ms_list, clean_txg)) {
 		metaslab_sync_done(msp, txg);
 	}
 
 	if (!had_work)
 		return;
 
 	metaslab_sync_reassess(vd->vdev_mg);
 	if (vd->vdev_log_mg != NULL)
 		metaslab_sync_reassess(vd->vdev_log_mg);
 }
 
 /*
  * Sync the dirty state of 'vd' for the currently syncing txg: obsolete
  * segments, the metaslab array object (allocated here on first use),
  * dirty metaslabs and dirty DTLs.  On the normal path the vdev is then
  * queued on spa_vdev_txg_list for TXG_CLEAN(txg).
  */
 void
 vdev_sync(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *lvd;
 	metaslab_t *msp;
 
 	ASSERT3U(txg, ==, spa->spa_syncing_txg);
 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 	if (zfs_range_tree_space(vd->vdev_obsolete_segments) > 0) {
 		ASSERT(vd->vdev_removing ||
 		    vd->vdev_ops == &vdev_indirect_ops);
 
 		vdev_indirect_sync_obsolete(vd, tx);
 
 		/*
 		 * If the vdev is indirect, it can't have dirty
 		 * metaslabs or DTLs.
 		 */
 		if (vd->vdev_ops == &vdev_indirect_ops) {
 			ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
 			ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
 			dmu_tx_commit(tx);
 			return;
 		}
 	}
 
 	ASSERT(vdev_is_concrete(vd));
 
 	/*
 	 * Allocate the metaslab array object the first time it is needed
 	 * and dirty the config so the new object number gets recorded.
 	 */
 	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
 	    !vd->vdev_removing) {
 		ASSERT(vd == vd->vdev_top);
 		ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
 		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
 		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
 		ASSERT(vd->vdev_ms_array != 0);
 		vdev_config_dirty(vd);
 	}
 
 	/*
 	 * Sync each dirty metaslab, then move it to the TXG_CLEAN(txg)
 	 * list, which vdev_sync_done() drains.
 	 */
 	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
 		metaslab_sync(msp, txg);
 		(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
 	}
 
 	/* Sync the DTL of every vdev queued on our DTL list for this txg. */
 	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
 		vdev_dtl_sync(lvd, txg);
 
 	/*
 	 * If this is an empty log device being removed, destroy the
 	 * metadata associated with it.
 	 */
 	if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
 		vdev_remove_empty_log(vd, txg);
 
 	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
 	dmu_tx_commit(tx);
 }
 
 /*
  * Return the amount of space that should be (or was) allocated for the
  * given psize (compressed block size) in the given TXG.  The answer comes
  * from the vdev type's vdev_op_asize callback.  Note that for expanded
  * RAIDZ vdevs, the size allocated for older BP's may be larger.  See
  * vdev_raidz_asize().
  */
 uint64_t
 vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg)
 {
 	uint64_t asize = vd->vdev_ops->vdev_op_asize(vd, psize, txg);
 
 	return (asize);
 }
 
 /*
  * Convenience wrapper around vdev_psize_to_asize_txg() that passes a txg
  * of 0.
  */
 uint64_t
 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
 {
 	const uint64_t txg = 0;
 
 	return (vdev_psize_to_asize_txg(vd, psize, txg));
 }
 
 /*
  * Mark the given vdev faulted.  A faulted vdev behaves as if the device
  * could not be opened, and no I/O is attempted.
  *
  * Returns the result of spa_vdev_state_exit(): ENODEV when no vdev has
  * the given guid, ENOTSUP when the vdev is not a leaf.
  */
 int
 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
 {
 	vdev_t *vd;
 	vdev_t *top;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	vd = spa_lookup_by_guid(spa, guid, B_TRUE);
 	if (vd == NULL)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
 
 	top = vd->vdev_top;
 
 	/*
 	 * There are two kinds of forced faults: temporary and persistent.
 	 * Temporary faults go away at pool import, while persistent faults
 	 * (requested with 'zpool offline -f') stay set across reboots.
 	 * Both types of faults can be cleared with a zpool clear.
 	 *
 	 * We tell if a vdev is persistently faulted by looking at the
 	 * ZPOOL_CONFIG_AUX_STATE nvpair.  If it's set to "external" at
 	 * import then it's a persistent fault.  Otherwise, it's temporary.
 	 * We get ZPOOL_CONFIG_AUX_STATE set to "external" by setting
 	 * vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL.  This tells
 	 * vdev_config_generate() (which gets run later) to set
 	 * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist.
 	 */
 	if (aux != VDEV_AUX_EXTERNAL_PERSIST) {
 		vd->vdev_tmpoffline = B_TRUE;
 	} else {
 		vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
 		vd->vdev_tmpoffline = B_FALSE;
 		aux = VDEV_AUX_EXTERNAL;
 	}
 
 	/*
 	 * The aux state is not used directly here, but a later
 	 * vdev_reopen() needs it to remember why we were faulted.
 	 */
 	vd->vdev_label_aux = aux;
 
 	/* Faulted state takes precedence over degraded. */
 	vd->vdev_delayed_close = B_FALSE;
 	vd->vdev_faulted = 1ULL;
 	vd->vdev_degraded = 0ULL;
 	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
 
 	/*
 	 * Unless this device has the only valid copy of the data we are
 	 * done.  Log top-levels and aux devices are never checked.
 	 */
 	if (top->vdev_islog || vd->vdev_aux != NULL ||
 	    !vdev_dtl_required(vd))
 		return (spa_vdev_state_exit(spa, vd, 0));
 
 	/*
 	 * Back off and mark the vdev degraded instead, but only once a
 	 * reopen has shown that the device is not dead.
 	 */
 	vd->vdev_degraded = 1ULL;
 	vd->vdev_faulted = 0ULL;
 	vdev_reopen(top);
 
 	if (vdev_readable(vd))
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Mark the given vdev degraded.  A degraded vdev is purely an indication
  * to the user that something is wrong.  The vdev continues to operate as
  * normal as far as I/O is concerned.
  *
  * Returns ENODEV when no vdev has the given guid and ENOTSUP when the
  * vdev is not a leaf.  A vdev that is already faulted or degraded is left
  * alone and 0 is returned.
  */
 int
 vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
 {
 	vdev_t *vd;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	vd = spa_lookup_by_guid(spa, guid, B_TRUE);
 	if (vd == NULL)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
 
 	if (!vd->vdev_faulted && !vd->vdev_degraded) {
 		vd->vdev_degraded = 1ULL;
 
 		/* Only a live device has its state changed to DEGRADED. */
 		if (!vdev_is_dead(vd)) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
 			    aux);
 		}
 
 		return (spa_vdev_state_exit(spa, vd, 0));
 	}
 
 	/* Already faulted or degraded: nothing to do. */
 	return (spa_vdev_state_exit(spa, NULL, 0));
 }
 
 /*
  * Flag the vdev with the given guid as wanting removal and kick off the
  * asynchronous removal task.
  *
  * Returns ENODEV when no vdev has the given guid, EEXIST when a probe of
  * a leaf vdev succeeds (the device is still there), and 0 otherwise --
  * including when the vdev is already removed or is expanding, in which
  * case nothing is done.
  */
 int
 vdev_remove_wanted(spa_t *spa, uint64_t guid)
 {
 	vdev_t *vd;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	vd = spa_lookup_by_guid(spa, guid, B_TRUE);
 	if (vd == NULL)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
 	/*
 	 * Expansion can trigger repartition add/remove events, so ignore
 	 * the request while expanding, and when already removed.
 	 */
 	if (vd->vdev_removed || vd->vdev_expanding)
 		return (spa_vdev_state_exit(spa, NULL, 0));
 
 	/* Confirm that a leaf is really gone before acting on it. */
 	if (vd->vdev_ops->vdev_op_leaf) {
 		int probe_error = zio_wait(vdev_probe(vd, NULL));
 
 		if (probe_error == 0) {
 			return (spa_vdev_state_exit(spa, NULL,
 			    SET_ERROR(EEXIST)));
 		}
 	}
 
 	vd->vdev_remove_wanted = B_TRUE;
 	spa_async_request(spa, SPA_ASYNC_REMOVE);
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 
 /*
  * Online the given vdev.
  *
  * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things.  First, any attached
  * spare device should be detached when the device finishes resilvering.
  * Second, the online should be treated like a 'test' online case, so no FMA
  * events are generated if the device fails to open.
  *
  * spa      - pool containing the vdev
  * guid     - guid of the vdev to online
  * flags    - ZFS_ONLINE_* flags
  * newstate - if non-NULL, receives the vdev state after the reopen
  *
  * Returns the result of spa_vdev_state_exit(): ENODEV when no vdev has the
  * given guid, ENOTSUP when expansion is requested (or autoexpand is on)
  * for an aux device, and 0 otherwise.
  */
 int
 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
 {
 	vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
 	boolean_t wasoffline;
 	vdev_state_t oldstate;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
 	/* Remember the prior state to decide on the ONLINE event below. */
 	wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
 	oldstate = vd->vdev_state;
 
 	tvd = vd->vdev_top;
 	vd->vdev_offline = B_FALSE;
 	vd->vdev_tmpoffline = B_FALSE;
 	vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
 	vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
 
 	/* XXX - L2ARC 1.0 does not support expansion */
 	if (!vd->vdev_aux) {
 		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
 			pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) ||
 			    spa->spa_autoexpand);
 		vd->vdev_expansion_time = gethrestime_sec();
 	}
 
 	/* The flags set above are only meant to apply to this reopen. */
 	vdev_reopen(tvd);
 	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
 
 	if (!vd->vdev_aux) {
 		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
 			pvd->vdev_expanding = B_FALSE;
 	}
 
 	if (newstate)
 		*newstate = vd->vdev_state;
 	if ((flags & ZFS_ONLINE_UNSPARE) &&
 	    !vdev_is_dead(vd) && vd->vdev_parent &&
 	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
 	    vd->vdev_parent->vdev_child[0] == vd)
 		vd->vdev_unspare = B_TRUE;
 
 	if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
 
 		/* XXX - L2ARC 1.0 does not support expansion */
 		if (vd->vdev_aux) {
 			/*
 			 * Use SET_ERROR() like every other error
 			 * origination in this file.
 			 */
 			return (spa_vdev_state_exit(spa, vd,
 			    SET_ERROR(ENOTSUP)));
 		}
 		spa->spa_ccw_fail_time = 0;
 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 	}
 
 	/* Restart initializing if necessary */
 	mutex_enter(&vd->vdev_initialize_lock);
 	if (vdev_writeable(vd) &&
 	    vd->vdev_initialize_thread == NULL &&
 	    vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) {
 		(void) vdev_initialize(vd);
 	}
 	mutex_exit(&vd->vdev_initialize_lock);
 
 	/*
 	 * Restart trimming if necessary. We do not restart trimming for cache
 	 * devices here. This is triggered by l2arc_rebuild_vdev()
 	 * asynchronously for the whole device or in l2arc_evict() as it evicts
 	 * space for upcoming writes.
 	 */
 	mutex_enter(&vd->vdev_trim_lock);
 	if (vdev_writeable(vd) && !vd->vdev_isl2cache &&
 	    vd->vdev_trim_thread == NULL &&
 	    vd->vdev_trim_state == VDEV_TRIM_ACTIVE) {
 		(void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial,
 		    vd->vdev_trim_secure);
 	}
 	mutex_exit(&vd->vdev_trim_lock);
 
 	/*
 	 * Post the ONLINE event when the vdev was offline before, or when
 	 * its state rose from below DEGRADED to DEGRADED or better.
 	 */
 	if (wasoffline ||
 	    (oldstate < VDEV_STATE_DEGRADED &&
 	    vd->vdev_state >= VDEV_STATE_DEGRADED)) {
 		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);
 
 		/*
 		 * Asynchronously detach spare vdev if resilver or
 		 * rebuild is not required
 		 */
 		if (vd->vdev_unspare &&
 		    !dsl_scan_resilvering(spa->spa_dsl_pool) &&
 		    !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) &&
 		    !vdev_rebuild_active(tvd))
 			spa_async_request(spa, SPA_ASYNC_DETACH_SPARE);
 	}
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Offline the leaf vdev with the given guid.  Called by vdev_offline()
  * with spa_vdev_top_lock held.
  *
  * spa   - pool containing the vdev
  * guid  - guid of the leaf vdev to offline
  * flags - ZFS_OFFLINE_* flags; ZFS_OFFLINE_TEMPORARY sets vdev_tmpoffline
  *
  * Returns the result of spa_vdev_state_exit(): ENODEV when no vdev has
  * the given guid, ENOTSUP for non-leaf vdevs and dRAID spares, EBUSY when
  * the device holds the only valid copy of some data or offlining it would
  * leave its top-level vdev dead, the error from spa_reset_logs() or
  * ZFS_ERR_CHECKPOINT_EXISTS for log devices, and 0 on success.
  */
 static int
 vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
 {
 	vdev_t *vd, *tvd;
 	int error = 0;
 	uint64_t generation;
 	metaslab_group_t *mg;
 
 top:
 	spa_vdev_state_enter(spa, SCL_ALLOC);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
 
 	/* Use SET_ERROR() here as well, matching the checks above. */
 	if (vd->vdev_ops == &vdev_draid_spare_ops)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
 
 	tvd = vd->vdev_top;
 	mg = tvd->vdev_mg;
 	/*
 	 * The generation we expect to see after the state lock has been
 	 * dropped and re-taken once in the log path below.
 	 */
 	generation = spa->spa_config_generation + 1;
 
 	/*
 	 * If the device isn't already offline, try to offline it.
 	 */
 	if (!vd->vdev_offline) {
 		/*
 		 * If this device has the only valid copy of some data,
 		 * don't allow it to be offlined. Log devices are always
 		 * expendable.
 		 */
 		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
 		    vdev_dtl_required(vd))
 			return (spa_vdev_state_exit(spa, NULL,
 			    SET_ERROR(EBUSY)));
 
 		/*
 		 * If the top-level is a slog and it has had allocations
 		 * then proceed.  We check that the vdev's metaslab group
 		 * is not NULL since it's possible that we may have just
 		 * added this vdev but not yet initialized its metaslabs.
 		 */
 		if (tvd->vdev_islog && mg != NULL) {
 			/*
 			 * Prevent any future allocations.
 			 */
 			ASSERT3P(tvd->vdev_log_mg, ==, NULL);
 			metaslab_group_passivate(mg);
 			(void) spa_vdev_state_exit(spa, vd, 0);
 
 			error = spa_reset_logs(spa);
 
 			/*
 			 * If the log device was successfully reset but has
 			 * checkpointed data, do not offline it.
 			 */
 			if (error == 0 &&
 			    tvd->vdev_checkpoint_sm != NULL) {
 				ASSERT3U(space_map_allocated(
 				    tvd->vdev_checkpoint_sm), !=, 0);
 				error = ZFS_ERR_CHECKPOINT_EXISTS;
 			}
 
 			spa_vdev_state_enter(spa, SCL_ALLOC);
 
 			/*
 			 * Check to see if the config has changed.
 			 */
 			if (error || generation != spa->spa_config_generation) {
 				metaslab_group_activate(mg);
 				if (error)
 					return (spa_vdev_state_exit(spa,
 					    vd, error));
 				(void) spa_vdev_state_exit(spa, vd, 0);
 				goto top;
 			}
 			ASSERT0(tvd->vdev_stat.vs_alloc);
 		}
 
 		/*
 		 * Offline this device and reopen its top-level vdev.
 		 * If the top-level vdev is a log device then just offline
 		 * it. Otherwise, if this action results in the top-level
 		 * vdev becoming unusable, undo it and fail the request.
 		 */
 		vd->vdev_offline = B_TRUE;
 		vdev_reopen(tvd);
 
 		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
 		    vdev_is_dead(tvd)) {
 			vd->vdev_offline = B_FALSE;
 			vdev_reopen(tvd);
 			return (spa_vdev_state_exit(spa, NULL,
 			    SET_ERROR(EBUSY)));
 		}
 
 		/*
 		 * Add the device back into the metaslab rotor so that
 		 * once we online the device it's open for business.
 		 */
 		if (tvd->vdev_islog && mg != NULL)
 			metaslab_group_activate(mg);
 	}
 
 	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Offline the vdev with the given guid.  Runs vdev_offline_locked() under
  * spa_vdev_top_lock and returns its result.
  */
 int
 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
 {
 	mutex_enter(&spa->spa_vdev_top_lock);
 	int rc = vdev_offline_locked(spa, guid, flags);
 	mutex_exit(&spa->spa_vdev_top_lock);
 
 	return (rc);
 }
 
 /*
  * Clear the error counts associated with this vdev.  Unlike vdev_online() and
  * vdev_offline(), we assume the spa config is locked.  We also clear all
  * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
  */
 void
 vdev_clear(spa_t *spa, vdev_t *vd)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	if (vd == NULL)
 		vd = rvd;
 
 	vd->vdev_stat.vs_read_errors = 0;
 	vd->vdev_stat.vs_write_errors = 0;
 	vd->vdev_stat.vs_checksum_errors = 0;
 	vd->vdev_stat.vs_dio_verify_errors = 0;
 	vd->vdev_stat.vs_slow_ios = 0;
 
 	/* Recurse first, so children are cleared even if we return below. */
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_clear(spa, vd->vdev_child[c]);
 
 	/*
 	 * It makes no sense to "clear" an indirect or removed vdev.
 	 */
 	if (!vdev_is_concrete(vd) || vd->vdev_removed)
 		return;
 
 	/*
 	 * If we're in the FAULTED state or have experienced failed I/O, then
 	 * clear the persistent state and attempt to reopen the device.  We
 	 * also mark the vdev config dirty, so that the new faulted state is
 	 * written out to disk.
 	 */
 	if (vd->vdev_faulted || vd->vdev_degraded ||
 	    !vdev_readable(vd) || !vdev_writeable(vd)) {
 		/*
 		 * When reopening in response to a clear event, it may be due to
 		 * a fmadm repair request.  In this case, if the device is
 		 * still broken, we want to still post the ereport again.
 		 */
 		vd->vdev_forcefault = B_TRUE;
 
 		vd->vdev_faulted = vd->vdev_degraded = 0ULL;
 		vd->vdev_cant_read = B_FALSE;
 		vd->vdev_cant_write = B_FALSE;
 		vd->vdev_stat.vs_aux = 0;
 
 		/* Reopen the whole top-level vdev (or the root itself). */
 		vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
 
 		vd->vdev_forcefault = B_FALSE;
 
 		if (vd != rvd && vdev_writeable(vd->vdev_top))
 			vdev_state_dirty(vd->vdev_top);
 
 		/* If a resilver isn't required, check if vdevs can be culled */
 		if (vd->vdev_aux == NULL && !vdev_is_dead(vd) &&
 		    !dsl_scan_resilvering(spa->spa_dsl_pool) &&
 		    !dsl_scan_resilver_scheduled(spa->spa_dsl_pool))
 			spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 
 		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
 	}
 
 	/*
 	 * When clearing a FMA-diagnosed fault, we always want to
 	 * unspare the device, as we assume that the original spare was
 	 * done in response to the FMA fault.
 	 */
 	if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
 	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
 	    vd->vdev_parent->vdev_child[0] == vd)
 		vd->vdev_unspare = B_TRUE;
 
 	/* Clear recent error events cache (i.e. duplicate events tracking) */
 	zfs_ereport_clear(spa, vd);
 }
 
 /*
  * Report whether a vdev is "dead": its state is below DEGRADED, or it is a
  * hole or a missing device.
  *
  * Holes and missing devices are always considered dead.  This simplifies
  * the code since the various code paths don't have to check for these
  * device types; instead we rely on the fact that dead devices are skipped
  * before I/O is issued to them.
  */
 boolean_t
 vdev_is_dead(vdev_t *vd)
 {
 	if (vd->vdev_state < VDEV_STATE_DEGRADED)
 		return (B_TRUE);
 
 	if (vd->vdev_ops == &vdev_hole_ops)
 		return (B_TRUE);
 
 	return (vd->vdev_ops == &vdev_missing_ops);
 }
 
 /*
  * A vdev is readable when it is not dead and not flagged vdev_cant_read.
  */
 boolean_t
 vdev_readable(vdev_t *vd)
 {
 	if (vdev_is_dead(vd))
 		return (B_FALSE);
 
 	return (!vd->vdev_cant_read);
 }
 
 /*
  * A vdev is writeable when it is not dead, not flagged vdev_cant_write,
  * and concrete.
  */
 boolean_t
 vdev_writeable(vdev_t *vd)
 {
 	if (vdev_is_dead(vd) || vd->vdev_cant_write)
 		return (B_FALSE);
 
 	return (vdev_is_concrete(vd) ? B_TRUE : B_FALSE);
 }
 
 /*
  * Report whether new allocations may be placed on this vdev.
  *
  * We currently allow allocations from vdevs which may be in the process
  * of reopening (i.e. VDEV_STATE_CLOSED).  If the device fails to reopen
  * then we'll catch it later when we're holding the proper locks.
  */
 boolean_t
 vdev_allocatable(vdev_t *vd)
 {
 	/*
 	 * Snapshot the state once: although it changes atomically, we ask
 	 * two separate questions about it below.
 	 */
 	const uint64_t state = vd->vdev_state;
 
 	if (state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED)
 		return (B_FALSE);
 
 	if (vd->vdev_cant_write || !vdev_is_concrete(vd))
 		return (B_FALSE);
 
 	return (vd->vdev_mg->mg_initialized ? B_TRUE : B_FALSE);
 }
 
 /*
  * Report whether 'zio' may be issued to 'vd'.  A dead vdev, or one with
  * removal pending, accepts nothing.  Otherwise reads and writes are
  * gated by vdev_cant_read and vdev_cant_write respectively, and every
  * other I/O type is allowed.
  */
 boolean_t
 vdev_accessible(vdev_t *vd, zio_t *zio)
 {
 	ASSERT(zio->io_vd == vd);
 
 	if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
 		return (B_FALSE);
 
 	switch (zio->io_type) {
 	case ZIO_TYPE_READ:
 		return (!vd->vdev_cant_read);
 	case ZIO_TYPE_WRITE:
 		return (!vd->vdev_cant_write);
 	default:
 		return (B_TRUE);
 	}
 }
 
 /*
  * Add the per-type op and byte counters of a child's stats (cvs) into the
  * parent's accumulator (vs), and record the child's removing flag in
  * cvs->vs_scan_removing.
  *
  * dRAID spares are skipped entirely to avoid double counting the ops and
  * bytes: those I/Os are already counted by the physical leaves.
  */
 static void
 vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
 {
 	if (cvd->vdev_ops != &vdev_draid_spare_ops) {
 		for (int type = 0; type < VS_ZIO_TYPES; type++) {
 			vs->vs_ops[type] += cvs->vs_ops[type];
 			vs->vs_bytes[type] += cvs->vs_bytes[type];
 		}
 
 		cvs->vs_scan_removing = cvd->vdev_removing;
 	}
 }
 
 /*
  * Get extended stats
  *
  * Adds every histogram bucket and queue counter of a child's extended
  * stats (cvsx) into the parent's accumulator (vsx).  The child vdev
  * itself is not consulted.
  */
 static void
 vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx)
 {
 	(void) cvd;
 
 	/* Histograms indexed by ZIO type. */
 	for (int type = 0; type < ZIO_TYPES; type++) {
 		for (int b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++) {
 			vsx->vsx_disk_histo[type][b] +=
 			    cvsx->vsx_disk_histo[type][b];
 		}
 
 		for (int b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) {
 			vsx->vsx_total_histo[type][b] +=
 			    cvsx->vsx_total_histo[type][b];
 		}
 	}
 
 	/* Histograms and counters indexed by queueable ZIO priority. */
 	for (int prio = 0; prio < ZIO_PRIORITY_NUM_QUEUEABLE; prio++) {
 		for (int b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) {
 			vsx->vsx_queue_histo[prio][b] +=
 			    cvsx->vsx_queue_histo[prio][b];
 		}
 
 		vsx->vsx_active_queue[prio] += cvsx->vsx_active_queue[prio];
 		vsx->vsx_pend_queue[prio] += cvsx->vsx_pend_queue[prio];
 
 		for (int b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++) {
 			vsx->vsx_ind_histo[prio][b] +=
 			    cvsx->vsx_ind_histo[prio][b];
 		}
 
 		for (int b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++) {
 			vsx->vsx_agg_histo[prio][b] +=
 			    cvsx->vsx_agg_histo[prio][b];
 		}
 	}
 }
 
 /*
  * Report whether every offset on this vdev can be described by a space
  * map entry.  That is always the case when SPA_FEATURE_SPACEMAP_V2 is
  * active.
  *
  * If double-word space map entries are not enabled we assume 47 bits of
  * the space map entry are dedicated to the entry's offset (see
  * SM_OFFSET_BITS in space_map.h).  We then use that to calculate the
  * maximum address that can be described by a space map entry for the
  * given device.
  */
 boolean_t
 vdev_is_spacemap_addressable(vdev_t *vd)
 {
 	const uint64_t offset_bits = vd->vdev_ashift + SM_OFFSET_BITS;
 
 	if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
 		return (B_TRUE);
 
 	/* Shifting by 63 or more could overflow; treat it as addressable. */
 	if (offset_bits >= 63)
 		return (B_TRUE);
 
 	const uint64_t limit = 1ULL << offset_bits;
 
 	return (vd->vdev_asize < limit);
 }
 
 /*
  * Fill in the I/O portions of vs and vsx for the given vdev.
  *
  * For an interior (non-leaf) vdev the ops/bytes counters in vs and all of
  * vsx are zeroed and then rebuilt by recursing into every child and summing
  * the children's values.  The recursion refreshes each child's own
  * vdev_stat / vdev_stat_ex in place.  For a leaf, vs is left untouched and
  * vsx (when requested) receives the leaf's stored extended stats plus its
  * current active and pending queue depths.
  *
  * Either vs or vsx may be NULL when the caller does not want that set.
  */
 static void
 vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
 {
 	/*
 	 * If we're getting stats on an interior vdev (the root included),
 	 * aggregate the I/O counts over all of its direct children.
 	 */
 	if (!vd->vdev_ops->vdev_op_leaf) {
 		if (vs) {
 			memset(vs->vs_ops, 0, sizeof (vs->vs_ops));
 			memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes));
 		}
 		if (vsx)
 			memset(vsx, 0, sizeof (*vsx));
 
 		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 			vdev_stat_t *cvs = &cvd->vdev_stat;
 			vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex;
 
 			vdev_get_stats_ex_impl(cvd, cvs, cvsx);
 			if (vs)
 				vdev_get_child_stat(cvd, vs, cvs);
 			if (vsx)
 				vdev_get_child_stat_ex(cvd, vsx, cvsx);
 		}
 	} else {
 		/*
 		 * We're a leaf.  Just copy our ZIO active queue stats in.  The
 		 * other leaf stats are updated in vdev_stat_update().
 		 */
 		if (!vsx)
 			return;
 
 		/*
 		 * The recursive call above hands a leaf its own
 		 * vdev_stat_ex as vsx.  memcpy() between overlapping
 		 * objects is undefined, so only copy when the destination
 		 * is a different buffer.
 		 */
 		if (vsx != &vd->vdev_stat_ex) {
 			memcpy(vsx, &vd->vdev_stat_ex,
 			    sizeof (vd->vdev_stat_ex));
 		}
 
 		for (int t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
 			vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t];
 			vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t);
 		}
 	}
 }
 
 /*
  * Snapshot the statistics of vd into the caller's buffers.
  *
  * When vs is non-NULL it is filled from vd->vdev_stat and then adjusted:
  * vs_timestamp becomes the time elapsed since the stored timestamp, and
  * the state, size, ashift, fragmentation and noalloc fields are refreshed
  * from the vdev.  Leaf vdevs additionally report initialize, TRIM and
  * deferred-resilver progress.  The I/O counters in vs and the extended
  * stats in vsx (which may be NULL) are filled by vdev_get_stats_ex_impl().
  * The whole operation runs under vd->vdev_stat_lock.
  */
 void
 vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
 {
 	vdev_t *tvd = vd->vdev_top;
 	mutex_enter(&vd->vdev_stat_lock);
 	if (vs) {
 		memcpy(vs, &vd->vdev_stat, sizeof (*vs));
 		/* Convert the stored timestamp into an elapsed time. */
 		vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
 		vs->vs_state = vd->vdev_state;
 		vs->vs_rsize = vdev_get_min_asize(vd);
 
 		if (vd->vdev_ops->vdev_op_leaf) {
 			vs->vs_pspace = vd->vdev_psize;
 			/* Leaves also account for the label areas. */
 			vs->vs_rsize += VDEV_LABEL_START_SIZE +
 			    VDEV_LABEL_END_SIZE;
 			/*
 			 * Report initializing progress. Since we don't
 			 * have the initializing locks held, this is only
 			 * an estimate (although a fairly accurate one).
 			 */
 			vs->vs_initialize_bytes_done =
 			    vd->vdev_initialize_bytes_done;
 			vs->vs_initialize_bytes_est =
 			    vd->vdev_initialize_bytes_est;
 			vs->vs_initialize_state = vd->vdev_initialize_state;
 			vs->vs_initialize_action_time =
 			    vd->vdev_initialize_action_time;
 
 			/*
 			 * Report manual TRIM progress. Since we don't have
 			 * the manual TRIM locks held, this is only an
 			 * estimate (although a fairly accurate one).
 			 */
 			vs->vs_trim_notsup = !vd->vdev_has_trim;
 			vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done;
 			vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
 			vs->vs_trim_state = vd->vdev_trim_state;
 			vs->vs_trim_action_time = vd->vdev_trim_action_time;
 
 			/* Set when there is a deferred resilver. */
 			vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
 		}
 
 		/*
 		 * Report expandable space on non-auxiliary devices that have
 		 * a top-level vdev. The expandable space is reported in
 		 * terms of metaslab sized units since that determines how
 		 * much space the pool can expand.
 		 */
 		if (vd->vdev_aux == NULL && tvd != NULL) {
 			vs->vs_esize = P2ALIGN_TYPED(
 			    vd->vdev_max_asize - vd->vdev_asize,
 			    1ULL << tvd->vdev_ms_shift, uint64_t);
 		}
 
 		/* Prefer the top-level vdev's ashift when there is one. */
 		vs->vs_configured_ashift = vd->vdev_top != NULL
 		    ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
 		vs->vs_logical_ashift = vd->vdev_logical_ashift;
 		/* A physical ashift above ASHIFT_MAX is reported as 0. */
 		if (vd->vdev_physical_ashift <= ASHIFT_MAX)
 			vs->vs_physical_ashift = vd->vdev_physical_ashift;
 		else
 			vs->vs_physical_ashift = 0;
 
 		/*
 		 * Report fragmentation for top-level, non-auxiliary,
 		 * concrete devices.
 		 */
 		if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
 		    vdev_is_concrete(vd)) {
 			/*
 			 * The vdev fragmentation rating doesn't take into
 			 * account the embedded slog metaslab (vdev_log_mg).
 			 * Since it's only one metaslab, it would have a tiny
 			 * impact on the overall fragmentation.
 			 */
 			vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
 			    vd->vdev_mg->mg_fragmentation : 0;
 		}
 		/* noalloc is set if either this vdev or its top-level is. */
 		vs->vs_noalloc = MAX(vd->vdev_noalloc,
 		    tvd ? tvd->vdev_noalloc : 0);
 	}
 
 	vdev_get_stats_ex_impl(vd, vs, vsx);
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Snapshot the basic statistics of vd into vs.  This is
  * vdev_get_stats_ex() without the extended statistics.
  */
 void
 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
 {
 	/*
 	 * ISO C does not allow a return statement with an expression in a
 	 * function returning void, so call the callee as a statement.
 	 */
 	vdev_get_stats_ex(vd, vs, NULL);
 }
 
 /*
  * Reset the space accounting of a vdev (allocated, total and deflated
  * space) to zero.  The stores are made under the vdev's stat lock.
  */
 void
 vdev_clear_stats(vdev_t *vd)
 {
 	mutex_enter(&vd->vdev_stat_lock);
 
 	vd->vdev_stat.vs_alloc = 0;
 	vd->vdev_stat.vs_space = 0;
 	vd->vdev_stat.vs_dspace = 0;
 
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Zero the scan-processed byte counter on vd and on every vdev below it.
  * Children are reset first; each vdev is updated under its own stat lock.
  */
 void
 vdev_scan_stat_init(vdev_t *vd)
 {
 	int child = 0;
 
 	while (child < vd->vdev_children) {
 		vdev_scan_stat_init(vd->vdev_child[child]);
 		child++;
 	}
 
 	mutex_enter(&vd->vdev_stat_lock);
 	vd->vdev_stat.vs_scan_processed = 0;
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Account for a completed zio in the statistics of the vdev it targeted.
  *
  * zio   - the completed I/O; its io_vd (or the root vdev when io_vd is
  *         NULL) selects the vdev whose stats are updated.
  * psize - number of bytes to credit to the byte and progress counters.
  *
  * Successful I/O (io_error == 0) updates repair/scan/rebuild progress and,
  * for leaf vdevs, the ops, bytes and histogram counters, all under the
  * vdev's stat lock.  Failed writes that are not filtered out by the checks
  * below are instead recorded in the vdev's dirty time logs (DTLs).
  */
 void
 vdev_stat_update(zio_t *zio, uint64_t psize)
 {
 	spa_t *spa = zio->io_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
 	vdev_t *pvd;
 	uint64_t txg = zio->io_txg;
 /* Suppress ASAN false positive */
 #ifdef __SANITIZE_ADDRESS__
 	vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL;
 	vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL;
 #else
 	vdev_stat_t *vs = &vd->vdev_stat;
 	vdev_stat_ex_t *vsx = &vd->vdev_stat_ex;
 #endif
 	zio_type_t type = zio->io_type;
 	int flags = zio->io_flags;
 
 	/*
 	 * If this i/o is a gang leader, it didn't do any actual work.
 	 */
 	if (zio->io_gang_tree)
 		return;
 
 	if (zio->io_error == 0) {
 		/*
 		 * If this is a root i/o, don't count it -- we've already
 		 * counted the top-level vdevs, and vdev_get_stats() will
 		 * aggregate them when asked.  This reduces contention on
 		 * the root vdev_stat_lock and implicitly handles blocks
 		 * that compress away to holes, for which there is no i/o.
 		 * (Holes never create vdev children, so all the counters
 		 * remain zero, which is what we want.)
 		 *
 		 * Note: this only applies to successful i/o (io_error == 0)
 		 * because unlike i/o counts, errors are not additive.
 		 * When reading a ditto block, for example, failure of
 		 * one top-level vdev does not imply a root-level error.
 		 */
 		if (vd == rvd)
 			return;
 
 		ASSERT(vd == zio->io_vd);
 
 		/* I/O flagged as bypass is not counted. */
 		if (flags & ZIO_FLAG_IO_BYPASS)
 			return;
 
 		mutex_enter(&vd->vdev_stat_lock);
 
 		if (flags & ZIO_FLAG_IO_REPAIR) {
 			/*
 			 * Repair is the result of a resilver issued by the
 			 * scan thread (spa_sync).
 			 */
 			if (flags & ZIO_FLAG_SCAN_THREAD) {
 				dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 				dsl_scan_phys_t *scn_phys = &scn->scn_phys;
 				uint64_t *processed = &scn_phys->scn_processed;
 
 				/* Only leaves add to the pool-wide total. */
 				if (vd->vdev_ops->vdev_op_leaf)
 					atomic_add_64(processed, psize);
 				vs->vs_scan_processed += psize;
 			}
 
 			/*
 			 * Repair is the result of a rebuild issued by the
 			 * rebuild thread (vdev_rebuild_thread).  To avoid
 			 * double counting repaired bytes the virtual dRAID
 			 * spare vdev is excluded from the processed bytes.
 			 */
 			if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
 				vdev_t *tvd = vd->vdev_top;
 				vdev_rebuild_t *vr = &tvd->vdev_rebuild_config;
 				vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 				uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
 
 				if (vd->vdev_ops->vdev_op_leaf &&
 				    vd->vdev_ops != &vdev_draid_spare_ops) {
 					atomic_add_64(rebuilt, psize);
 				}
 				vs->vs_rebuild_processed += psize;
 			}
 
 			if (flags & ZIO_FLAG_SELF_HEAL)
 				vs->vs_self_healed += psize;
 		}
 
 		/*
 		 * The bytes/ops/histograms are recorded at the leaf level and
 		 * aggregated into the higher level vdevs in vdev_get_stats().
 		 */
 		if (vd->vdev_ops->vdev_op_leaf &&
 		    (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) {
 			zio_type_t vs_type = type;
 			zio_priority_t priority = zio->io_priority;
 
 			/*
 			 * TRIM ops and bytes are reported to user space as
 			 * ZIO_TYPE_FLUSH.  This is done to preserve the
 			 * vdev_stat_t structure layout for user space.
 			 */
 			if (type == ZIO_TYPE_TRIM)
 				vs_type = ZIO_TYPE_FLUSH;
 
 			/*
 			 * Solely for the purposes of 'zpool iostat -lqrw'
 			 * reporting use the priority to categorize the IO.
 			 * Only the following are reported to user space:
 			 *
 			 *   ZIO_PRIORITY_SYNC_READ,
 			 *   ZIO_PRIORITY_SYNC_WRITE,
 			 *   ZIO_PRIORITY_ASYNC_READ,
 			 *   ZIO_PRIORITY_ASYNC_WRITE,
 			 *   ZIO_PRIORITY_SCRUB,
 			 *   ZIO_PRIORITY_TRIM,
 			 *   ZIO_PRIORITY_REBUILD.
 			 */
 			if (priority == ZIO_PRIORITY_INITIALIZING) {
 				ASSERT3U(type, ==, ZIO_TYPE_WRITE);
 				priority = ZIO_PRIORITY_ASYNC_WRITE;
 			} else if (priority == ZIO_PRIORITY_REMOVAL) {
 				priority = ((type == ZIO_TYPE_WRITE) ?
 				    ZIO_PRIORITY_ASYNC_WRITE :
 				    ZIO_PRIORITY_ASYNC_READ);
 			}
 
 			vs->vs_ops[vs_type]++;
 			vs->vs_bytes[vs_type] += psize;
 
 			/*
 			 * Request-size histograms: delegated I/O goes to the
 			 * "agg" histogram, everything else to "ind".
 			 */
 			if (flags & ZIO_FLAG_DELEGATED) {
 				vsx->vsx_agg_histo[priority]
 				    [RQ_HISTO(zio->io_size)]++;
 			} else {
 				vsx->vsx_ind_histo[priority]
 				    [RQ_HISTO(zio->io_size)]++;
 			}
 
 			/*
 			 * Time histograms are only updated when both
 			 * io_delta and io_delay are non-zero.
 			 */
 			if (zio->io_delta && zio->io_delay) {
 				vsx->vsx_queue_histo[priority]
 				    [L_HISTO(zio->io_delta - zio->io_delay)]++;
 				vsx->vsx_disk_histo[type]
 				    [L_HISTO(zio->io_delay)]++;
 				vsx->vsx_total_histo[type]
 				    [L_HISTO(zio->io_delta)]++;
 			}
 		}
 
 		mutex_exit(&vd->vdev_stat_lock);
 		return;
 	}
 
 	/* Errors on speculative I/O are ignored. */
 	if (flags & ZIO_FLAG_SPECULATIVE)
 		return;
 
 	/*
 	 * If this is an I/O error that is going to be retried, then ignore the
 	 * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
 	 * hard errors, when in reality they can happen for any number of
 	 * innocuous reasons (bus resets, MPxIO link failure, etc).
 	 */
 	if (zio->io_error == EIO &&
 	    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
 		return;
 
 	/*
 	 * Intent log writes won't propagate their error to the root
 	 * I/O so don't mark these types of failures as pool-level
 	 * errors.
 	 */
 	if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 		return;
 
 	if (type == ZIO_TYPE_WRITE && txg != 0 &&
 	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
 	    (flags & ZIO_FLAG_SCAN_THREAD) ||
 	    spa->spa_claiming)) {
 		/*
 		 * This is either a normal write (not a repair), or it's
 		 * a repair induced by the scrub thread, or it's a repair
 		 * made by zil_claim() during spa_load() in the first txg.
 		 * In the normal case, we commit the DTL change in the same
 		 * txg as the block was born.  In the scrub-induced repair
 		 * case, we know that scrubs run in first-pass syncing context,
 		 * so we commit the DTL change in spa_syncing_txg(spa).
 		 * In the zil_claim() case, we commit in spa_first_txg(spa).
 		 *
 		 * We currently do not make DTL entries for failed spontaneous
 		 * self-healing writes triggered by normal (non-scrubbing)
 		 * reads, because we have no transactional context in which to
 		 * do so -- and it's not clear that it'd be desirable anyway.
 		 */
 		if (vd->vdev_ops->vdev_op_leaf) {
 			uint64_t commit_txg = txg;
 			if (flags & ZIO_FLAG_SCAN_THREAD) {
 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 				ASSERT(spa_sync_pass(spa) == 1);
 				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
 				commit_txg = spa_syncing_txg(spa);
 			} else if (spa->spa_claiming) {
 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 				commit_txg = spa_first_txg(spa);
 			}
 			ASSERT(commit_txg >= spa_syncing_txg(spa));
 			/* Nothing to do if txg is already marked missing. */
 			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
 				return;
 			/* Mark this leaf and each ancestor below the root. */
 			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
 				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
 			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
 		}
 		if (vd != rvd)
 			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
 	}
 }
 
 /*
  * Scale a space amount by the vdev's deflate ratio: the number of
  * SPA_MINBLOCKSIZE units in 'space' multiplied by vdev_deflate_ratio.
  * 'space' must be a multiple of SPA_MINBLOCKSIZE.
  */
 int64_t
 vdev_deflated_space(vdev_t *vd, int64_t space)
 {
 	int64_t min_blocks;
 
 	ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0);
 	ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
 
 	min_blocks = space >> SPA_MINBLOCKSHIFT;
 
 	return (min_blocks * vd->vdev_deflate_ratio);
 }
 
 /*
  * Apply space accounting deltas to a top-level vdev and, unless it is a
  * log device or has no metaslab group, to the root vdev as well.
  *
  * alloc_delta - change in allocated bytes.
  * defer_delta - unused.
  * space_delta - change in total space; its deflated equivalent is
  *               applied to vs_dspace.
  */
 void
 vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
     int64_t space_delta)
 {
 	(void) defer_delta;
 	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
 	int64_t dspace_delta;
 
 	ASSERT(vd == vd->vdev_top);
 
 	/*
 	 * Undo the psize-to-asize (ie. RAID-Z) space-expansion factor here,
 	 * per top-level vdev.  The root vdev's factor is only the max of
 	 * its children's and so is not accurate enough.
 	 */
 	dspace_delta = vdev_deflated_space(vd, space_delta);
 
 	mutex_enter(&vd->vdev_stat_lock);
 	/* ensure we won't underflow */
 	if (alloc_delta < 0) {
 		ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta);
 	}
 	vd->vdev_stat.vs_alloc += alloc_delta;
 	vd->vdev_stat.vs_space += space_delta;
 	vd->vdev_stat.vs_dspace += dspace_delta;
 	mutex_exit(&vd->vdev_stat_lock);
 
 	/* log devices never contribute to the root space stats */
 	if (vd->vdev_mg == NULL || vd->vdev_islog)
 		return;
 
 	ASSERT(!vd->vdev_isl2cache);
 
 	mutex_enter(&rvd->vdev_stat_lock);
 	rvd->vdev_stat.vs_alloc += alloc_delta;
 	rvd->vdev_stat.vs_space += space_delta;
 	rvd->vdev_stat.vs_dspace += dspace_delta;
 	mutex_exit(&rvd->vdev_stat_lock);
 	/* Note: metaslab_class_space_update moved to metaslab_space_update */
 }
 
 /*
  * Mark a top-level vdev's config as dirty, placing it on the dirty list
  * so that it will be written out next time the vdev configuration is synced.
  * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
  *
  * Auxiliary vdevs (l2cache and spares) are handled separately: their entry
  * in the aux config nvlist array is regenerated in place and the aux sync
  * flag is set; they are never put on the dirty list.
  */
 void
 vdev_config_dirty(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	int c;
 
 	ASSERT(spa_writeable(spa));
 
 	/*
 	 * If this is an aux vdev (as with l2cache and spare devices), then we
 	 * update the vdev config manually and set the sync flag.
 	 */
 	if (vd->vdev_aux != NULL) {
 		spa_aux_vdev_t *sav = vd->vdev_aux;
 		nvlist_t **aux;
 		uint_t naux;
 
 		/* Find this vdev's index in the aux vdev array. */
 		for (c = 0; c < sav->sav_count; c++) {
 			if (sav->sav_vdevs[c] == vd)
 				break;
 		}
 
 		if (c == sav->sav_count) {
 			/*
 			 * We're being removed.  There's nothing more to do.
 			 */
 			ASSERT(sav->sav_sync == B_TRUE);
 			return;
 		}
 
 		sav->sav_sync = B_TRUE;
 
 		/* The array is stored under either the L2CACHE or SPARES key. */
 		if (nvlist_lookup_nvlist_array(sav->sav_config,
 		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
 			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
 			    ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
 		}
 
 		ASSERT(c < naux);
 
 		/*
 		 * Setting the nvlist in the middle of the array is a little
 		 * sketchy, but it will work.
 		 */
 		nvlist_free(aux[c]);
 		aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
 
 		return;
 	}
 
 	/*
 	 * The dirty list is protected by the SCL_CONFIG lock.  The caller
 	 * must either hold SCL_CONFIG as writer, or must be the sync thread
 	 * (which holds SCL_CONFIG as reader).  There's only one sync thread,
 	 * so this is sufficient to ensure mutual exclusion.
 	 */
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
 	    spa_config_held(spa, SCL_CONFIG, RW_READER)));
 
 	if (vd == rvd) {
 		for (c = 0; c < rvd->vdev_children; c++)
 			vdev_config_dirty(rvd->vdev_child[c]);
 	} else {
 		ASSERT(vd == vd->vdev_top);
 
 		/* Only concrete vdevs not already on the list are added. */
 		if (!list_link_active(&vd->vdev_config_dirty_node) &&
 		    vdev_is_concrete(vd)) {
 			list_insert_head(&spa->spa_config_dirty_list, vd);
 		}
 	}
 }
 
 /*
  * Take a vdev off its pool's config dirty list.  The vdev must currently
  * be on that list, and the caller must satisfy the same SCL_CONFIG locking
  * rule that vdev_config_dirty() asserts.
  */
 void
 vdev_config_clean(vdev_t *vd)
 {
 	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(vd->vdev_spa)) &&
 	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER)));
 	ASSERT(list_link_active(&vd->vdev_config_dirty_node));
 
 	list_remove(&vd->vdev_spa->spa_config_dirty_list, vd);
 }
 
 /*
  * Put a top-level vdev on the pool's state dirty list so that the next
  * pass of spa_sync() can convert this into vdev_config_dirty().  State
  * changes are tracked apart from larger config changes because they
  * require much less locking, and are often needed for administrative
  * actions.  Non-concrete vdevs and vdevs already on the list are left
  * alone.
  */
 void
 vdev_state_dirty(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(spa_writeable(spa));
 	ASSERT(vd == vd->vdev_top);
 
 	/*
 	 * SCL_STATE protects the state list: the caller either holds it as
 	 * writer, or is the (single) sync thread holding it as reader,
 	 * which is enough to guarantee mutual exclusion.
 	 */
 	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
 	    spa_config_held(spa, SCL_STATE, RW_READER)));
 
 	if (list_link_active(&vd->vdev_state_dirty_node))
 		return;
 
 	if (!vdev_is_concrete(vd))
 		return;
 
 	list_insert_head(&spa->spa_state_dirty_list, vd);
 }
 
 /*
  * Take a vdev off its pool's state dirty list.  The vdev must currently
  * be on that list, and the caller must satisfy the same SCL_STATE locking
  * rule that vdev_state_dirty() asserts.
  */
 void
 vdev_state_clean(vdev_t *vd)
 {
 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(vd->vdev_spa)) &&
 	    spa_config_held(vd->vdev_spa, SCL_STATE, RW_READER)));
 	ASSERT(list_link_active(&vd->vdev_state_dirty_node));
 
 	list_remove(&vd->vdev_spa->spa_state_dirty_list, vd);
 }
 
 /*
  * Recompute the state of vd from its children, then do the same for each
  * ancestor in turn until a vdev with no parent has been processed.
  *
  * For every vdev that has children, the concrete children are counted as
  * faulted or degraded and the totals are handed to the vdev's
  * vdev_op_state_change callback.
  */
 void
 vdev_propagate_state(vdev_t *vd)
 {
 	do {
 		spa_t *spa = vd->vdev_spa;
 		vdev_t *rvd = spa->spa_root_vdev;
 		int faulted = 0;
 		int degraded = 0;
 		int corrupted = 0;
 
 		if (vd->vdev_children > 0) {
 			for (int c = 0; c < vd->vdev_children; c++) {
 				vdev_t *child = vd->vdev_child[c];
 
 				/*
 				 * Holes and indirect vdevs play no part in
 				 * the decision.
 				 */
 				if (!vdev_is_concrete(child))
 					continue;
 
 				if (child->vdev_stat.vs_aux ==
 				    VDEV_AUX_CORRUPT_DATA)
 					corrupted++;
 
 				if (vdev_readable(child) &&
 				    (vdev_writeable(child) ||
 				    !spa_writeable(spa))) {
 					if (child->vdev_state <=
 					    VDEV_STATE_DEGRADED)
 						degraded++;
 				} else if (child->vdev_islog && vd == rvd) {
 					/*
 					 * Root special: an unusable
 					 * top-level log device only makes
 					 * the root vdev degraded.
 					 */
 					degraded++;
 				} else {
 					faulted++;
 				}
 			}
 
 			vd->vdev_ops->vdev_op_state_change(vd, faulted,
 			    degraded);
 
 			/*
 			 * Root special: if there is a top-level vdev that
 			 * cannot be opened due to corrupted metadata, then
 			 * propagate the root vdev's aux state as 'corrupt'
 			 * rather than 'insufficient replicas'.
 			 */
 			if (corrupted && vd == rvd &&
 			    rvd->vdev_state == VDEV_STATE_CANT_OPEN) {
 				vdev_set_state(rvd, B_FALSE,
 				    VDEV_STATE_CANT_OPEN,
 				    VDEV_AUX_CORRUPT_DATA);
 			}
 		}
 
 		vd = vd->vdev_parent;
 	} while (vd != NULL);
 }
 
 /*
  * Set a vdev's state.  If this is during an open, we don't update the parent
  * state, because we're in the process of opening children depth-first.
  * Otherwise, we propagate the change to the parent.
  *
  * If this routine places a device in a faulted state, an appropriate ereport is
  * generated.
  *
  * vd     - the vdev whose state is being set.
  * isopen - B_TRUE when called while opening; suppresses propagation to
  *          the parent.
  * state  - the new vdev state.
  * aux    - auxiliary state stored in vd->vdev_stat.vs_aux.
  */
 void
 vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
 {
 	uint64_t save_state;
 	spa_t *spa = vd->vdev_spa;
 
 	/* No state transition: only the aux value is updated. */
 	if (state == vd->vdev_state) {
 		/*
 		 * Since vdev_offline() code path is already in an offline
 		 * state we can miss a statechange event to OFFLINE. Check
 		 * the previous state to catch this condition.
 		 */
 		if (vd->vdev_ops->vdev_op_leaf &&
 		    (state == VDEV_STATE_OFFLINE) &&
 		    (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) {
 			/* post an offline state change */
 			zfs_post_state_change(spa, vd, vd->vdev_prevstate);
 		}
 		vd->vdev_stat.vs_aux = aux;
 		return;
 	}
 
 	/* Remember the old state for the ereport and ZED notification. */
 	save_state = vd->vdev_state;
 
 	vd->vdev_state = state;
 	vd->vdev_stat.vs_aux = aux;
 
 	/*
 	 * If we are setting the vdev state to anything but an open state, then
 	 * always close the underlying device unless the device has requested
 	 * a delayed close (i.e. we're about to remove or fault the device).
 	 * Otherwise, we keep accessible but invalid devices open forever.
 	 * We don't call vdev_close() itself, because that implies some extra
 	 * checks (offline, etc) that we don't want here.  This is limited to
 	 * leaf devices, because otherwise closing the device will affect other
 	 * children.
 	 */
 	if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
 	    vd->vdev_ops->vdev_op_leaf)
 		vd->vdev_ops->vdev_op_close(vd);
 
 	if (vd->vdev_removed &&
 	    state == VDEV_STATE_CANT_OPEN &&
 	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
 		/*
 		 * If the previous state is set to VDEV_STATE_REMOVED, then this
 		 * device was previously marked removed and someone attempted to
 		 * reopen it.  If this failed due to a nonexistent device, then
 		 * keep the device in the REMOVED state.  We also let this be if
 		 * it is one of our special test online cases, which is only
 		 * attempting to online the device and shouldn't generate an FMA
 		 * fault.
 		 */
 		vd->vdev_state = VDEV_STATE_REMOVED;
 		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 	} else if (state == VDEV_STATE_REMOVED) {
 		vd->vdev_removed = B_TRUE;
 	} else if (state == VDEV_STATE_CANT_OPEN) {
 		/*
 		 * If we fail to open a vdev during an import or recovery, we
 		 * mark it as "not available", which signifies that it was
 		 * never there to begin with.  Failure to open such a device
 		 * is not considered an error.
 		 */
 		if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
 		    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
 		    vd->vdev_ops->vdev_op_leaf)
 			vd->vdev_not_present = 1;
 
 		/*
 		 * Post the appropriate ereport.  If the 'prevstate' field is
 		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
 		 * that this is part of a vdev_reopen().  In this case, we don't
 		 * want to post the ereport if the device was already in the
 		 * CANT_OPEN state beforehand.
 		 *
 		 * If the 'checkremove' flag is set, then this is an attempt to
 		 * online the device in response to an insertion event.  If we
 		 * hit this case, then we have detected an insertion event for a
 		 * faulted or offline device that wasn't in the removed state.
 		 * In this scenario, we don't post an ereport because we are
 		 * about to replace the device, or attempt an online with
 		 * vdev_forcefault, which will generate the fault for us.
 		 */
 		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
 		    !vd->vdev_not_present && !vd->vdev_checkremove &&
 		    vd != spa->spa_root_vdev) {
 			const char *class;
 
 			/* Map the aux reason to an ereport class. */
 			switch (aux) {
 			case VDEV_AUX_OPEN_FAILED:
 				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
 				break;
 			case VDEV_AUX_CORRUPT_DATA:
 				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
 				break;
 			case VDEV_AUX_NO_REPLICAS:
 				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
 				break;
 			case VDEV_AUX_BAD_GUID_SUM:
 				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
 				break;
 			case VDEV_AUX_TOO_SMALL:
 				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
 				break;
 			case VDEV_AUX_BAD_LABEL:
 				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
 				break;
 			case VDEV_AUX_BAD_ASHIFT:
 				class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT;
 				break;
 			default:
 				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
 			}
 
 			(void) zfs_ereport_post(class, spa, vd, NULL, NULL,
 			    save_state);
 		}
 
 		/* Erase any notion of persistent removed state */
 		vd->vdev_removed = B_FALSE;
 	} else {
 		vd->vdev_removed = B_FALSE;
 	}
 
 	/*
 	 * Notify ZED of any significant state-change on a leaf vdev.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf) {
 		/* preserve original state from a vdev_reopen() */
 		if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) &&
 		    (vd->vdev_prevstate != vd->vdev_state) &&
 		    (save_state <= VDEV_STATE_CLOSED))
 			save_state = vd->vdev_prevstate;
 
 		/* filter out state change due to initial vdev_open */
 		if (save_state > VDEV_STATE_CLOSED)
 			zfs_post_state_change(spa, vd, save_state);
 	}
 
 	/* Outside of an open, let the parent recompute its own state. */
 	if (!isopen && vd->vdev_parent)
 		vdev_propagate_state(vd->vdev_parent);
 }
 
 /*
  * Return B_TRUE when every direct child of the (non-leaf) vdev is in the
  * OFFLINE state, B_FALSE as soon as one is found that is not.
  */
 boolean_t
 vdev_children_are_offline(vdev_t *vd)
 {
 	uint64_t idx = 0;
 
 	ASSERT(!vd->vdev_ops->vdev_op_leaf);
 
 	while (idx < vd->vdev_children) {
 		vdev_t *child = vd->vdev_child[idx];
 
 		if (child->vdev_state != VDEV_STATE_OFFLINE)
 			return (B_FALSE);
 		idx++;
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * Check that the vdev configuration can support a root pool; partial
  * configurations are not supported.  Returns B_FALSE if vd or any vdev
  * below it is a non-leaf vdev of type VDEV_TYPE_MISSING, B_TRUE otherwise.
  */
 boolean_t
 vdev_is_bootable(vdev_t *vd)
 {
 	if (!vd->vdev_ops->vdev_op_leaf &&
 	    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_MISSING) == 0)
 		return (B_FALSE);
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *child = vd->vdev_child[c];
 
 		if (!vdev_is_bootable(child))
 			return (B_FALSE);
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * A vdev is concrete unless its ops vector is that of an indirect, hole,
  * missing or root vdev.
  */
 boolean_t
 vdev_is_concrete(vdev_t *vd)
 {
 	vdev_ops_t *ops = vd->vdev_ops;
 	int excluded = (ops == &vdev_indirect_ops ||
 	    ops == &vdev_hole_ops ||
 	    ops == &vdev_missing_ops ||
 	    ops == &vdev_root_ops);
 
 	return (excluded ? B_FALSE : B_TRUE);
 }
 
 /*
  * Determine if a log device has valid content.  If the vdev was removed
  * or faulted in the MOS config then we know that the content on the log
  * device has already been written to the pool.
  *
  * Returns B_TRUE if vd, or any vdev below it, is a leaf that is neither
  * faulted nor removed.
  */
 boolean_t
 vdev_log_state_valid(vdev_t *vd)
 {
 	if (vd->vdev_ops->vdev_op_leaf) {
 		if (!vd->vdev_faulted && !vd->vdev_removed)
 			return (B_TRUE);
 	}
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *child = vd->vdev_child[c];
 
 		if (vdev_log_state_valid(child))
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Expand a top-level vdev if possible.
  *
  * The deflate ratio is always refreshed.  New metaslabs are then created,
  * and the config dirtied, only when the vdev is not the target of an
  * in-progress raidz expansion, its asize has room for more metaslabs than
  * it currently has, and it is concrete.
  */
 void
 vdev_expand(vdev_t *vd, uint64_t txg)
 {
 	ASSERT(vd->vdev_top == vd);
 	ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 	ASSERT(vdev_is_concrete(vd));
 
 	vdev_set_deflate_ratio(vd);
 
 	/* Leave the vdev that a raidz expansion is working on alone. */
 	if (vd->vdev_spa->spa_raidz_expand != NULL &&
 	    vd->vdev_spa->spa_raidz_expand->vre_vdev_id == vd->vdev_id)
 		return;
 
 	/* Nothing to add unless asize covers more metaslabs than exist. */
 	if ((vd->vdev_asize >> vd->vdev_ms_shift) <= vd->vdev_ms_count)
 		return;
 
 	if (!vdev_is_concrete(vd))
 		return;
 
 	vdev_metaslab_group_create(vd);
 	VERIFY(vdev_metaslab_init(vd, txg) == 0);
 	vdev_config_dirty(vd);
 }
 
 /*
  * Split a vdev off from its parent.
  *
  * vd is removed from the parent's child array, which is then compacted.
  * If that leaves the parent with a single child, the parent is removed
  * from above that child and the child is flagged as splitting.  State is
  * finally propagated upward from the parent's first child.
  */
 void
 vdev_split(vdev_t *vd)
 {
 	vdev_t *parent = vd->vdev_parent;
 	vdev_t *first;
 
 	VERIFY3U(parent->vdev_children, >, 1);
 
 	vdev_remove_child(parent, vd);
 	vdev_compact_children(parent);
 
 	ASSERT3P(parent->vdev_child, !=, NULL);
 
 	first = parent->vdev_child[0];
 	if (parent->vdev_children == 1) {
 		vdev_remove_parent(first);
 		first->vdev_splitting = B_TRUE;
 	}
 
 	vdev_propagate_state(first);
 }
 
 /*
  * Walk the vdev tree below vd and, for every leaf with active I/O, invoke
  * the deadman logic if the zio at the head of the leaf's active list has
  * been outstanding for longer than the pool's deadman synctime.  'tag' is
  * passed through to zio_deadman().
  */
 void
 vdev_deadman(vdev_t *vd, const char *tag)
 {
 	vdev_queue_t *vq;
 
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_deadman(vd->vdev_child[c], tag);
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return;
 
 	vq = &vd->vdev_queue;
 
 	mutex_enter(&vq->vq_lock);
 	if (vq->vq_active > 0) {
 		spa_t *spa = vd->vdev_spa;
 		zio_t *head;
 		uint64_t outstanding;
 
 		zfs_dbgmsg("slow vdev: %s has %u active IOs",
 		    vd->vdev_path, vq->vq_active);
 
 		/*
 		 * Only the head of the active list is examined; if it has
 		 * been outstanding for longer than spa_deadman_synctime,
 		 * invoke the deadman logic on it.
 		 */
 		head = list_head(&vq->vq_active_list);
 		outstanding = gethrtime() - head->io_timestamp;
 		if (outstanding > spa_deadman_synctime(spa))
 			zio_deadman(head, tag);
 	}
 	mutex_exit(&vq->vq_lock);
 }
 
 /*
  * Flag a leaf vdev as having a deferred resilver, and record on its pool
  * that a resilver has been deferred.
  */
 void
 vdev_defer_resilver(vdev_t *vd)
 {
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	vd->vdev_resilver_deferred = B_TRUE;
 	vd->vdev_spa->spa_resilver_deferred = B_TRUE;
 }
 
 /*
  * Clears the resilver deferred flag on all leaf devs under vd. Returns
  * B_TRUE if we have devices that need to be resilvered and are available to
  * accept resilver I/Os.
  *
  * When vd is the root vdev and the RESILVER_DEFER feature is active, the
  * feature refcount is also decremented in tx, the config is dirtied and
  * the pool-wide deferred flag is cleared.
  */
 boolean_t
 vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	boolean_t below_needed = B_FALSE;
 
 	/* Handle the subtree first, remembering whether any child needs it. */
 	for (int c = 0; c < vd->vdev_children; c++)
 		below_needed |= vdev_clear_resilver_deferred(
 		    vd->vdev_child[c], tx);
 
 	if (vd == spa->spa_root_vdev &&
 	    spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) {
 		spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
 		vdev_config_dirty(vd);
 		spa->spa_resilver_deferred = B_FALSE;
 		return (below_needed);
 	}
 
 	/* Only concrete, non-aux leaves carry the per-vdev flag. */
 	if (!vdev_is_concrete(vd))
 		return (below_needed);
 	if (vd->vdev_aux)
 		return (below_needed);
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (below_needed);
 
 	vd->vdev_resilver_deferred = B_FALSE;
 
 	if (vdev_is_dead(vd) || vd->vdev_offline)
 		return (B_FALSE);
 
 	return (vdev_resilver_needed(vd, NULL, NULL) ? B_TRUE : B_FALSE);
 }
 
 /*
  * Return whether the range segment is empty, i.e. its start equals its end.
  */
 boolean_t
-vdev_xlate_is_empty(range_seg64_t *rs)
+vdev_xlate_is_empty(zfs_range_seg64_t *rs)
 {
 	return (rs->rs_start == rs->rs_end);
 }
 
 /*
  * Translate a logical range to the first contiguous physical range for the
  * specified vdev_t.  This function is initially called with a leaf vdev and
  * will walk each parent vdev until it reaches a top-level vdev. Once the
  * top-level is reached the physical range is initialized and the recursive
  * function begins to unwind. As it unwinds it calls the parent's vdev
  * specific translation function to do the real conversion.
  *
  * vd          - vdev to translate for.
  * logical_rs  - input logical range; not modified.
  * physical_rs - output; receives the translated physical range.
  * remain_rs   - output; set to an empty range at the top-level vdev and
  *               then passed to each parent's translation function.
  */
 void
-vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
-    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
+vdev_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
+    zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs)
 {
 	/*
 	 * Walk up the vdev tree
 	 */
 	if (vd != vd->vdev_top) {
 		vdev_xlate(vd->vdev_parent, logical_rs, physical_rs,
 		    remain_rs);
 	} else {
 		/*
 		 * We've reached the top-level vdev, initialize the physical
 		 * range to the logical range and set an empty remaining
 		 * range then start to unwind.
 		 */
 		physical_rs->rs_start = logical_rs->rs_start;
 		physical_rs->rs_end = logical_rs->rs_end;
 
 		/* start == end marks the remaining range as empty. */
 		remain_rs->rs_start = logical_rs->rs_start;
 		remain_rs->rs_end = logical_rs->rs_start;
 
 		return;
 	}
 
 	vdev_t *pvd = vd->vdev_parent;
 	ASSERT3P(pvd, !=, NULL);
 	ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
 
 	/*
 	 * As this recursive function unwinds, translate the logical
 	 * range into its physical and any remaining components by calling
 	 * the vdev specific translate function.
 	 */
-	range_seg64_t intermediate = { 0 };
+	zfs_range_seg64_t intermediate = { 0 };
 	pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs);
 
 	/* The parent's output becomes the input for the next level down. */
 	physical_rs->rs_start = intermediate.rs_start;
 	physical_rs->rs_end = intermediate.rs_end;
 }
 
 /*
  * Translate a logical range piece by piece and call func(arg, &physical_rs)
  * for each non-empty physical range produced.  Each pass translates the
  * current range with vdev_xlate() and then continues with the remaining
  * range it reported, until that range is empty.
  */
 void
-vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
+vdev_xlate_walk(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
     vdev_xlate_func_t *func, void *arg)
 {
-	range_seg64_t iter_rs = *logical_rs;
-	range_seg64_t physical_rs;
-	range_seg64_t remain_rs;
+	zfs_range_seg64_t iter_rs = *logical_rs;
+	zfs_range_seg64_t physical_rs;
+	zfs_range_seg64_t remain_rs;
 
 	while (!vdev_xlate_is_empty(&iter_rs)) {
 
 		vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs);
 
 		/*
 		 * With raidz and dRAID, it's possible that the logical range
 		 * does not live on this leaf vdev. Only when there is a non-
 		 * zero physical size call the provided function.
 		 */
 		if (!vdev_xlate_is_empty(&physical_rs))
 			func(arg, &physical_rs);
 
 		/* Continue with whatever was left untranslated. */
 		iter_rs = remain_rs;
 	}
 }
 
 /*
  * Write a printable name for vd into buf and return buf.
  *
  * The name is the vdev's path when it has one, the pool name for the root
  * vdev, or "<type>-<id>" for any other interior vdev.  A leaf vdev with no
  * path gets an empty string, so the returned buffer is always a valid
  * string rather than whatever the caller's buffer happened to contain.
  *
  * buf must have room for buflen bytes; the result is truncated to fit.
  */
 static char *
 vdev_name(vdev_t *vd, char *buf, int buflen)
 {
 	if (vd->vdev_path != NULL) {
 		strlcpy(buf, vd->vdev_path, buflen);
 	} else if (strcmp(vd->vdev_ops->vdev_op_type, "root") == 0) {
 		strlcpy(buf, vd->vdev_spa->spa_name, buflen);
 	} else if (!vd->vdev_ops->vdev_op_leaf) {
 		snprintf(buf, buflen, "%s-%llu",
 		    vd->vdev_ops->vdev_op_type,
 		    (u_longlong_t)vd->vdev_id);
 	} else if (buflen > 0) {
 		/* Leaf without a path: there is no name to report. */
 		buf[0] = '\0';
 	}
 	return (buf);
 }
 
 /*
  * Look at the vdev tree and determine whether any devices are currently
  * being replaced.  Returns B_TRUE if vdev, or any vdev below it, is a
  * 'replacing' vdev or a 'spare' vdev that still counts as in progress.
  */
 boolean_t
 vdev_replace_in_progress(vdev_t *vdev)
 {
 	ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0);
 
 	if (vdev->vdev_ops == &vdev_replacing_ops)
 		return (B_TRUE);
 
 	/*
 	 * A 'spare' vdev means a replace is in progress, except when it
 	 * has exactly two children and the second one, the hot spare, has
 	 * finished being resilvered.
 	 */
 	if (vdev->vdev_ops == &vdev_spare_ops) {
 		if (vdev->vdev_children > 2)
 			return (B_TRUE);
 		if (!vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING))
 			return (B_TRUE);
 	}
 
 	for (int i = 0; i < vdev->vdev_children; i++) {
 		vdev_t *child = vdev->vdev_child[i];
 
 		if (vdev_replace_in_progress(child))
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Add a (source=src, propname=propval) list to an nvlist.
  */
 static void
 vdev_prop_add_list(nvlist_t *nvl, const char *propname, const char *strval,
     uint64_t intval, zprop_source_t src)
 {
 	nvlist_t *propval;
 
 	propval = fnvlist_alloc();
 	fnvlist_add_uint64(propval, ZPROP_SOURCE, src);
 
 	if (strval != NULL)
 		fnvlist_add_string(propval, ZPROP_VALUE, strval);
 	else
 		fnvlist_add_uint64(propval, ZPROP_VALUE, intval);
 
 	fnvlist_add_nvlist(nvl, propname, propval);
 	nvlist_free(propval);
 }
 
 static void
 vdev_props_set_sync(void *arg, dmu_tx_t *tx)
 {
 	vdev_t *vd;
 	nvlist_t *nvp = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	nvpair_t *elem = NULL;
 	uint64_t vdev_guid;
 	uint64_t objid;
 	nvlist_t *nvprops;
 
 	vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV);
 	nvprops = fnvlist_lookup_nvlist(nvp, ZPOOL_VDEV_PROPS_SET_PROPS);
 	vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
 
 	/* this vdev could get removed while waiting for this sync task */
 	if (vd == NULL)
 		return;
 
 	/*
 	 * Set vdev property values in the vdev props mos object.
 	 */
 	if (vd->vdev_root_zap != 0) {
 		objid = vd->vdev_root_zap;
 	} else if (vd->vdev_top_zap != 0) {
 		objid = vd->vdev_top_zap;
 	} else if (vd->vdev_leaf_zap != 0) {
 		objid = vd->vdev_leaf_zap;
 	} else {
 		panic("unexpected vdev type");
 	}
 
 	mutex_enter(&spa->spa_props_lock);
 
 	while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
 		uint64_t intval;
 		const char *strval;
 		vdev_prop_t prop;
 		const char *propname = nvpair_name(elem);
 		zprop_type_t proptype;
 
 		switch (prop = vdev_name_to_prop(propname)) {
 		case VDEV_PROP_USERPROP:
 			if (vdev_prop_user(propname)) {
 				strval = fnvpair_value_string(elem);
 				if (strlen(strval) == 0) {
 					/* remove the property if value == "" */
 					(void) zap_remove(mos, objid, propname,
 					    tx);
 				} else {
 					VERIFY0(zap_update(mos, objid, propname,
 					    1, strlen(strval) + 1, strval, tx));
 				}
 				spa_history_log_internal(spa, "vdev set", tx,
 				    "vdev_guid=%llu: %s=%s",
 				    (u_longlong_t)vdev_guid, nvpair_name(elem),
 				    strval);
 			}
 			break;
 		default:
 			/* normalize the property name */
 			propname = vdev_prop_to_name(prop);
 			proptype = vdev_prop_get_type(prop);
 
 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
 				ASSERT(proptype == PROP_TYPE_STRING);
 				strval = fnvpair_value_string(elem);
 				VERIFY0(zap_update(mos, objid, propname,
 				    1, strlen(strval) + 1, strval, tx));
 				spa_history_log_internal(spa, "vdev set", tx,
 				    "vdev_guid=%llu: %s=%s",
 				    (u_longlong_t)vdev_guid, nvpair_name(elem),
 				    strval);
 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
 				intval = fnvpair_value_uint64(elem);
 
 				if (proptype == PROP_TYPE_INDEX) {
 					const char *unused;
 					VERIFY0(vdev_prop_index_to_string(
 					    prop, intval, &unused));
 				}
 				VERIFY0(zap_update(mos, objid, propname,
 				    sizeof (uint64_t), 1, &intval, tx));
 				spa_history_log_internal(spa, "vdev set", tx,
 				    "vdev_guid=%llu: %s=%lld",
 				    (u_longlong_t)vdev_guid,
 				    nvpair_name(elem), (longlong_t)intval);
 			} else {
 				panic("invalid vdev property type %u",
 				    nvpair_type(elem));
 			}
 		}
 
 	}
 
 	mutex_exit(&spa->spa_props_lock);
 }
 
 int
 vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	spa_t *spa = vd->vdev_spa;
 	nvpair_t *elem = NULL;
 	uint64_t vdev_guid;
 	nvlist_t *nvprops;
 	int error = 0;
 
 	ASSERT(vd != NULL);
 
 	/* Check that vdev has a zap we can use */
 	if (vd->vdev_root_zap == 0 &&
 	    vd->vdev_top_zap == 0 &&
 	    vd->vdev_leaf_zap == 0)
 		return (SET_ERROR(EINVAL));
 
 	if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV,
 	    &vdev_guid) != 0)
 		return (SET_ERROR(EINVAL));
 
 	if (nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS,
 	    &nvprops) != 0)
 		return (SET_ERROR(EINVAL));
 
 	if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL)
 		return (SET_ERROR(EINVAL));
 
 	while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
 		const char *propname = nvpair_name(elem);
 		vdev_prop_t prop = vdev_name_to_prop(propname);
 		uint64_t intval = 0;
 		const char *strval = NULL;
 
 		if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) {
 			error = EINVAL;
 			goto end;
 		}
 
 		if (prop != VDEV_PROP_USERPROP && vdev_prop_readonly(prop)) {
 			error = EROFS;
 			goto end;
 		}
 
 		/* Special Processing */
 		switch (prop) {
 		case VDEV_PROP_PATH:
 			if (vd->vdev_path == NULL) {
 				error = EROFS;
 				break;
 			}
 			if (nvpair_value_string(elem, &strval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			/* New path must start with /dev/ */
 			if (strncmp(strval, "/dev/", 5)) {
 				error = EINVAL;
 				break;
 			}
 			error = spa_vdev_setpath(spa, vdev_guid, strval);
 			break;
 		case VDEV_PROP_ALLOCATING:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			if (intval != vd->vdev_noalloc)
 				break;
 			if (intval == 0)
 				error = spa_vdev_noalloc(spa, vdev_guid);
 			else
 				error = spa_vdev_alloc(spa, vdev_guid);
 			break;
 		case VDEV_PROP_FAILFAST:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			vd->vdev_failfast = intval & 1;
 			break;
 		case VDEV_PROP_CHECKSUM_N:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			vd->vdev_checksum_n = intval;
 			break;
 		case VDEV_PROP_CHECKSUM_T:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			vd->vdev_checksum_t = intval;
 			break;
 		case VDEV_PROP_IO_N:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			vd->vdev_io_n = intval;
 			break;
 		case VDEV_PROP_IO_T:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			vd->vdev_io_t = intval;
 			break;
 		case VDEV_PROP_SLOW_IO_N:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			vd->vdev_slow_io_n = intval;
 			break;
 		case VDEV_PROP_SLOW_IO_T:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			vd->vdev_slow_io_t = intval;
 			break;
 		default:
 			/* Most processing is done in vdev_props_set_sync */
 			break;
 		}
 end:
 		if (error != 0) {
 			intval = error;
 			vdev_prop_add_list(outnvl, propname, strval, intval, 0);
 			return (error);
 		}
 	}
 
 	return (dsl_sync_task(spa->spa_name, NULL, vdev_props_set_sync,
 	    innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
 int
 vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	int err = 0;
 	uint64_t objid;
 	uint64_t vdev_guid;
 	nvpair_t *elem = NULL;
 	nvlist_t *nvprops = NULL;
 	uint64_t intval = 0;
 	char *strval = NULL;
 	const char *propname = NULL;
 	vdev_prop_t prop;
 
 	ASSERT(vd != NULL);
 	ASSERT(mos != NULL);
 
 	if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV,
 	    &vdev_guid) != 0)
 		return (SET_ERROR(EINVAL));
 
 	nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops);
 
 	if (vd->vdev_root_zap != 0) {
 		objid = vd->vdev_root_zap;
 	} else if (vd->vdev_top_zap != 0) {
 		objid = vd->vdev_top_zap;
 	} else if (vd->vdev_leaf_zap != 0) {
 		objid = vd->vdev_leaf_zap;
 	} else {
 		return (SET_ERROR(EINVAL));
 	}
 	ASSERT(objid != 0);
 
 	mutex_enter(&spa->spa_props_lock);
 
 	if (nvprops != NULL) {
 		char namebuf[64] = { 0 };
 
 		while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
 			intval = 0;
 			strval = NULL;
 			propname = nvpair_name(elem);
 			prop = vdev_name_to_prop(propname);
 			zprop_source_t src = ZPROP_SRC_DEFAULT;
 			uint64_t integer_size, num_integers;
 
 			switch (prop) {
 			/* Special Read-only Properties */
 			case VDEV_PROP_NAME:
 				strval = vdev_name(vd, namebuf,
 				    sizeof (namebuf));
 				if (strval == NULL)
 					continue;
 				vdev_prop_add_list(outnvl, propname, strval, 0,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_CAPACITY:
 				/* percent used */
 				intval = (vd->vdev_stat.vs_dspace == 0) ? 0 :
 				    (vd->vdev_stat.vs_alloc * 100 /
 				    vd->vdev_stat.vs_dspace);
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    intval, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_STATE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_state, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_GUID:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_guid, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_ASIZE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_asize, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_PSIZE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_psize, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_ASHIFT:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_ashift, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_SIZE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_dspace, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_FREE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_dspace -
 				    vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_ALLOCATED:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_EXPANDSZ:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_esize, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_FRAGMENTATION:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_fragmentation,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_PARITY:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vdev_get_nparity(vd), ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_PATH:
 				if (vd->vdev_path == NULL)
 					continue;
 				vdev_prop_add_list(outnvl, propname,
 				    vd->vdev_path, 0, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_DEVID:
 				if (vd->vdev_devid == NULL)
 					continue;
 				vdev_prop_add_list(outnvl, propname,
 				    vd->vdev_devid, 0, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_PHYS_PATH:
 				if (vd->vdev_physpath == NULL)
 					continue;
 				vdev_prop_add_list(outnvl, propname,
 				    vd->vdev_physpath, 0, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_ENC_PATH:
 				if (vd->vdev_enc_sysfs_path == NULL)
 					continue;
 				vdev_prop_add_list(outnvl, propname,
 				    vd->vdev_enc_sysfs_path, 0, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_FRU:
 				if (vd->vdev_fru == NULL)
 					continue;
 				vdev_prop_add_list(outnvl, propname,
 				    vd->vdev_fru, 0, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_PARENT:
 				if (vd->vdev_parent != NULL) {
 					strval = vdev_name(vd->vdev_parent,
 					    namebuf, sizeof (namebuf));
 					vdev_prop_add_list(outnvl, propname,
 					    strval, 0, ZPROP_SRC_NONE);
 				}
 				continue;
 			case VDEV_PROP_CHILDREN:
 				if (vd->vdev_children > 0)
 					strval = kmem_zalloc(ZAP_MAXVALUELEN,
 					    KM_SLEEP);
 				for (uint64_t i = 0; i < vd->vdev_children;
 				    i++) {
 					const char *vname;
 
 					vname = vdev_name(vd->vdev_child[i],
 					    namebuf, sizeof (namebuf));
 					if (vname == NULL)
 						vname = "(unknown)";
 					if (strlen(strval) > 0)
 						strlcat(strval, ",",
 						    ZAP_MAXVALUELEN);
 					strlcat(strval, vname, ZAP_MAXVALUELEN);
 				}
 				if (strval != NULL) {
 					vdev_prop_add_list(outnvl, propname,
 					    strval, 0, ZPROP_SRC_NONE);
 					kmem_free(strval, ZAP_MAXVALUELEN);
 				}
 				continue;
 			case VDEV_PROP_NUMCHILDREN:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_children, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_READ_ERRORS:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_read_errors,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_WRITE_ERRORS:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_write_errors,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_CHECKSUM_ERRORS:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_checksum_errors,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_INITIALIZE_ERRORS:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_initialize_errors,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_TRIM_ERRORS:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_trim_errors,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_SLOW_IOS:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_slow_ios,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_OPS_NULL:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_ops[ZIO_TYPE_NULL],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_OPS_READ:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_ops[ZIO_TYPE_READ],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_OPS_WRITE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_ops[ZIO_TYPE_WRITE],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_OPS_FREE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_ops[ZIO_TYPE_FREE],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_OPS_CLAIM:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_ops[ZIO_TYPE_CLAIM],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_OPS_TRIM:
 				/*
 				 * TRIM ops and bytes are reported to user
 				 * space as ZIO_TYPE_FLUSH.  This is done to
 				 * preserve the vdev_stat_t structure layout
 				 * for user space.
 				 */
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_ops[ZIO_TYPE_FLUSH],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_BYTES_NULL:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_bytes[ZIO_TYPE_NULL],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_BYTES_READ:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_bytes[ZIO_TYPE_READ],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_BYTES_WRITE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_bytes[ZIO_TYPE_WRITE],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_BYTES_FREE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_bytes[ZIO_TYPE_FREE],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_BYTES_CLAIM:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_bytes[ZIO_TYPE_CLAIM],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_BYTES_TRIM:
 				/*
 				 * TRIM ops and bytes are reported to user
 				 * space as ZIO_TYPE_FLUSH.  This is done to
 				 * preserve the vdev_stat_t structure layout
 				 * for user space.
 				 */
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_bytes[ZIO_TYPE_FLUSH],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_REMOVING:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_removing, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_RAIDZ_EXPANDING:
 				/* Only expose this for raidz */
 				if (vd->vdev_ops == &vdev_raidz_ops) {
 					vdev_prop_add_list(outnvl, propname,
 					    NULL, vd->vdev_rz_expanding,
 					    ZPROP_SRC_NONE);
 				}
 				continue;
 			case VDEV_PROP_TRIM_SUPPORT:
 				/* only valid for leaf vdevs */
 				if (vd->vdev_ops->vdev_op_leaf) {
 					vdev_prop_add_list(outnvl, propname,
 					    NULL, vd->vdev_has_trim,
 					    ZPROP_SRC_NONE);
 				}
 				continue;
 			/* Numeric Properites */
 			case VDEV_PROP_ALLOCATING:
 				/* Leaf vdevs cannot have this property */
 				if (vd->vdev_mg == NULL &&
 				    vd->vdev_top != NULL) {
 					src = ZPROP_SRC_NONE;
 					intval = ZPROP_BOOLEAN_NA;
 				} else {
 					err = vdev_prop_get_int(vd, prop,
 					    &intval);
 					if (err && err != ENOENT)
 						break;
 
 					if (intval ==
 					    vdev_prop_default_numeric(prop))
 						src = ZPROP_SRC_DEFAULT;
 					else
 						src = ZPROP_SRC_LOCAL;
 				}
 
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    intval, src);
 				break;
 			case VDEV_PROP_FAILFAST:
 				src = ZPROP_SRC_LOCAL;
 				strval = NULL;
 
 				err = zap_lookup(mos, objid, nvpair_name(elem),
 				    sizeof (uint64_t), 1, &intval);
 				if (err == ENOENT) {
 					intval = vdev_prop_default_numeric(
 					    prop);
 					err = 0;
 				} else if (err) {
 					break;
 				}
 				if (intval == vdev_prop_default_numeric(prop))
 					src = ZPROP_SRC_DEFAULT;
 
 				vdev_prop_add_list(outnvl, propname, strval,
 				    intval, src);
 				break;
 			case VDEV_PROP_CHECKSUM_N:
 			case VDEV_PROP_CHECKSUM_T:
 			case VDEV_PROP_IO_N:
 			case VDEV_PROP_IO_T:
 			case VDEV_PROP_SLOW_IO_N:
 			case VDEV_PROP_SLOW_IO_T:
 				err = vdev_prop_get_int(vd, prop, &intval);
 				if (err && err != ENOENT)
 					break;
 
 				if (intval == vdev_prop_default_numeric(prop))
 					src = ZPROP_SRC_DEFAULT;
 				else
 					src = ZPROP_SRC_LOCAL;
 
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    intval, src);
 				break;
 			/* Text Properties */
 			case VDEV_PROP_COMMENT:
 				/* Exists in the ZAP below */
 				/* FALLTHRU */
 			case VDEV_PROP_USERPROP:
 				/* User Properites */
 				src = ZPROP_SRC_LOCAL;
 
 				err = zap_length(mos, objid, nvpair_name(elem),
 				    &integer_size, &num_integers);
 				if (err)
 					break;
 
 				switch (integer_size) {
 				case 8:
 					/* User properties cannot be integers */
 					err = EINVAL;
 					break;
 				case 1:
 					/* string property */
 					strval = kmem_alloc(num_integers,
 					    KM_SLEEP);
 					err = zap_lookup(mos, objid,
 					    nvpair_name(elem), 1,
 					    num_integers, strval);
 					if (err) {
 						kmem_free(strval,
 						    num_integers);
 						break;
 					}
 					vdev_prop_add_list(outnvl, propname,
 					    strval, 0, src);
 					kmem_free(strval, num_integers);
 					break;
 				}
 				break;
 			default:
 				err = ENOENT;
 				break;
 			}
 			if (err)
 				break;
 		}
 	} else {
 		/*
 		 * Get all properties from the MOS vdev property object.
 		 */
 		zap_cursor_t zc;
 		zap_attribute_t *za = zap_attribute_alloc();
 		for (zap_cursor_init(&zc, mos, objid);
 		    (err = zap_cursor_retrieve(&zc, za)) == 0;
 		    zap_cursor_advance(&zc)) {
 			intval = 0;
 			strval = NULL;
 			zprop_source_t src = ZPROP_SRC_DEFAULT;
 			propname = za->za_name;
 
 			switch (za->za_integer_length) {
 			case 8:
 				/* We do not allow integer user properties */
 				/* This is likely an internal value */
 				break;
 			case 1:
 				/* string property */
 				strval = kmem_alloc(za->za_num_integers,
 				    KM_SLEEP);
 				err = zap_lookup(mos, objid, za->za_name, 1,
 				    za->za_num_integers, strval);
 				if (err) {
 					kmem_free(strval, za->za_num_integers);
 					break;
 				}
 				vdev_prop_add_list(outnvl, propname, strval, 0,
 				    src);
 				kmem_free(strval, za->za_num_integers);
 				break;
 
 			default:
 				break;
 			}
 		}
 		zap_cursor_fini(&zc);
 		zap_attribute_free(za);
 	}
 
 	mutex_exit(&spa->spa_props_lock);
 	if (err && err != ENOENT) {
 		return (err);
 	}
 
 	return (0);
 }
 
 EXPORT_SYMBOL(vdev_fault);
 EXPORT_SYMBOL(vdev_degrade);
 EXPORT_SYMBOL(vdev_online);
 EXPORT_SYMBOL(vdev_offline);
 EXPORT_SYMBOL(vdev_clear);
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW,
 	"Target number of metaslabs per top-level vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW,
 	"Default lower limit for metaslab size");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_ms_shift, UINT, ZMOD_RW,
 	"Default upper limit for metaslab size");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, UINT, ZMOD_RW,
 	"Minimum number of metaslabs per top-level vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW,
 	"Practical upper limit of total metaslabs per top-level vdev");
 
 ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW,
 	"Rate limit slow IO (delay) events to this many per second");
 
 ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW,
 	"Rate limit hung IO (deadman) events to this many per second");
 
 ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW,
 	"Rate Direct I/O write verify events to this many per second");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify, UINT, ZMOD_RW,
 	"Direct I/O writes will perform for checksum verification before "
 	"commiting write");
 
 ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW,
 	"Rate limit checksum events to this many checksum errors per second "
 	"(do not set below ZED threshold).");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW,
 	"Ignore errors during resilver/scrub");
 
 ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW,
 	"Bypass vdev_validate()");
 
 ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW,
 	"Disable cache flushes");
 
 ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW,
 	"Minimum number of metaslabs required to dedicate one for log blocks");
 
 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift,
 	param_set_min_auto_ashift, param_get_uint, ZMOD_RW,
 	"Minimum ashift used when creating new top-level vdevs");
 
 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift,
 	param_set_max_auto_ashift, param_get_uint, ZMOD_RW,
 	"Maximum ashift used when optimizing for logical -> physical sector "
 	"size on new top-level vdevs");
 
 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, raidz_impl,
 		param_set_raidz_impl, param_get_raidz_impl, ZMOD_RW,
 		"RAIDZ implementation");
diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c
index 419c8ac5bb28..45f8bcfbd4ed 100644
--- a/module/zfs/vdev_draid.c
+++ b/module/zfs/vdev_draid.c
@@ -1,2821 +1,2821 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2018 Intel Corporation.
  * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_draid.h>
 #include <sys/vdev_raidz.h>
 #include <sys/vdev_rebuild.h>
 #include <sys/abd.h>
 #include <sys/zio.h>
 #include <sys/nvpair.h>
 #include <sys/zio_checksum.h>
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
 #include <zfs_fletcher.h>
 
 #ifdef ZFS_DEBUG
 #include <sys/vdev.h>	/* For vdev_xlate() in vdev_draid_io_verify() */
 #endif
 
 /*
  * dRAID is a distributed spare implementation for ZFS. A dRAID vdev is
  * comprised of multiple raidz redundancy groups which are spread over the
  * dRAID children. To ensure an even distribution, and avoid hot spots, a
  * permutation mapping is applied to the order of the dRAID children.
  * This mixing effectively distributes the parity columns evenly over all
  * of the disks in the dRAID.
  *
  * This is beneficial because it means when resilvering all of the disks
  * can participate thereby increasing the available IOPs and bandwidth.
  * Furthermore, by reserving a small fraction of each child's total capacity
  * virtual distributed spare disks can be created. These spares similarly
  * benefit from the performance gains of spanning all of the children. The
  * consequence of which is that resilvering to a distributed spare can
  * substantially reduce the time required to restore full parity to a pool
  * with failed disks.
  *
  * === dRAID group layout ===
  *
  * First, let's define a "row" in the configuration to be a 16M chunk from
  * each physical drive at the same offset. This is the minimum allowable
  * size since it must be possible to store a full 16M block when there is
  * only a single data column. Next, we define a "group" to be a set of
  * sequential disks containing both the parity and data columns. We allow
  * groups to span multiple rows in order to align any group size to any
  * number of physical drives. Finally, a "slice" is comprised of the rows
  * which contain the target number of groups. The permutation mappings
  * are applied in a round robin fashion to each slice.
  *
  * Given D+P drives in a group (including parity drives) and C-S physical
  * drives (not including the spare drives), we can distribute the groups
  * across R rows without remainder by selecting the least common multiple
  * of D+P and C-S as the number of columns in a slice; i.e.
  * ngroups = LCM(D+P, C-S) / (D+P) and R = LCM(D+P, C-S) / (C-S).
  *
  * In the example below, there are C=14 physical drives in the configuration
  * with S=2 drives worth of spare capacity. Each group has a width of 9
  * which includes D=8 data and P=1 parity drive. There are 4 groups and
  * 3 rows per slice.  Each group has a size of 144M (16M * 9) and the slice
  * size is 576M (144M * 4). When allocating from a dRAID each group is
  * filled before moving on to the next as shown in slice 0 below.
  *
  *             data disks (8 data + 1 parity)          spares (2)
  *     +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
  *  ^  | 2 | 6 | 1 | 11| 4 | 0 | 7 | 10| 8 | 9 | 13| 5 | 12| 3 | device map 0
  *  |  +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
  *  |  |              group 0              |  group 1..|       |
  *  |  +-----------------------------------+-----------+-------|
  *  |  | 0   1   2   3   4   5   6   7   8 | 36  37  38|       |  r
  *  |  | 9   10  11  12  13  14  15  16  17| 45  46  47|       |  o
  *  |  | 18  19  20  21  22  23  24  25  26| 54  55  56|       |  w
  *     | 27  28  29  30  31  32  33  34  35| 63  64  65|       |  0
  *  s  +-----------------------+-----------------------+-------+
  *  l  |       ..group 1       |        group 2..      |       |
  *  i  +-----------------------+-----------------------+-------+
  *  c  | 39  40  41  42  43  44| 72  73  74  75  76  77|       |  r
  *  e  | 48  49  50  51  52  53| 81  82  83  84  85  86|       |  o
  *  0  | 57  58  59  60  61  62| 90  91  92  93  94  95|       |  w
  *     | 66  67  68  69  70  71| 99 100 101 102 103 104|       |  1
  *  |  +-----------+-----------+-----------------------+-------+
  *  |  |..group 2  |            group 3                |       |
  *  |  +-----------+-----------+-----------------------+-------+
  *  |  | 78  79  80|108 109 110 111 112 113 114 115 116|       |  r
  *  |  | 87  88  89|117 118 119 120 121 122 123 124 125|       |  o
  *  |  | 96  97  98|126 127 128 129 130 131 132 133 134|       |  w
  *  v  |105 106 107|135 136 137 138 139 140 141 142 143|       |  2
  *     +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
  *     | 9 | 11| 12| 2 | 4 | 1 | 3 | 0 | 10| 13| 8 | 5 | 6 | 7 | device map 1
  *  s  +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
  *  l  |              group 4              |  group 5..|       | row 3
  *  i  +-----------------------+-----------+-----------+-------|
  *  c  |       ..group 5       |        group 6..      |       | row 4
  *  e  +-----------+-----------+-----------------------+-------+
  *  1  |..group 6  |            group 7                |       | row 5
  *     +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
  *     | 3 | 5 | 10| 8 | 6 | 11| 12| 0 | 2 | 4 | 7 | 1 | 9 | 13| device map 2
  *  s  +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
  *  l  |              group 8              |  group 9..|       | row 6
  *  i  +-----------------------------------------------+-------|
  *  c  |       ..group 9       |        group 10..     |       | row 7
  *  e  +-----------------------+-----------------------+-------+
  *  2  |..group 10 |            group 11               |       | row 8
  *     +-----------+-----------------------------------+-------+
  *
  * This layout has several advantages over requiring that each row contain
  * a whole number of groups.
  *
  * 1. The group count is not a relevant parameter when defining a dRAID
  *    layout. Only the group width is needed, and *all* groups will have
  *    the desired size.
  *
  * 2. All possible group widths (<= physical disk count) can be supported.
  *
  * 3. The logic within vdev_draid.c is simplified when the group width is
  *    the same for all groups (although some of the logic around computing
  *    permutation numbers and drive offsets is more complicated).
  *
  * N.B. The following array describes all valid dRAID permutation maps.
  * Each row is used to generate a permutation map for a different number
  * of children from a unique seed. The seeds were generated and carefully
  * evaluated by the 'draid' utility in order to provide balanced mappings.
  * In addition to the seed a checksum of the in-memory mapping is stored
  * for verification.
  *
  * The imbalance ratio of a given failure (e.g. 5 disks wide, child 3 failed,
  * with a given permutation map) is the ratio of the amounts of I/O that will
  * be sent to the least and most busy disks when resilvering. The average
  * imbalance ratio (of a given number of disks and permutation map) is the
  * average of the ratios of all possible single and double disk failures.
  *
  * In order to achieve a low imbalance ratio the number of permutations in
  * the mapping must be significantly larger than the number of children.
  * For dRAID the number of permutations has been limited to 512 to minimize
  * the map size. This does result in a gradually increasing imbalance ratio
  * as seen in the table below. Increasing the number of permutations for
  * larger child counts would reduce the imbalance ratio. However, in practice
  * when there are a large number of children each child is responsible for
  * fewer total IOs so it's less of a concern.
  *
  * Note these values are hard coded and must never be changed.  Existing
  * pools depend on the same mapping always being generated in order to
  * read and write from the correct locations.  Any change would make
  * existing pools completely inaccessible.
  */
 /*
  * Each row below appears to be, in order: the number of children, the
  * number of permutations, the seed, and the expected checksum of the
  * generated permutation array (TODO confirm against the draid_map_t field
  * order).  The trailing comment on each row is presumably the average
  * imbalance ratio described above.
  */
 static const draid_map_t draid_maps[VDEV_DRAID_MAX_MAPS] = {
 	{   2, 256, 0x89ef3dabbcc7de37, 0x00000000433d433d },	/* 1.000 */
 	{   3, 256, 0x89a57f3de98121b4, 0x00000000bcd8b7b5 },	/* 1.000 */
 	{   4, 256, 0xc9ea9ec82340c885, 0x00000001819d7c69 },	/* 1.000 */
 	{   5, 256, 0xf46733b7f4d47dfd, 0x00000002a1648d74 },	/* 1.010 */
 	{   6, 256, 0x88c3c62d8585b362, 0x00000003d3b0c2c4 },	/* 1.031 */
 	{   7, 256, 0x3a65d809b4d1b9d5, 0x000000055c4183ee },	/* 1.043 */
 	{   8, 256, 0xe98930e3c5d2e90a, 0x00000006edfb0329 },	/* 1.059 */
 	{   9, 256, 0x5a5430036b982ccb, 0x00000008ceaf6934 },	/* 1.056 */
 	{  10, 256, 0x92bf389e9eadac74, 0x0000000b26668c09 },	/* 1.072 */
 	{  11, 256, 0x74ccebf1dcf3ae80, 0x0000000dd691358c },	/* 1.083 */
 	{  12, 256, 0x8847e41a1a9f5671, 0x00000010a0c63c8e },	/* 1.097 */
 	{  13, 256, 0x7481b56debf0e637, 0x0000001424121fe4 },	/* 1.100 */
 	{  14, 256, 0x559b8c44065f8967, 0x00000016ab2ff079 },	/* 1.121 */
 	{  15, 256, 0x34c49545a2ee7f01, 0x0000001a6028efd6 },	/* 1.103 */
 	{  16, 256, 0xb85f4fa81a7698f7, 0x0000001e95ff5e66 },	/* 1.111 */
 	{  17, 256, 0x6353e47b7e47aba0, 0x00000021a81fa0fe },	/* 1.133 */
 	{  18, 256, 0xaa549746b1cbb81c, 0x00000026f02494c9 },	/* 1.131 */
 	{  19, 256, 0x892e343f2f31d690, 0x00000029eb392835 },	/* 1.130 */
 	{  20, 256, 0x76914824db98cc3f, 0x0000003004f31a7c },	/* 1.141 */
 	{  21, 256, 0x4b3cbabf9cfb1d0f, 0x00000036363a2408 },	/* 1.139 */
 	{  22, 256, 0xf45c77abb4f035d4, 0x00000038dd0f3e84 },	/* 1.150 */
 	{  23, 256, 0x5e18bd7f3fd4baf4, 0x0000003f0660391f },	/* 1.174 */
 	{  24, 256, 0xa7b3a4d285d6503b, 0x000000443dfc9ff6 },	/* 1.168 */
 	{  25, 256, 0x56ac7dd967521f5a, 0x0000004b03a87eb7 },	/* 1.180 */
 	{  26, 256, 0x3a42dfda4eb880f7, 0x000000522c719bba },	/* 1.226 */
 	{  27, 256, 0xd200d2fc6b54bf60, 0x0000005760b4fdf5 },	/* 1.228 */
 	{  28, 256, 0xc52605bbd486c546, 0x0000005e00d8f74c },	/* 1.217 */
 	{  29, 256, 0xc761779e63cd762f, 0x00000067be3cd85c },	/* 1.239 */
 	{  30, 256, 0xca577b1e07f85ca5, 0x0000006f5517f3e4 },	/* 1.238 */
 	{  31, 256, 0xfd50a593c518b3d4, 0x0000007370e7778f },	/* 1.273 */
 	{  32, 512, 0xc6c87ba5b042650b, 0x000000f7eb08a156 },	/* 1.191 */
 	{  33, 512, 0xc3880d0c9d458304, 0x0000010734b5d160 },	/* 1.199 */
 	{  34, 512, 0xe920927e4d8b2c97, 0x00000118c1edbce0 },	/* 1.195 */
 	{  35, 512, 0x8da7fcda87bde316, 0x0000012a3e9f9110 },	/* 1.201 */
 	{  36, 512, 0xcf09937491514a29, 0x0000013bd6a24bef },	/* 1.194 */
 	{  37, 512, 0x9b5abbf345cbd7cc, 0x0000014b9d90fac3 },	/* 1.237 */
 	{  38, 512, 0x506312a44668d6a9, 0x0000015e1b5f6148 },	/* 1.242 */
 	{  39, 512, 0x71659ede62b4755f, 0x00000173ef029bcd },	/* 1.231 */
 	{  40, 512, 0xa7fde73fb74cf2d7, 0x000001866fb72748 },	/* 1.233 */
 	{  41, 512, 0x19e8b461a1dea1d3, 0x000001a046f76b23 },	/* 1.271 */
 	{  42, 512, 0x031c9b868cc3e976, 0x000001afa64c49d3 },	/* 1.263 */
 	{  43, 512, 0xbaa5125faa781854, 0x000001c76789e278 },	/* 1.270 */
 	{  44, 512, 0x4ed55052550d721b, 0x000001d800ccd8eb },	/* 1.281 */
 	{  45, 512, 0x0fd63ddbdff90677, 0x000001f08ad59ed2 },	/* 1.282 */
 	{  46, 512, 0x36d66546de7fdd6f, 0x000002016f09574b },	/* 1.286 */
 	{  47, 512, 0x99f997e7eafb69d7, 0x0000021e42e47cb6 },	/* 1.329 */
 	{  48, 512, 0xbecd9c2571312c5d, 0x000002320fe2872b },	/* 1.286 */
 	{  49, 512, 0xd97371329e488a32, 0x0000024cd73f2ca7 },	/* 1.322 */
 	{  50, 512, 0x30e9b136670749ee, 0x000002681c83b0e0 },	/* 1.335 */
 	{  51, 512, 0x11ad6bc8f47aaeb4, 0x0000027e9261b5d5 },	/* 1.305 */
 	{  52, 512, 0x68e445300af432c1, 0x0000029aa0eb7dbf },	/* 1.330 */
 	{  53, 512, 0x910fb561657ea98c, 0x000002b3dca04853 },	/* 1.365 */
 	{  54, 512, 0xd619693d8ce5e7a5, 0x000002cc280e9c97 },	/* 1.334 */
 	{  55, 512, 0x24e281f564dbb60a, 0x000002e9fa842713 },	/* 1.364 */
 	{  56, 512, 0x947a7d3bdaab44c5, 0x000003046680f72e },	/* 1.374 */
 	{  57, 512, 0x2d44fec9c093e0de, 0x00000324198ba810 },	/* 1.363 */
 	{  58, 512, 0x87743c272d29bb4c, 0x0000033ec48c9ac9 },	/* 1.401 */
 	{  59, 512, 0x96aa3b6f67f5d923, 0x0000034faead902c },	/* 1.392 */
 	{  60, 512, 0x94a4f1faf520b0d3, 0x0000037d713ab005 },	/* 1.360 */
 	{  61, 512, 0xb13ed3a272f711a2, 0x00000397368f3cbd },	/* 1.396 */
 	{  62, 512, 0x3b1b11805fa4a64a, 0x000003b8a5e2840c },	/* 1.453 */
 	{  63, 512, 0x4c74caad9172ba71, 0x000003d4be280290 },	/* 1.437 */
 	{  64, 512, 0x035ff643923dd29e, 0x000003fad6c355e1 },	/* 1.402 */
 	{  65, 512, 0x768e9171b11abd3c, 0x0000040eb07fed20 },	/* 1.459 */
 	{  66, 512, 0x75880e6f78a13ddd, 0x000004433d6acf14 },	/* 1.423 */
 	{  67, 512, 0x910b9714f698a877, 0x00000451ea65d5db },	/* 1.447 */
 	{  68, 512, 0x87f5db6f9fdcf5c7, 0x000004732169e3f7 },	/* 1.450 */
 	{  69, 512, 0x836d4968fbaa3706, 0x000004954068a380 },	/* 1.455 */
 	{  70, 512, 0xc567d73a036421ab, 0x000004bd7cb7bd3d },	/* 1.463 */
 	{  71, 512, 0x619df40f240b8fed, 0x000004e376c2e972 },	/* 1.463 */
 	{  72, 512, 0x42763a680d5bed8e, 0x000005084275c680 },	/* 1.452 */
 	{  73, 512, 0x5866f064b3230431, 0x0000052906f2c9ab },	/* 1.498 */
 	{  74, 512, 0x9fa08548b1621a44, 0x0000054708019247 },	/* 1.526 */
 	{  75, 512, 0xb6053078ce0fc303, 0x00000572cc5c72b0 },	/* 1.491 */
 	{  76, 512, 0x4a7aad7bf3890923, 0x0000058e987bc8e9 },	/* 1.470 */
 	{  77, 512, 0xe165613fd75b5a53, 0x000005c20473a211 },	/* 1.527 */
 	{  78, 512, 0x3ff154ac878163a6, 0x000005d659194bf3 },	/* 1.509 */
 	{  79, 512, 0x24b93ade0aa8a532, 0x0000060a201c4f8e },	/* 1.569 */
 	{  80, 512, 0xc18e2d14cd9bb554, 0x0000062c55cfe48c },	/* 1.555 */
 	{  81, 512, 0x98cc78302feb58b6, 0x0000066656a07194 },	/* 1.509 */
 	{  82, 512, 0xc6c5fd5a2abc0543, 0x0000067cff94fbf8 },	/* 1.596 */
 	{  83, 512, 0xa7962f514acbba21, 0x000006ab7b5afa2e },	/* 1.568 */
 	{  84, 512, 0xba02545069ddc6dc, 0x000006d19861364f },	/* 1.541 */
 	{  85, 512, 0x447c73192c35073e, 0x000006fce315ce35 },	/* 1.623 */
 	{  86, 512, 0x48beef9e2d42b0c2, 0x00000720a8e38b6b },	/* 1.620 */
 	{  87, 512, 0x4874cf98541a35e0, 0x00000758382a2273 },	/* 1.597 */
 	{  88, 512, 0xad4cf8333a31127a, 0x00000781e1651b1b },	/* 1.575 */
 	{  89, 512, 0x47ae4859d57888c1, 0x000007b27edbe5bc },	/* 1.627 */
 	{  90, 512, 0x06f7723cfe5d1891, 0x000007dc2a96d8eb },	/* 1.596 */
 	{  91, 512, 0xd4e44218d660576d, 0x0000080ac46f02d5 },	/* 1.622 */
 	{  92, 512, 0x7066702b0d5be1f2, 0x00000832c96d154e },	/* 1.695 */
 	{  93, 512, 0x011209b4f9e11fb9, 0x0000085eefda104c },	/* 1.605 */
 	{  94, 512, 0x47ffba30a0b35708, 0x00000899badc32dc },	/* 1.625 */
 	{  95, 512, 0x1a95a6ac4538aaa8, 0x000008b6b69a42b2 },	/* 1.687 */
 	{  96, 512, 0xbda2b239bb2008eb, 0x000008f22d2de38a },	/* 1.621 */
 	{  97, 512, 0x7ffa0bea90355c6c, 0x0000092e5b23b816 },	/* 1.699 */
 	{  98, 512, 0x1d56ba34be426795, 0x0000094f482e5d1b },	/* 1.688 */
 	{  99, 512, 0x0aa89d45c502e93d, 0x00000977d94a98ce },	/* 1.642 */
 	{ 100, 512, 0x54369449f6857774, 0x000009c06c9b34cc },	/* 1.683 */
 	{ 101, 512, 0xf7d4dd8445b46765, 0x000009e5dc542259 },	/* 1.755 */
 	{ 102, 512, 0xfa8866312f169469, 0x00000a16b54eae93 },	/* 1.692 */
 	{ 103, 512, 0xd8a5aea08aef3ff9, 0x00000a381d2cbfe7 },	/* 1.747 */
 	{ 104, 512, 0x66bcd2c3d5f9ef0e, 0x00000a8191817be7 },	/* 1.751 */
 	{ 105, 512, 0x3fb13a47a012ec81, 0x00000ab562b9a254 },	/* 1.751 */
 	{ 106, 512, 0x43100f01c9e5e3ca, 0x00000aeee84c185f },	/* 1.726 */
 	{ 107, 512, 0xca09c50ccee2d054, 0x00000b1c359c047d },	/* 1.788 */
 	{ 108, 512, 0xd7176732ac503f9b, 0x00000b578bc52a73 },	/* 1.740 */
 	{ 109, 512, 0xed206e51f8d9422d, 0x00000b8083e0d960 },	/* 1.780 */
 	{ 110, 512, 0x17ead5dc6ba0dcd6, 0x00000bcfb1a32ca8 },	/* 1.836 */
 	{ 111, 512, 0x5f1dc21e38a969eb, 0x00000c0171becdd6 },	/* 1.778 */
 	{ 112, 512, 0xddaa973de33ec528, 0x00000c3edaba4b95 },	/* 1.831 */
 	{ 113, 512, 0x2a5eccd7735a3630, 0x00000c630664e7df },	/* 1.825 */
 	{ 114, 512, 0xafcccee5c0b71446, 0x00000cb65392f6e4 },	/* 1.826 */
 	{ 115, 512, 0x8fa30c5e7b147e27, 0x00000cd4db391e55 },	/* 1.843 */
 	{ 116, 512, 0x5afe0711fdfafd82, 0x00000d08cb4ec35d },	/* 1.826 */
 	{ 117, 512, 0x533a6090238afd4c, 0x00000d336f115d1b },	/* 1.803 */
 	{ 118, 512, 0x90cf11b595e39a84, 0x00000d8e041c2048 },	/* 1.857 */
 	{ 119, 512, 0x0d61a3b809444009, 0x00000dcb798afe35 },	/* 1.877 */
 	{ 120, 512, 0x7f34da0f54b0d114, 0x00000df3922664e1 },	/* 1.849 */
 	{ 121, 512, 0xa52258d5b72f6551, 0x00000e4d37a9872d },	/* 1.867 */
 	{ 122, 512, 0xc1de54d7672878db, 0x00000e6583a94cf6 },	/* 1.978 */
 	{ 123, 512, 0x1d03354316a414ab, 0x00000ebffc50308d },	/* 1.947 */
 	{ 124, 512, 0xcebdcc377665412c, 0x00000edee1997cea },	/* 1.865 */
 	{ 125, 512, 0x4ddd4c04b1a12344, 0x00000f21d64b373f },	/* 1.881 */
 	{ 126, 512, 0x64fc8f94e3973658, 0x00000f8f87a8896b },	/* 1.882 */
 	{ 127, 512, 0x68765f78034a334e, 0x00000fb8fe62197e },	/* 1.867 */
 	{ 128, 512, 0xaf36b871a303e816, 0x00000fec6f3afb1e },	/* 1.972 */
 	{ 129, 512, 0x2a4cbf73866c3a28, 0x00001027febfe4e5 },	/* 1.896 */
 	{ 130, 512, 0x9cb128aacdcd3b2f, 0x0000106aa8ac569d },	/* 1.965 */
 	{ 131, 512, 0x5511d41c55869124, 0x000010bbd755ddf1 },	/* 1.963 */
 	{ 132, 512, 0x42f92461937f284a, 0x000010fb8bceb3b5 },	/* 1.925 */
 	{ 133, 512, 0xe2d89a1cf6f1f287, 0x0000114cf5331e34 },	/* 1.862 */
 	{ 134, 512, 0xdc631a038956200e, 0x0000116428d2adc5 },	/* 2.042 */
 	{ 135, 512, 0xb2e5ac222cd236be, 0x000011ca88e4d4d2 },	/* 1.935 */
 	{ 136, 512, 0xbc7d8236655d88e7, 0x000011e39cb94e66 },	/* 2.005 */
 	{ 137, 512, 0x073e02d88d2d8e75, 0x0000123136c7933c },	/* 2.041 */
 	{ 138, 512, 0x3ddb9c3873166be0, 0x00001280e4ec6d52 },	/* 1.997 */
 	{ 139, 512, 0x7d3b1a845420e1b5, 0x000012c2e7cd6a44 },	/* 1.996 */
 	{ 140, 512, 0x60102308aa7b2a6c, 0x000012fc490e6c7d },	/* 2.053 */
 	{ 141, 512, 0xdb22bb2f9eb894aa, 0x00001343f5a85a1a },	/* 1.971 */
 	{ 142, 512, 0xd853f879a13b1606, 0x000013bb7d5f9048 },	/* 2.018 */
 	{ 143, 512, 0x001620a03f804b1d, 0x000013e74cc794fd },	/* 1.961 */
 	{ 144, 512, 0xfdb52dda76fbf667, 0x00001442d2f22480 },	/* 2.046 */
 	{ 145, 512, 0xa9160110f66e24ff, 0x0000144b899f9dbb },	/* 1.968 */
 	{ 146, 512, 0x77306a30379ae03b, 0x000014cb98eb1f81 },	/* 2.143 */
 	{ 147, 512, 0x14f5985d2752319d, 0x000014feab821fc9 },	/* 2.064 */
 	{ 148, 512, 0xa4b8ff11de7863f8, 0x0000154a0e60b9c9 },	/* 2.023 */
 	{ 149, 512, 0x44b345426455c1b3, 0x000015999c3c569c },	/* 2.136 */
 	{ 150, 512, 0x272677826049b46c, 0x000015c9697f4b92 },	/* 2.063 */
 	{ 151, 512, 0x2f9216e2cd74fe40, 0x0000162b1f7bbd39 },	/* 1.974 */
 	{ 152, 512, 0x706ae3e763ad8771, 0x00001661371c55e1 },	/* 2.210 */
 	{ 153, 512, 0xf7fd345307c2480e, 0x000016e251f28b6a },	/* 2.006 */
 	{ 154, 512, 0x6e94e3d26b3139eb, 0x000016f2429bb8c6 },	/* 2.193 */
 	{ 155, 512, 0x5458bbfbb781fcba, 0x0000173efdeca1b9 },	/* 2.163 */
 	{ 156, 512, 0xa80e2afeccd93b33, 0x000017bfdcb78adc },	/* 2.046 */
 	{ 157, 512, 0x1e4ccbb22796cf9d, 0x00001826fdcc39c9 },	/* 2.084 */
 	{ 158, 512, 0x8fba4b676aaa3663, 0x00001841a1379480 },	/* 2.264 */
 	{ 159, 512, 0xf82b843814b315fa, 0x000018886e19b8a3 },	/* 2.074 */
 	{ 160, 512, 0x7f21e920ecf753a3, 0x0000191812ca0ea7 },	/* 2.282 */
 	{ 161, 512, 0x48bb8ea2c4caa620, 0x0000192f310faccf },	/* 2.148 */
 	{ 162, 512, 0x5cdb652b4952c91b, 0x0000199e1d7437c7 },	/* 2.355 */
 	{ 163, 512, 0x6ac1ba6f78c06cd4, 0x000019cd11f82c70 },	/* 2.164 */
 	{ 164, 512, 0x9faf5f9ca2669a56, 0x00001a18d5431f6a },	/* 2.393 */
 	{ 165, 512, 0xaa57e9383eb01194, 0x00001a9e7d253d85 },	/* 2.178 */
 	{ 166, 512, 0x896967bf495c34d2, 0x00001afb8319b9fc },	/* 2.334 */
 	{ 167, 512, 0xdfad5f05de225f1b, 0x00001b3a59c3093b },	/* 2.266 */
 	{ 168, 512, 0xfd299a99f9f2abdd, 0x00001bb6f1a10799 },	/* 2.304 */
 	{ 169, 512, 0xdda239e798fe9fd4, 0x00001bfae0c9692d },	/* 2.218 */
 	{ 170, 512, 0x5fca670414a32c3e, 0x00001c22129dbcff },	/* 2.377 */
 	{ 171, 512, 0x1bb8934314b087de, 0x00001c955db36cd0 },	/* 2.155 */
 	{ 172, 512, 0xd96394b4b082200d, 0x00001cfc8619b7e6 },	/* 2.404 */
 	{ 173, 512, 0xb612a7735b1c8cbc, 0x00001d303acdd585 },	/* 2.205 */
 	{ 174, 512, 0x28e7430fe5875fe1, 0x00001d7ed5b3697d },	/* 2.359 */
 	{ 175, 512, 0x5038e89efdd981b9, 0x00001dc40ec35c59 },	/* 2.158 */
 	{ 176, 512, 0x075fd78f1d14db7c, 0x00001e31c83b4a2b },	/* 2.614 */
 	{ 177, 512, 0xc50fafdb5021be15, 0x00001e7cdac82fbc },	/* 2.239 */
 	{ 178, 512, 0xe6dc7572ce7b91c7, 0x00001edd8bb454fc },	/* 2.493 */
 	{ 179, 512, 0x21f7843e7beda537, 0x00001f3a8e019d6c },	/* 2.327 */
 	{ 180, 512, 0xc83385e20b43ec82, 0x00001f70735ec137 },	/* 2.231 */
 	{ 181, 512, 0xca818217dddb21fd, 0x0000201ca44c5a3c },	/* 2.237 */
 	{ 182, 512, 0xe6035defea48f933, 0x00002038e3346658 },	/* 2.691 */
 	{ 183, 512, 0x47262a4f953dac5a, 0x000020c2e554314e },	/* 2.170 */
 	{ 184, 512, 0xe24c7246260873ea, 0x000021197e618d64 },	/* 2.600 */
 	{ 185, 512, 0xeef6b57c9b58e9e1, 0x0000217ea48ecddc },	/* 2.391 */
 	{ 186, 512, 0x2becd3346e386142, 0x000021c496d4a5f9 },	/* 2.677 */
 	{ 187, 512, 0x63c6207bdf3b40a3, 0x0000220e0f2eec0c },	/* 2.410 */
 	{ 188, 512, 0x3056ce8989767d4b, 0x0000228eb76cd137 },	/* 2.776 */
 	{ 189, 512, 0x91af61c307cee780, 0x000022e17e2ea501 },	/* 2.266 */
 	{ 190, 512, 0xda359da225f6d54f, 0x00002358a2debc19 },	/* 2.717 */
 	{ 191, 512, 0x0a5f7a2a55607ba0, 0x0000238a79dac18c },	/* 2.474 */
 	{ 192, 512, 0x27bb75bf5224638a, 0x00002403a58e2351 },	/* 2.673 */
 	{ 193, 512, 0x1ebfdb94630f5d0f, 0x00002492a10cb339 },	/* 2.420 */
 	{ 194, 512, 0x6eae5e51d9c5f6fb, 0x000024ce4bf98715 },	/* 2.898 */
 	{ 195, 512, 0x08d903b4daedc2e0, 0x0000250d1e15886c },	/* 2.363 */
 	{ 196, 512, 0xc722a2f7fa7cd686, 0x0000258a99ed0c9e },	/* 2.747 */
 	{ 197, 512, 0x8f71faf0e54e361d, 0x000025dee11976f5 },	/* 2.531 */
 	{ 198, 512, 0x87f64695c91a54e7, 0x0000264e00a43da0 },	/* 2.707 */
 	{ 199, 512, 0xc719cbac2c336b92, 0x000026d327277ac1 },	/* 2.315 */
 	{ 200, 512, 0xe7e647afaf771ade, 0x000027523a5c44bf },	/* 3.012 */
 	{ 201, 512, 0x12d4b5c38ce8c946, 0x0000273898432545 },	/* 2.378 */
 	{ 202, 512, 0xf2e0cd4067bdc94a, 0x000027e47bb2c935 },	/* 2.969 */
 	{ 203, 512, 0x21b79f14d6d947d3, 0x0000281e64977f0d },	/* 2.594 */
 	{ 204, 512, 0x515093f952f18cd6, 0x0000289691a473fd },	/* 2.763 */
 	{ 205, 512, 0xd47b160a1b1022c8, 0x00002903e8b52411 },	/* 2.457 */
 	{ 206, 512, 0xc02fc96684715a16, 0x0000297515608601 },	/* 3.057 */
 	{ 207, 512, 0xef51e68efba72ed0, 0x000029ef73604804 },	/* 2.590 */
 	{ 208, 512, 0x9e3be6e5448b4f33, 0x00002a2846ed074b },	/* 3.047 */
 	{ 209, 512, 0x81d446c6d5fec063, 0x00002a92ca693455 },	/* 2.676 */
 	{ 210, 512, 0xff215de8224e57d5, 0x00002b2271fe3729 },	/* 2.993 */
 	{ 211, 512, 0xe2524d9ba8f69796, 0x00002b64b99c3ba2 },	/* 2.457 */
 	{ 212, 512, 0xf6b28e26097b7e4b, 0x00002bd768b6e068 },	/* 3.182 */
 	{ 213, 512, 0x893a487f30ce1644, 0x00002c67f722b4b2 },	/* 2.563 */
 	{ 214, 512, 0x386566c3fc9871df, 0x00002cc1cf8b4037 },	/* 3.025 */
 	{ 215, 512, 0x1e0ed78edf1f558a, 0x00002d3948d36c7f },	/* 2.730 */
 	{ 216, 512, 0xe3bc20c31e61f113, 0x00002d6d6b12e025 },	/* 3.036 */
 	{ 217, 512, 0xd6c3ad2e23021882, 0x00002deff7572241 },	/* 2.722 */
 	{ 218, 512, 0xb4a9f95cf0f69c5a, 0x00002e67d537aa36 },	/* 3.356 */
 	{ 219, 512, 0x6e98ed6f6c38e82f, 0x00002e9720626789 },	/* 2.697 */
 	{ 220, 512, 0x2e01edba33fddac7, 0x00002f407c6b0198 },	/* 2.979 */
 	{ 221, 512, 0x559d02e1f5f57ccc, 0x00002fb6a5ab4f24 },	/* 2.858 */
 	{ 222, 512, 0xac18f5a916adcd8e, 0x0000304ae1c5c57e },	/* 3.258 */
 	{ 223, 512, 0x15789fbaddb86f4b, 0x0000306f6e019c78 },	/* 2.693 */
 	{ 224, 512, 0xf4a9c36d5bc4c408, 0x000030da40434213 },	/* 3.259 */
 	{ 225, 512, 0xf640f90fd2727f44, 0x00003189ed37b90c },	/* 2.733 */
 	{ 226, 512, 0xb5313d390d61884a, 0x000031e152616b37 },	/* 3.235 */
 	{ 227, 512, 0x4bae6b3ce9160939, 0x0000321f40aeac42 },	/* 2.983 */
 	{ 228, 512, 0x838c34480f1a66a1, 0x000032f389c0f78e },	/* 3.308 */
 	{ 229, 512, 0xb1c4a52c8e3d6060, 0x0000330062a40284 },	/* 2.715 */
 	{ 230, 512, 0xe0f1110c6d0ed822, 0x0000338be435644f },	/* 3.540 */
 	{ 231, 512, 0x9f1a8ccdcea68d4b, 0x000034045a4e97e1 },	/* 2.779 */
 	{ 232, 512, 0x3261ed62223f3099, 0x000034702cfc401c },	/* 3.084 */
 	{ 233, 512, 0xf2191e2311022d65, 0x00003509dd19c9fc },	/* 2.987 */
 	{ 234, 512, 0xf102a395c2033abc, 0x000035654dc96fae },	/* 3.341 */
 	{ 235, 512, 0x11fe378f027906b6, 0x000035b5193b0264 },	/* 2.793 */
 	{ 236, 512, 0xf777f2c026b337aa, 0x000036704f5d9297 },	/* 3.518 */
 	{ 237, 512, 0x1b04e9c2ee143f32, 0x000036dfbb7af218 },	/* 2.962 */
 	{ 238, 512, 0x2fcec95266f9352c, 0x00003785c8df24a9 },	/* 3.196 */
 	{ 239, 512, 0xfe2b0e47e427dd85, 0x000037cbdf5da729 },	/* 2.914 */
 	{ 240, 512, 0x72b49bf2225f6c6d, 0x0000382227c15855 },	/* 3.408 */
 	{ 241, 512, 0x50486b43df7df9c7, 0x0000389b88be6453 },	/* 2.903 */
 	{ 242, 512, 0x5192a3e53181c8ab, 0x000038ddf3d67263 },	/* 3.778 */
 	{ 243, 512, 0xe9f5d8365296fd5e, 0x0000399f1c6c9e9c },	/* 3.026 */
 	{ 244, 512, 0xc740263f0301efa8, 0x00003a147146512d },	/* 3.347 */
 	{ 245, 512, 0x23cd0f2b5671e67d, 0x00003ab10bcc0d9d },	/* 3.212 */
 	{ 246, 512, 0x002ccc7e5cd41390, 0x00003ad6cd14a6c0 },	/* 3.482 */
 	{ 247, 512, 0x9aafb3c02544b31b, 0x00003b8cb8779fb0 },	/* 3.146 */
 	{ 248, 512, 0x72ba07a78b121999, 0x00003c24142a5a3f },	/* 3.626 */
 	{ 249, 512, 0x3d784aa58edfc7b4, 0x00003cd084817d99 },	/* 2.952 */
 	{ 250, 512, 0xaab750424d8004af, 0x00003d506a8e098e },	/* 3.463 */
 	{ 251, 512, 0x84403fcf8e6b5ca2, 0x00003d4c54c2aec4 },	/* 3.131 */
 	{ 252, 512, 0x71eb7455ec98e207, 0x00003e655715cf2c },	/* 3.538 */
 	{ 253, 512, 0xd752b4f19301595b, 0x00003ecd7b2ca5ac },	/* 2.974 */
 	{ 254, 512, 0xc4674129750499de, 0x00003e99e86d3e95 },	/* 3.843 */
 	{ 255, 512, 0x9772baff5cd12ef5, 0x00003f895c019841 },	/* 3.088 */
 };
 
 /*
  * Sanity check a generated permutation array.  Every row must be a
  * permutation of the child indices 0..children-1 and, when a non-zero
  * checksum is supplied, the first word of the fletcher-4 checksum of the
  * whole array must equal it.  Returns 0 when the array is valid, EINVAL
  * for a malformed row, or ECKSUM for a checksum mismatch.
  */
 static int
 verify_perms(uint8_t *perms, uint64_t children, uint64_t nperms,
     uint64_t checksum)
 {
 	int error = 0;
 	int seensz = sizeof (uint16_t) * children;
 	uint16_t *seen = kmem_zalloc(seensz, KM_SLEEP);
 
 	for (int row = 0; row < nperms; row++) {
 		uint8_t *rowp = &perms[row * children];
 
 		for (int col = 0; col < children; col++) {
 			uint8_t dev = rowp[col];
 
 			/*
 			 * Before row N is scanned every index has been seen
 			 * exactly N times; any other count means the index
 			 * is duplicated within the current row.
 			 */
 			if (dev >= children || seen[dev] != row) {
 				error = EINVAL;
 				goto out;
 			}
 
 			seen[dev]++;
 		}
 	}
 
 	/* A zero checksum means the caller has nothing to compare against */
 	if (checksum != 0) {
 		int permssz = sizeof (uint8_t) * children * nperms;
 		zio_cksum_t cksum;
 
 		fletcher_4_native_varsize(perms, permssz, &cksum);
 
 		if (cksum.zc_word[0] != checksum)
 			error = ECKSUM;
 	}
 
 out:
 	kmem_free(seen, seensz);
 
 	return (error);
 }
 
 /*
  * Generate the permutation array for the draid_map_t.  These maps control
  * the placement of all data in a dRAID.  Therefore it's critical that the
  * seed always generates the same mapping.  We provide our own pseudo-random
  * number generator for this purpose.
  *
  * map    - describes the map to build; dm_children, dm_nperms and dm_seed
  *          are consumed here and dm_checksum is passed to verify_perms().
  *          dm_perms must be NULL on entry.
  * permsp - on success receives the vmem_alloc()'d array of
  *          dm_nperms * dm_children bytes, one row per permutation.
  *
  * Returns 0 on success.  If verify_perms() rejects the generated array
  * the array is freed, *permsp is left untouched, and that error is returned.
  */
 int
 vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp)
 {
 	VERIFY3U(map->dm_children, >=, VDEV_DRAID_MIN_CHILDREN);
 	VERIFY3U(map->dm_children, <=, VDEV_DRAID_MAX_CHILDREN);
 	VERIFY3U(map->dm_seed, !=, 0);
 	VERIFY3U(map->dm_nperms, !=, 0);
 	VERIFY3P(map->dm_perms, ==, NULL);
 
 #ifdef _KERNEL
 	/*
 	 * The kernel code always provides both a map_seed and checksum.
 	 * Only the tests/zfs-tests/cmd/draid/draid.c utility will provide
 	 * a zero checksum when generating new candidate maps.
 	 */
 	VERIFY3U(map->dm_checksum, !=, 0);
 #endif
 	uint64_t children = map->dm_children;
 	uint64_t nperms = map->dm_nperms;
 	/* One byte per child per row; rows are stored back to back */
 	int rowsz = sizeof (uint8_t) * children;
 	int permssz = rowsz * nperms;
 	uint8_t *perms;
 
 	/* Allocate the permutation array */
 	perms = vmem_alloc(permssz, KM_SLEEP);
 
 	/* Setup an initial row with a known pattern */
 	uint8_t *initial_row = kmem_alloc(rowsz, KM_SLEEP);
 	for (int i = 0; i < children; i++)
 		initial_row[i] = i;
 
 	/* Generator state: a fixed constant paired with the per-map seed */
 	uint64_t draid_seed[2] = { VDEV_DRAID_SEED, map->dm_seed };
 	uint8_t *current_row, *previous_row = initial_row;
 
 	/*
 	 * Perform a Fisher-Yates shuffle of each row using the previous
 	 * row as the starting point.  An initial_row with known pattern
 	 * is used as the input for the first row.
 	 */
 	for (int i = 0; i < nperms; i++) {
 		current_row = &perms[i * children];
 		memcpy(current_row, previous_row, rowsz);
 
 		/* Swap slot j with a pseudo-randomly chosen slot k in [0, j] */
 		for (int j = children - 1; j > 0; j--) {
 			uint64_t k = vdev_draid_rand(draid_seed) % (j + 1);
 			uint8_t val = current_row[j];
 			current_row[j] = current_row[k];
 			current_row[k] = val;
 		}
 
 		previous_row = current_row;
 	}
 
 	/* The known-pattern row was only needed as input to the first row */
 	kmem_free(initial_row, rowsz);
 
 	/* A zero dm_checksum skips the checksum comparison in verify_perms() */
 	int error = verify_perms(perms, children, nperms, map->dm_checksum);
 	if (error) {
 		vmem_free(perms, permssz);
 		return (error);
 	}
 
 	*permsp = perms;
 
 	return (0);
 }
 
 /*
  * Find the hard coded draid_map_t whose child count equals 'children'.
  * On a match *mapp is pointed at the table entry and 0 is returned;
  * otherwise *mapp is not modified and ENOENT is returned.
  */
 int
 vdev_draid_lookup_map(uint64_t children, const draid_map_t **mapp)
 {
 	const draid_map_t *entry = draid_maps;
 	const draid_map_t *end = draid_maps + VDEV_DRAID_MAX_MAPS;
 
 	for (; entry != end; entry++) {
 		if (entry->dm_children != children)
 			continue;
 
 		*mapp = entry;
 		return (0);
 	}
 
 	return (ENOENT);
 }
 
 /*
  * Translate a permutation index into the permutation row to use (*base)
  * and the iteration id to apply to it (*iter).  The index wraps after
  * vdc_nperms * vdc_children values; each row is returned for vdc_children
  * consecutive indices while the iteration id counts up from zero.
  */
 static void
 vdev_draid_get_perm(vdev_draid_config_t *vdc, uint64_t pindex,
     uint8_t **base, uint64_t *iter)
 {
 	uint64_t width = vdc->vdc_children;
 	uint64_t wrapped = pindex % (vdc->vdc_nperms * width);
 	uint64_t within_row = wrapped % width;
 
 	/* wrapped - within_row is the byte offset of the start of the row */
 	*base = vdc->vdc_perms + (wrapped - within_row);
 	*iter = within_row;
 }
 
 /*
  * Return the child id for position 'index' of the permutation row 'base':
  * the row entry shifted by 'iter', wrapped to the number of children.
  */
 static inline uint64_t
 vdev_draid_permute_id(vdev_draid_config_t *vdc,
     uint8_t *base, uint64_t iter, uint64_t index)
 {
 	uint64_t shifted = base[index] + iter;
 
 	return (shifted % vdc->vdc_children);
 }
 
 /*
  * Convert a psize to the asize consumed on a dRAID vdev, i.e.
  * vdev_draid_psize_to_asize().  The psize is split into rows of
  * (vdc_ndata << ashift) bytes, rounding up, and every row occupies a
  * full group width of sectors.  The txg argument is unused.
  */
 static uint64_t
 vdev_draid_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
 {
 	(void) txg;
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	uint64_t ashift = vd->vdev_ashift;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	/* Bytes of data held by a single row of the group */
 	uint64_t row_data = vdc->vdc_ndata << ashift;
 	uint64_t nrows = 1 + (psize - 1) / row_data;
 	uint64_t asize = (nrows * vdc->vdc_groupwidth) << ashift;
 
 	ASSERT3U(asize, !=, 0);
 	ASSERT3U(asize % (vdc->vdc_groupwidth), ==, 0);
 
 	return (asize);
 }
 
 /*
  * Convert an asize back to the psize it can hold by dropping the parity
  * share: of every vdc_groupwidth units allocated, vdc_ndata hold data.
  * The asize must be a multiple of the group width.
  */
 uint64_t
 vdev_draid_asize_to_psize(vdev_t *vd, uint64_t asize)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 
 	ASSERT0(asize % vdc->vdc_groupwidth);
 
 	uint64_t per_column = asize / vdc->vdc_groupwidth;
 
 	return (per_column * vdc->vdc_ndata);
 }
 
 /*
  * Return the number of the group containing the given logical offset,
  * i.e. the offset divided by the group size (rounded down).
  */
 static uint64_t
 vdev_draid_offset_to_group(vdev_t *vd, uint64_t offset)
 {
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	vdev_draid_config_t *config = vd->vdev_tsd;
 	uint64_t group = offset / config->vdc_groupsz;
 
 	return (group);
 }
 
 /*
  * Return the logical offset at which the given group begins, i.e. the
  * group number multiplied by the group size.
  */
 static uint64_t
 vdev_draid_group_to_offset(vdev_t *vd, uint64_t group)
 {
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	vdev_draid_config_t *config = vd->vdev_tsd;
 	uint64_t start = group * config->vdc_groupsz;
 
 	return (start);
 }
 
 /*
  * Set up the data column ABDs for a full stripe write.  Every column
  * (D+P) is written, and parity is computed across all of them including
  * zero filled padding, so each data ABD has to end up exactly as large as
  * the parity ABDs.  Although a normal read only needs the data columns,
  * a sequential resilver reconstructs from all of the columns.
  *
  * Three cases are handled for each data column:
  *  - empty column: a zero filled skip sector is mapped,
  *  - "big column": the matching range of the zio ABD is mapped directly,
  *  - short column: a gang ABD joins the zio ABD range with a zero filled
  *    skip sector.
  *
  * On return rc_size of every data column equals the parity size.
  */
 static void
 vdev_draid_map_alloc_write(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
 {
 	uint64_t sector = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
 	uint64_t full_size = rr->rr_col[0].rc_size;
 	uint64_t cursor = abd_offset;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 	ASSERT3U(full_size, ==, abd_get_size(rr->rr_col[0].rc_abd));
 
 	for (uint64_t i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
 		raidz_col_t *col = &rr->rr_col[i];
 		uint64_t data_size = col->rc_size;
 
 		if (data_size == 0) {
 			/* nothing to write here (small write), pad it out */
 			ASSERT3U(sector, ==, full_size);
 			col->rc_abd = abd_get_zeros(sector);
 		} else if (data_size == full_size) {
 			/* "big column", no padding is needed */
 			col->rc_abd = abd_get_offset_struct(
 			    &col->rc_abdstruct, zio->io_abd, cursor,
 			    data_size);
 		} else {
 			/* one sector short, append a skip sector */
 			ASSERT3U(data_size + sector, ==, full_size);
 			col->rc_abd = abd_alloc_gang();
 			abd_gang_add(col->rc_abd, abd_get_offset_size(
 			    zio->io_abd, cursor, data_size), B_TRUE);
 			abd_gang_add(col->rc_abd, abd_get_zeros(sector),
 			    B_TRUE);
 		}
 
 		ASSERT3U(abd_get_size(col->rc_abd), ==, full_size);
 
 		/* advance by the data consumed, not by the padded size */
 		cursor += data_size;
 		col->rc_size = full_size;
 	}
 
 	IMPLY(abd_offset != 0, cursor == zio->io_size);
 }
 
 /*
  * Scrub/resilver reads.  In order to store the contents of the skip sectors
  * an additional ABD is allocated.  The columns are handled in the same way
  * as a full stripe write except instead of using the zero ABD the newly
  * allocated skip ABD is used to back the skip sectors.  In all cases the
  * data ABD must be the same size as the parity ABDs.
  *
  * zio        - the read zio whose io_abd receives the data
  * abd_offset - offset into zio->io_abd where this row's data begins
  * rr         - the row to populate; rr_abd_empty must be NULL on entry
  *
  * On return every data column has an rc_abd attached and its rc_size has
  * been set to the size of that ABD.
  */
 static void
 vdev_draid_map_alloc_scrub(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
 {
 	uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
 	uint64_t parity_size = rr->rr_col[0].rc_size;
 	uint64_t abd_off = abd_offset;
 	uint64_t skip_off = 0;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 	ASSERT3P(rr->rr_abd_empty, ==, NULL);
 
 	/* One linear buffer holds all of the row's skip sectors. */
 	if (rr->rr_nempty > 0) {
 		rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size,
 		    B_FALSE);
 	}
 
 	for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (rc->rc_size == 0) {
 			/* empty data column (small read), add a skip sector */
 			ASSERT3U(skip_size, ==, parity_size);
 			ASSERT3U(rr->rr_nempty, !=, 0);
 			rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty,
 			    skip_off, skip_size);
 			skip_off += skip_size;
 		} else if (rc->rc_size == parity_size) {
 			/* this is a "big column" */
 			rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
 			    zio->io_abd, abd_off, rc->rc_size);
 		} else {
 			/* short data column, add a skip sector */
 			ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
 			ASSERT3U(rr->rr_nempty, !=, 0);
 			rc->rc_abd = abd_alloc_gang();
 			abd_gang_add(rc->rc_abd, abd_get_offset_size(
 			    zio->io_abd, abd_off, rc->rc_size), B_TRUE);
 			abd_gang_add(rc->rc_abd, abd_get_offset_size(
 			    rr->rr_abd_empty, skip_off, skip_size), B_TRUE);
 			skip_off += skip_size;
 		}
 
 		uint64_t abd_size = abd_get_size(rc->rc_abd);
 		ASSERT3U(abd_size, ==, abd_get_size(rr->rr_col[0].rc_abd));
 
 		/*
 		 * Increase rc_size so the skip ABD is included in subsequent
 		 * parity calculations.  abd_off must be advanced by the
 		 * original data size first, before rc_size is overwritten.
 		 */
 		abd_off += rc->rc_size;
 		rc->rc_size = abd_size;
 	}
 
 	/* A non-zero abd_offset implies this row consumes the rest of the zio. */
 	IMPLY(abd_offset != 0, abd_off == zio->io_size);
 	/* Every skip sector in the empty ABD must have been handed out. */
 	ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size);
 }
 
 /*
  * Normal reads.  In this common case only the columns which hold data are
  * mapped onto the zio ABD.  Neither the parity columns nor the empty skip
  * sectors are read unless the checksum fails verification, in which case
  * vdev_raidz_read_all() will call vdev_draid_map_alloc_empty() to expand
  * the raid map so reconstruction can use the parity data and skip sectors.
  */
 static void
 vdev_draid_map_alloc_read(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
 {
 	uint64_t abd_off = abd_offset;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 
 	for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		/* Columns without data get no ABD and consume no offset. */
 		if (rc->rc_size == 0)
 			continue;
 
 		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
 		    zio->io_abd, abd_off, rc->rc_size);
 		abd_off += rc->rc_size;
 	}
 
 	IMPLY(abd_offset != 0, abd_off == zio->io_size);
 }
 
 /*
  * Converts a normal "read" raidz_row_t to a "scrub" raidz_row_t. The key
  * difference is that an ABD is allocated to back skip sectors so they may
  * be read in to memory, verified, and repaired if needed.
  *
  * zio - the read zio the row belongs to
  * rr  - the row to expand; rr_abd_empty must be NULL on entry
  *
  * On return every data column has rc_size equal to the parity column size.
  */
 void
 vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr)
 {
 	uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
 	uint64_t parity_size = rr->rr_col[0].rc_size;
 	uint64_t skip_off = 0;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 	ASSERT3P(rr->rr_abd_empty, ==, NULL);
 
 	/* One linear buffer holds all of the row's skip sectors. */
 	if (rr->rr_nempty > 0) {
 		rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size,
 		    B_FALSE);
 	}
 
 	for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (rc->rc_size == 0) {
 			/* empty data column (small read), add a skip sector */
 			ASSERT3U(skip_size, ==, parity_size);
 			ASSERT3U(rr->rr_nempty, !=, 0);
 			ASSERT3P(rc->rc_abd, ==, NULL);
 			rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty,
 			    skip_off, skip_size);
 			skip_off += skip_size;
 		} else if (rc->rc_size == parity_size) {
 			/* this is a "big column", nothing to add */
 			ASSERT3P(rc->rc_abd, !=, NULL);
 		} else {
 			/*
 			 * short data column, add a skip sector and clear
 			 * rc_tried to force the entire column to be re-read
 			 * thereby including the missing skip sector data
 			 * which is needed for reconstruction.
 			 *
 			 * The existing (non-gang) data ABD becomes the first
 			 * member of a new gang ABD, followed by the skip
 			 * sector taken from rr_abd_empty.
 			 */
 			ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
 			ASSERT3U(rr->rr_nempty, !=, 0);
 			ASSERT3P(rc->rc_abd, !=, NULL);
 			ASSERT(!abd_is_gang(rc->rc_abd));
 			abd_t *read_abd = rc->rc_abd;
 			rc->rc_abd = abd_alloc_gang();
 			abd_gang_add(rc->rc_abd, read_abd, B_TRUE);
 			abd_gang_add(rc->rc_abd, abd_get_offset_size(
 			    rr->rr_abd_empty, skip_off, skip_size), B_TRUE);
 			skip_off += skip_size;
 			rc->rc_tried = 0;
 		}
 
 		/*
 		 * Increase rc_size so the empty ABD is included in subsequent
 		 * parity calculations.
 		 */
 		rc->rc_size = parity_size;
 	}
 
 	/* Every skip sector in the empty ABD must have been handed out. */
 	ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size);
 }
 
 /*
  * Verify that all empty sectors are zero filled before using them to
  * calculate parity.  Otherwise, silent corruption in an empty sector will
  * result in bad parity being generated.  That bad parity will then be
  * considered authoritative and overwrite the good parity on disk.  This
  * is possible because the checksum is only calculated over the data,
  * thus it cannot be used to detect damage in empty sectors.
  *
  * Any skip sector found to be non-zero is reported as a checksum error,
  * zeroed in memory, and its column is marked with ECKSUM.
  *
  * Returns the number of columns whose skip sector was not zero filled.
  */
 int
 vdev_draid_map_verify_empty(zio_t *zio, raidz_row_t *rr)
 {
 	uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
 	uint64_t parity_size = rr->rr_col[0].rc_size;
 	/* The skip sector is the last sector of each non-big column. */
 	uint64_t skip_off = parity_size - skip_size;
 	/* Only used to cross-check the loop against rr_abd_empty's size. */
 	uint64_t empty_off = 0;
 	int ret = 0;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 	ASSERT3P(rr->rr_abd_empty, !=, NULL);
 	ASSERT3U(rr->rr_bigcols, >, 0);
 
 	/* Reference buffer of zeros to compare each skip sector against. */
 	void *zero_buf = kmem_zalloc(skip_size, KM_SLEEP);
 
 	/* Columns below rr_bigcols are "big columns" and are not checked. */
 	for (int c = rr->rr_bigcols; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		ASSERT3P(rc->rc_abd, !=, NULL);
 		ASSERT3U(rc->rc_size, ==, parity_size);
 
 		if (abd_cmp_buf_off(rc->rc_abd, zero_buf, skip_off,
 		    skip_size) != 0) {
 			vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
 			abd_zero_off(rc->rc_abd, skip_off, skip_size);
 			rc->rc_error = SET_ERROR(ECKSUM);
 			ret++;
 		}
 
 		empty_off += skip_size;
 	}
 
 	ASSERT3U(empty_off, ==, abd_get_size(rr->rr_abd_empty));
 
 	kmem_free(zero_buf, skip_size);
 
 	return (ret);
 }
 
 /*
  * Given a logical address within a dRAID configuration, return the physical
  * address on the first drive in the group that this address maps to
  * (at position 'start' in permutation number 'perm').
  *
  * vd             - vdev whose vdev_tsd holds the dRAID configuration
  * logical_offset - byte offset in the dRAID's logical address space
  * perm           - out: the permutation number (group / ngroups)
  * start          - out: index in the permutation array where the group
  *                  containing logical_offset starts
  *
  * Returns the byte offset on the child vdevs.
  */
 static uint64_t
 vdev_draid_logical_to_physical(vdev_t *vd, uint64_t logical_offset,
     uint64_t *perm, uint64_t *start)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 
 	/* b is the dRAID (parent) sector offset. */
 	uint64_t ashift = vd->vdev_top->vdev_ashift;
 	uint64_t b_offset = logical_offset >> ashift;
 
 	/*
 	 * The height of a row in units of the vdev's minimum sector size.
 	 * This is the amount of data written to each disk of each group
 	 * in a given permutation.
 	 */
 	uint64_t rowheight_sectors = VDEV_DRAID_ROWHEIGHT >> ashift;
 
 	/*
 	 * We cycle through a disk permutation every groupsz * ngroups chunk
 	 * of address space. Note that ngroups * groupsz must be a multiple
 	 * of the number of data drives (ndisks) in order to guarantee
 	 * alignment. So, for example, if our row height is 16MB, our group
 	 * size is 10, and there are 13 data drives in the draid, then ngroups
 	 * will be 13, we will change permutation every 2.08GB and each
 	 * disk will have 160MB of data per chunk.
 	 */
 	uint64_t groupwidth = vdc->vdc_groupwidth;
 	uint64_t ngroups = vdc->vdc_ngroups;
 	uint64_t ndisks = vdc->vdc_ndisks;
 
 	/*
 	 * groupstart is where the group this IO will land in "starts" in
 	 * the permutation array.
 	 */
 	uint64_t group = logical_offset / vdc->vdc_groupsz;
 	uint64_t groupstart = (group * groupwidth) % ndisks;
 	/* Barring overflow this checks that groupwidth <= ndisks. */
 	ASSERT3U(groupstart + groupwidth, <=, ndisks + groupstart);
 	*start = groupstart;
 
 	/* b_offset is the sector offset within a group chunk */
 	b_offset = b_offset % (rowheight_sectors * groupwidth);
 	ASSERT0(b_offset % groupwidth);
 
 	/*
 	 * Find the starting byte offset on each child vdev:
 	 * - within a permutation there are ngroups groups spread over the
 	 *   rows, where each row covers a slice portion of the disk
 	 * - each permutation has (groupwidth * ngroups) / ndisks rows
 	 * - so each permutation covers rows * slice portion of the disk
 	 * - so we need to find the row where this IO group target begins
 	 */
 	*perm = group / ngroups;
 	uint64_t row = (*perm * ((groupwidth * ngroups) / ndisks)) +
 	    (((group % ngroups) * groupwidth) / ndisks);
 
 	/* Sectors for the preceding rows plus the offset within this row. */
 	return (((rowheight_sectors * row) +
 	    (b_offset / groupwidth)) << ashift);
 }
 
 /*
  * Allocate and populate one raidz_row_t for the portion of the zio which
  * lands in the redundancy group containing io_offset.
  *
  * zio        - the I/O being mapped
  * rrp        - out: the newly allocated row
  * io_offset  - logical offset in the dRAID at which this row starts
  * abd_offset - offset into zio->io_abd of this row's data
  * abd_size   - bytes of zio->io_abd not yet mapped
  *
  * Returns the number of bytes of abd_size covered by this row.  This is
  * less than abd_size when the I/O extends past the end of the group, in
  * which case the caller is expected to map the remainder with a second row.
  */
 static uint64_t
 vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
     uint64_t abd_offset, uint64_t abd_size)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	uint64_t ashift = vd->vdev_top->vdev_ashift;
 	uint64_t io_size = abd_size;
 	uint64_t io_asize = vdev_draid_asize(vd, io_size, 0);
 	uint64_t group = vdev_draid_offset_to_group(vd, io_offset);
 	/* Logical offset at which the following group begins. */
 	uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1);
 
 	/*
 	 * Limit the io_size to the space remaining in the group.  A second
 	 * row in the raidz_map_t is created for the remainder.
 	 */
 	if (io_offset + io_asize > start_offset) {
 		io_size = vdev_draid_asize_to_psize(vd,
 		    start_offset - io_offset);
 	}
 
 	/*
 	 * At most a block may span the logical end of one group and the start
 	 * of the next group. Therefore, at the end of a group the io_size must
 	 * span the group width evenly and the remainder must be aligned to the
 	 * start of the next group.
 	 */
 	IMPLY(abd_offset == 0 && io_size < zio->io_size,
 	    (io_asize >> ashift) % vdc->vdc_groupwidth == 0);
 	IMPLY(abd_offset != 0,
 	    vdev_draid_group_to_offset(vd, group) == io_offset);
 
 	/* Lookup starting byte offset on each child vdev */
 	uint64_t groupstart, perm;
 	uint64_t physical_offset = vdev_draid_logical_to_physical(vd,
 	    io_offset, &perm, &groupstart);
 
 	/*
 	 * If there are fewer than groupwidth drives available after the group
 	 * start, the group is going to wrap onto the next row. 'wrap' is the
 	 * group disk number that starts on the next row.
 	 */
 	uint64_t ndisks = vdc->vdc_ndisks;
 	uint64_t groupwidth = vdc->vdc_groupwidth;
 	uint64_t wrap = groupwidth;
 
 	if (groupstart + groupwidth > ndisks)
 		wrap = ndisks - groupstart;
 
 	/* The io size in units of the vdev's minimum sector size. */
 	const uint64_t psize = io_size >> ashift;
 
 	/*
 	 * "Quotient": The number of data sectors for this stripe on all but
 	 * the "big column" child vdevs that also contain "remainder" data.
 	 */
 	uint64_t q = psize / vdc->vdc_ndata;
 
 	/*
 	 * "Remainder": The number of partial stripe data sectors in this I/O.
 	 * This will add a sector to some, but not all, child vdevs.
 	 */
 	uint64_t r = psize - q * vdc->vdc_ndata;
 
 	/* The number of "big columns" - those which contain remainder data. */
 	uint64_t bc = (r == 0 ? 0 : r + vdc->vdc_nparity);
 	ASSERT3U(bc, <, groupwidth);
 
 	/* The total number of data and parity sectors for this I/O. */
 	uint64_t tot = psize + (vdc->vdc_nparity * (q + (r == 0 ? 0 : 1)));
 
 	ASSERT3U(vdc->vdc_nparity, >, 0);
 
 	raidz_row_t *rr = vdev_raidz_row_alloc(groupwidth, zio);
 	rr->rr_bigcols = bc;
 	rr->rr_firstdatacol = vdc->vdc_nparity;
 #ifdef ZFS_DEBUG
 	rr->rr_offset = io_offset;
 	rr->rr_size = io_size;
 #endif
 	*rrp = rr;
 
 	/*
 	 * Assign each column its child vdev, physical offset and size.
 	 * 'asize' accumulates the column sizes for the sanity check below.
 	 */
 	uint8_t *base;
 	uint64_t iter, asize = 0;
 	vdev_draid_get_perm(vdc, perm, &base, &iter);
 	for (uint64_t i = 0; i < groupwidth; i++) {
 		raidz_col_t *rc = &rr->rr_col[i];
 		uint64_t c = (groupstart + i) % ndisks;
 
 		/* increment the offset if we wrap to the next row */
 		if (i == wrap)
 			physical_offset += VDEV_DRAID_ROWHEIGHT;
 
 		rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c);
 		rc->rc_offset = physical_offset;
 
 		/* Big columns hold q + 1 sectors, all others q (maybe 0). */
 		if (q == 0 && i >= bc)
 			rc->rc_size = 0;
 		else if (i < bc)
 			rc->rc_size = (q + 1) << ashift;
 		else
 			rc->rc_size = q << ashift;
 
 		asize += rc->rc_size;
 	}
 
 	ASSERT3U(asize, ==, tot << ashift);
 	/* Skip sectors needed to pad the I/O out to whole group widths. */
 	rr->rr_nempty = roundup(tot, groupwidth) - tot;
 	IMPLY(bc > 0, rr->rr_nempty == groupwidth - bc);
 
 	/* Allocate buffers for the parity columns */
 	for (uint64_t c = 0; c < rr->rr_firstdatacol; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
 	}
 
 	/*
 	 * Map buffers for data columns and allocate/map buffers for skip
 	 * sectors.  There are three distinct cases for dRAID which are
 	 * required to support sequential rebuild.
 	 */
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		vdev_draid_map_alloc_write(zio, abd_offset, rr);
 	} else if ((rr->rr_nempty > 0) &&
 	    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
 		vdev_draid_map_alloc_scrub(zio, abd_offset, rr);
 	} else {
 		ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 		vdev_draid_map_alloc_read(zio, abd_offset, rr);
 	}
 
 	return (io_size);
 }
 
 /*
  * Allocate the raidz mapping to be applied to the dRAID I/O.  The parity
  * calculations for dRAID are identical to raidz however there are a few
  * differences in the layout.
  *
  * - dRAID always allocates a full stripe width. Any extra sectors due
  *   to this padding are zero filled and written to disk. They will be
  *   read back during a scrub or repair operation since they are included
  *   in the parity calculation. This property enables sequential
  *   resilvering.
  *
  * - When the block at the logical offset spans redundancy groups then two
  *   rows are allocated in the raidz_map_t. One row resides at the end of
  *   the first group and the other at the start of the following group.
  */
 static raidz_map_t *
 vdev_draid_map_alloc(zio_t *zio)
 {
 	raidz_row_t *rows[2];
 	uint64_t abd_offset = 0;
 	uint64_t abd_size = zio->io_size;
 	uint64_t io_offset = zio->io_offset;
 	int nrows = 1;
 
 	uint64_t size = vdev_draid_map_alloc_row(zio, &rows[0], io_offset,
 	    abd_offset, abd_size);
 
 	/* The first row stopped at a group boundary; map the remainder. */
 	if (size < abd_size) {
 		vdev_t *vd = zio->io_vd;
 
 		io_offset += vdev_draid_asize(vd, size, 0);
 		abd_offset += size;
 		abd_size -= size;
 		nrows++;
 
 		ASSERT3U(io_offset, ==, vdev_draid_group_to_offset(
 		    vd, vdev_draid_offset_to_group(vd, io_offset)));
 		ASSERT3U(abd_offset, <, zio->io_size);
 		ASSERT3U(abd_size, !=, 0);
 
 		size = vdev_draid_map_alloc_row(zio, &rows[1],
 		    io_offset, abd_offset, abd_size);
 		VERIFY3U(size, ==, abd_size);
 	}
 
 	/* The map is sized to hold exactly nrows row pointers. */
 	raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[nrows]),
 	    KM_SLEEP);
 	rm->rm_ops = vdev_raidz_math_get_ops();
 	rm->rm_nrows = nrows;
 	for (int i = 0; i < nrows; i++)
 		rm->rm_row[i] = rows[i];
 
 	return (rm);
 }
 
 /*
  * Round an offset into a dRAID up to the next multiple of the group width
  * (in bytes), which is where an allocation may start.
  */
 static uint64_t
 vdev_draid_get_astart(vdev_t *vd, const uint64_t start)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	uint64_t align;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	align = vdc->vdc_groupwidth << vd->vdev_ashift;
 
 	return (roundup(start, align));
 }
 
 /*
  * Allocatable space for dRAID is (children - nspares) * sizeof(smallest child)
  * rounded down to the last full slice.  So each child must provide at least
  * its 1 / ndisks share of the minimum asize (rounded up), in addition to
  * the space set aside by VDEV_DRAID_REFLOW_RESERVE.
  */
 static uint64_t
 vdev_draid_min_asize(vdev_t *vd)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	uint64_t ndisks = vdc->vdc_ndisks;
 	uint64_t share = (vd->vdev_min_asize + ndisks - 1) / ndisks;
 
 	return (VDEV_DRAID_REFLOW_RESERVE + share);
 }
 
 /*
  * Full stripes are always used by dRAID, so the minimum allocation size is
  * one sector on each of the data disks in the redundancy group.
  */
 static uint64_t
 vdev_draid_min_alloc(vdev_t *vd)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	uint64_t min_alloc;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	min_alloc = vdc->vdc_ndata << vd->vdev_ashift;
 
 	return (min_alloc);
 }
 
 /*
  * Returns true if the txg range does not exist on any leaf vdev.
  *
  * A dRAID spare does not fit into the DTL model. While it has child vdevs
  * there is no redundancy among them, and the effective child vdev is
  * determined by offset. Essentially we do a vdev_dtl_reassess() on the
  * fly by replacing a dRAID spare with the child vdev under the offset.
  * Note that it is a recursive process because the child vdev can be
  * another dRAID spare and so on.
  */
 boolean_t
 vdev_draid_missing(vdev_t *vd, uint64_t physical_offset, uint64_t txg,
     uint64_t size)
 {
 	if (vd->vdev_ops == &vdev_spare_ops ||
 	    vd->vdev_ops == &vdev_replacing_ops) {
 		/*
 		 * Only the readable children are consulted.  As soon as
 		 * one of them has the txg range the data is not missing.
 		 */
 		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 
 			if (vdev_readable(cvd) &&
 			    !vdev_draid_missing(cvd, physical_offset,
 			    txg, size))
 				return (B_FALSE);
 		}
 
 		return (B_TRUE);
 	}
 
 	/* Anything other than a distributed spare: consult its DTL. */
 	if (vd->vdev_ops != &vdev_draid_spare_ops)
 		return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
 
 	/*
 	 * When sequentially resilvering we don't have a proper txg range
 	 * so instead we must presume all txgs are missing on this vdev
 	 * until the resilver completes.  Otherwise check DTL_MISSING which
 	 * is set for all prior txgs when a resilver is started in
 	 * spa_vdev_attach().
 	 */
 	if (vd->vdev_rebuild_txg != 0 ||
 	    vdev_dtl_contains(vd, DTL_MISSING, txg, size))
 		return (B_TRUE);
 
 	/*
 	 * Consult the DTL on the relevant vdev. Either a vdev leaf or
 	 * spare/replace mirror child may be returned so we must
 	 * recursively call vdev_draid_missing().
 	 */
 	vdev_t *child = vdev_draid_spare_get_child(vd, physical_offset);
 	if (child == NULL)
 		return (B_TRUE);
 
 	return (vdev_draid_missing(child, physical_offset, txg, size));
 }
 
 /*
  * Returns true if the txg is only partially replicated on the leaf vdevs.
  */
 static boolean_t
 vdev_draid_partial(vdev_t *vd, uint64_t physical_offset, uint64_t txg,
     uint64_t size)
 {
 	if (vd->vdev_ops == &vdev_spare_ops ||
 	    vd->vdev_ops == &vdev_replacing_ops) {
 		/*
 		 * Only the readable children are consulted.  As soon as
 		 * one of them is missing the txg range it is considered
 		 * partially replicated.
 		 */
 		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 
 			if (vdev_readable(cvd) &&
 			    vdev_draid_partial(cvd, physical_offset,
 			    txg, size))
 				return (B_TRUE);
 		}
 
 		return (B_FALSE);
 	}
 
 	/* Anything other than a distributed spare: consult its DTL. */
 	if (vd->vdev_ops != &vdev_draid_spare_ops)
 		return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
 
 	/*
 	 * When sequentially resilvering we don't have a proper txg range
 	 * so instead we must presume all txgs are missing on this vdev
 	 * until the resilver completes.  Otherwise check DTL_MISSING which
 	 * is set for all prior txgs when a resilver is started in
 	 * spa_vdev_attach().
 	 */
 	if (vd->vdev_rebuild_txg != 0 ||
 	    vdev_dtl_contains(vd, DTL_MISSING, txg, size))
 		return (B_TRUE);
 
 	/*
 	 * Consult the DTL on the relevant vdev. Either a vdev leaf or
 	 * spare/replace mirror child may be returned so we must
 	 * recursively call vdev_draid_partial().
 	 */
 	vdev_t *child = vdev_draid_spare_get_child(vd, physical_offset);
 	if (child == NULL)
 		return (B_TRUE);
 
 	return (vdev_draid_partial(child, physical_offset, txg, size));
 }
 
 /*
  * Determine if the vdev is readable at the given offset.  A distributed
  * spare is first resolved to the child vdev under the offset, and a
  * spare/replacing vdev is readable when any of its children is.
  */
 boolean_t
 vdev_draid_readable(vdev_t *vd, uint64_t physical_offset)
 {
 	if (vd->vdev_ops == &vdev_draid_spare_ops) {
 		vd = vdev_draid_spare_get_child(vd, physical_offset);
 		if (vd == NULL)
 			return (B_FALSE);
 	}
 
 	if (vd->vdev_ops != &vdev_spare_ops &&
 	    vd->vdev_ops != &vdev_replacing_ops)
 		return (vdev_readable(vd));
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (vdev_readable(cvd) &&
 		    vdev_draid_readable(cvd, physical_offset))
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Depth-first search of the provided vdev tree which returns the first
  * distributed spare encountered, or NULL when the tree contains none.
  */
 static vdev_t *
 vdev_draid_find_spare(vdev_t *vd)
 {
 	vdev_t *svd = NULL;
 
 	if (vd->vdev_ops == &vdev_draid_spare_ops)
 		return (vd);
 
 	/* Stop descending as soon as one of the children yields a spare. */
 	for (int c = 0; svd == NULL && c < vd->vdev_children; c++)
 		svd = vdev_draid_find_spare(vd->vdev_child[c]);
 
 	return (svd);
 }
 
 /*
  * Returns B_TRUE if the passed in vdev is currently "faulted".
  * Faulted, in this context, means that the vdev represents a
  * replacing or sparing vdev tree.
  */
 static boolean_t
 vdev_draid_faulted(vdev_t *vd, uint64_t physical_offset)
 {
 	vdev_t *target = vd;
 
 	if (vd->vdev_ops == &vdev_draid_spare_ops) {
 		vdev_t *child = vdev_draid_spare_get_child(vd,
 		    physical_offset);
 		if (child == NULL)
 			return (B_FALSE);
 
 		/*
 		 * The distributed spare has been resolved to the vdev
 		 * under the offset, it's that vdev's parent which tells
 		 * us whether it's "faulted".
 		 */
 		target = child->vdev_parent;
 	}
 
 	return (target->vdev_ops == &vdev_spare_ops ||
 	    target->vdev_ops == &vdev_replacing_ops);
 }
 
 /*
  * Determine if the dRAID block at the logical offset is degraded.
  * Used by sequential resilver.  The offset must be group width aligned.
  */
 static boolean_t
 vdev_draid_group_degraded(vdev_t *vd, uint64_t offset)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	uint64_t first, perm_id, rotation;
 	uint8_t *perm_base;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 	ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset);
 
 	uint64_t child_offset = vdev_draid_logical_to_physical(vd, offset,
 	    &perm_id, &first);
 	vdev_draid_get_perm(vdc, perm_id, &perm_base, &rotation);
 
 	/* Walk every child vdev which backs a column of this group. */
 	for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
 		uint64_t slot = (first + i) % vdc->vdc_ndisks;
 		vdev_t *cvd = vd->vdev_child[vdev_draid_permute_id(vdc,
 		    perm_base, rotation, slot)];
 
 		/*
 		 * The group is degraded when it contains a faulted vdev.
 		 * Groups with active distributed spares are always
 		 * checked because any vdev failure in the pool will
 		 * affect them.
 		 */
 		if (vdev_draid_faulted(cvd, child_offset) ||
 		    vdev_draid_find_spare(cvd) != NULL)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Determine if the txg is missing.  Used by healing resilver.  The offset
  * must be group width aligned.
  */
 static boolean_t
 vdev_draid_group_missing(vdev_t *vd, uint64_t offset, uint64_t txg,
     uint64_t size)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	uint64_t first, perm_id, rotation;
 	uint8_t *perm_base;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 	ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset);
 
 	uint64_t child_offset = vdev_draid_logical_to_physical(vd, offset,
 	    &perm_id, &first);
 	vdev_draid_get_perm(vdc, perm_id, &perm_base, &rotation);
 
 	/* Walk every child vdev which backs a column of this group. */
 	for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
 		uint64_t slot = (first + i) % vdc->vdc_ndisks;
 		vdev_t *cvd = vd->vdev_child[vdev_draid_permute_id(vdc,
 		    perm_base, rotation, slot)];
 
 		/*
 		 * The txg is missing when it is known to be partially
 		 * replicated.  Groups with active distributed spares are
 		 * always checked because any vdev failure in the pool
 		 * will affect them.
 		 */
 		if (vdev_draid_partial(cvd, child_offset, txg, size) ||
 		    vdev_draid_find_spare(cvd) != NULL)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Find the smallest child asize and largest sector size to calculate the
  * available capacity.  Distributed spares are ignored since their capacity
  * is also based on the minimum child size in the top-level dRAID.
  */
 static void
 vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep,
     uint64_t *logical_ashiftp, uint64_t *physical_ashiftp)
 {
 	uint64_t min_sz = 0, min_max_sz = 0;
 	uint64_t lashift = 0, pashift = 0;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	/*
 	 * First pass: smallest sizes and largest logical ashift.  The
 	 * "- 1 ... + 1" form relies on unsigned wrap-around so that the
 	 * initial value of zero never wins the MIN().
 	 */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (cvd->vdev_ops != &vdev_draid_spare_ops) {
 			min_sz = MIN(min_sz - 1, cvd->vdev_asize - 1) + 1;
 			min_max_sz = MIN(min_max_sz - 1,
 			    cvd->vdev_max_asize - 1) + 1;
 			lashift = MAX(lashift, cvd->vdev_ashift);
 		}
 	}
 
 	/* Second pass: needs the final logical ashift from the first. */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (cvd->vdev_ops != &vdev_draid_spare_ops) {
 			pashift = vdev_best_ashift(lashift, pashift,
 			    cvd->vdev_physical_ashift);
 		}
 	}
 
 	*asizep = min_sz;
 	*max_asizep = min_max_sz;
 	*logical_ashiftp = lashift;
 	*physical_ashiftp = pashift;
 }
 
 /*
  * Child filter which selects the spare vdevs: distributed spares as well
  * as replacing and spare vdevs.
  */
 static boolean_t
 vdev_draid_open_spares(vdev_t *vd)
 {
 	boolean_t is_dspare = (vd->vdev_ops == &vdev_draid_spare_ops);
 	boolean_t is_mirror = (vd->vdev_ops == &vdev_replacing_ops ||
 	    vd->vdev_ops == &vdev_spare_ops);
 
 	return (is_dspare || is_mirror);
 }
 
 /*
  * Child filter which selects every vdev that vdev_draid_open_spares()
  * does not, i.e. all children excluding spares.
  */
 static boolean_t
 vdev_draid_open_children(vdev_t *vd)
 {
 	boolean_t is_spare = vdev_draid_open_spares(vd);
 
 	return (!is_spare);
 }
 
 /*
  * Open a top-level dRAID vdev.
  *
  * vd              - the top-level dRAID vdev
  * asize           - out: allocatable size
  * max_asize       - out: allocatable size based on the children's
  *                   vdev_max_asize
  * logical_ashift  - out: largest vdev_ashift of the non-spare children
  * physical_ashift - out: physical ashift chosen by vdev_best_ashift()
  *
  * Returns 0 on success, EINVAL when the parity configuration is invalid,
  * or ENXIO when more than nparity children failed to open or the children
  * are smaller than VDEV_DRAID_REFLOW_RESERVE.
  */
 static int
 vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	vdev_draid_config_t *vdc =  vd->vdev_tsd;
 	uint64_t nparity = vdc->vdc_nparity;
 	int open_errors = 0;
 
 	/* There must be at least one child more than the parity level. */
 	if (nparity > VDEV_DRAID_MAXPARITY ||
 	    vd->vdev_children < nparity + 1) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * First open the normal children then the distributed spares.  This
 	 * ordering is important to ensure the distributed spares calculate
 	 * the correct psize in the event that the dRAID vdevs were expanded.
 	 */
 	vdev_open_children_subset(vd, vdev_draid_open_children);
 	vdev_open_children_subset(vd, vdev_draid_open_spares);
 
 	/* Verify enough of the children are available to continue. */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		if (vd->vdev_child[c]->vdev_open_error != 0) {
 			if ((++open_errors) > nparity) {
 				vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
 				return (SET_ERROR(ENXIO));
 			}
 		}
 	}
 
 	/*
 	 * Allocatable capacity is the sum of the space on all children less
 	 * the number of distributed spares rounded down to last full row
 	 * and then to the last full group. An additional 32MB of scratch
 	 * space is reserved at the end of each child for use by the dRAID
 	 * expansion feature.
 	 */
 	uint64_t child_asize, child_max_asize;
 	vdev_draid_calculate_asize(vd, &child_asize, &child_max_asize,
 	    logical_ashift, physical_ashift);
 
 	/*
 	 * Should be unreachable since the minimum child size is 64MB, but
 	 * we want to make sure an underflow absolutely cannot occur here.
 	 */
 	if (child_asize < VDEV_DRAID_REFLOW_RESERVE ||
 	    child_max_asize < VDEV_DRAID_REFLOW_RESERVE) {
 		return (SET_ERROR(ENXIO));
 	}
 
 	/* Drop the reserve, then round down to a whole number of rows. */
 	child_asize = ((child_asize - VDEV_DRAID_REFLOW_RESERVE) /
 	    VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT;
 	child_max_asize = ((child_max_asize - VDEV_DRAID_REFLOW_RESERVE) /
 	    VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT;
 
 	/* Scale by ndisks, then round down to a whole number of groups. */
 	*asize = (((child_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) *
 	    vdc->vdc_groupsz);
 	*max_asize = (((child_max_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) *
 	    vdc->vdc_groupsz);
 
 	return (0);
 }
 
 /*
  * Close a top-level dRAID vdev by closing each child which is present.
  */
 static void
 vdev_draid_close(vdev_t *vd)
 {
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (cvd == NULL)
 			continue;
 
 		vdev_close(cvd);
 	}
 }
 
 /*
  * Return the maximum asize for a rebuild zio in the provided range
  * given the following constraints.  A dRAID chunk may not:
  *
  * - Exceed the maximum allowed block size (SPA_MAXBLOCKSIZE), or
  * - Span dRAID redundancy groups.
  *
  * vd          - the dRAID vdev
  * start       - group width aligned logical offset where the chunk starts
  * asize       - size of the range, a multiple of the group width in bytes
  * max_segment - scaled by the number of data columns to bound the psize
  */
 static uint64_t
 vdev_draid_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
     uint64_t max_segment)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	uint64_t ashift = vd->vdev_ashift;
 	uint64_t ndata = vdc->vdc_ndata;
 	/* Sector aligned and capped at the maximum block size. */
 	uint64_t psize = MIN(P2ROUNDUP(max_segment * ndata, 1 << ashift),
 	    SPA_MAXBLOCKSIZE);
 
 	ASSERT3U(vdev_draid_get_astart(vd, start), ==, start);
 	ASSERT3U(asize % (vdc->vdc_groupwidth << ashift), ==, 0);
 
 	/* Chunks must evenly span all data columns in the group. */
 	psize = (((psize >> ashift) / ndata) * ndata) << ashift;
 	uint64_t chunk_size = MIN(asize, vdev_psize_to_asize(vd, psize));
 
 	/* Reduce the chunk size to the group space remaining. */
 	uint64_t group = vdev_draid_offset_to_group(vd, start);
 	uint64_t left = vdev_draid_group_to_offset(vd, group + 1) - start;
 	chunk_size = MIN(chunk_size, left);
 
 	/* The chunk is whole group widths and stays within one group. */
 	ASSERT3U(chunk_size % (vdc->vdc_groupwidth << ashift), ==, 0);
 	ASSERT3U(vdev_draid_offset_to_group(vd, start), ==,
 	    vdev_draid_offset_to_group(vd, start + chunk_size - 1));
 
 	return (chunk_size);
 }
 
 /*
  * Align the start of the metaslab to the group width and slightly reduce
  * its size to a multiple of the group width.  Since full stripe writes are
  * required by dRAID this space is unallocable.  Furthermore, aligning the
  * metaslab start is important for vdev initialize and TRIM which both operate
  * on metaslab boundaries which vdev_xlate() expects to be aligned.
  *
  * Both *ms_start and *ms_size are updated in place.
  */
 static void
 vdev_draid_metaslab_init(vdev_t *vd, uint64_t *ms_start, uint64_t *ms_size)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	uint64_t sz = vdc->vdc_groupwidth << vd->vdev_ashift;
 
 	/* Space given up at the front in order to align the start. */
 	uint64_t aligned = vdev_draid_get_astart(vd, *ms_start);
 	uint64_t lost = aligned - *ms_start;
 	uint64_t usable = *ms_size - lost;
 
 	*ms_start = aligned;
 	*ms_size = (usable / sz) * sz;
 
 	ASSERT0(*ms_start % sz);
 	ASSERT0(*ms_size % sz);
 }
 
 /*
  * Build the config nvlist describing one distributed spare which belongs
  * to the dRAID vdev 'cvd'.  The caller owns the returned nvlist.
  */
 static nvlist_t *
 vdev_draid_spare_nvlist_alloc(const vdev_t *cvd, const char *path,
     uint64_t spare_id)
 {
 	nvlist_t *spare = fnvlist_alloc();
 
 	fnvlist_add_string(spare, ZPOOL_CONFIG_PATH, path);
 	fnvlist_add_string(spare, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DRAID_SPARE);
 	fnvlist_add_uint64(spare, ZPOOL_CONFIG_TOP_GUID, cvd->vdev_guid);
 	fnvlist_add_uint64(spare, ZPOOL_CONFIG_SPARE_ID, spare_id);
 	fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_LOG, 0);
 	fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_SPARE, 1);
 	fnvlist_add_uint64(spare, ZPOOL_CONFIG_WHOLE_DISK, 1);
 	fnvlist_add_uint64(spare, ZPOOL_CONFIG_ASHIFT, cvd->vdev_ashift);
 
 	return (spare);
 }
 
 /*
  * Append the virtual dRAID spares of every dRAID child of 'vd' to the
  * ZPOOL_CONFIG_SPARES array stored in 'nvroot'.  The array cannot be grown
  * in place, so a new one holding duplicates of the existing entries plus
  * the new distributed spares replaces it.
  *
  * The number of dRAID children found is stored in *ndraidp.  Spare paths
  * are named from the parity level, next_vdev_id plus the child index, and
  * the spare id.  Always returns 0.
  */
 int
 vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp,
     uint64_t next_vdev_id)
 {
 	uint64_t draid_nspares = 0;
 	uint64_t ndraid = 0;
 
 	/* Tally the dRAID children and the spares they declare. */
 	for (uint64_t c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (cvd->vdev_ops != &vdev_draid_ops)
 			continue;
 
 		vdev_draid_config_t *vdc = cvd->vdev_tsd;
 		draid_nspares += vdc->vdc_nspares;
 		ndraid++;
 	}
 
 	*ndraidp = ndraid;
 
 	/* Nothing to add, leave the existing spares untouched. */
 	if (draid_nspares == 0)
 		return (0);
 
 	/* A missing spares array is treated as an empty one. */
 	nvlist_t **old_spares, **new_spares;
 	uint_t old_nspares;
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &old_spares, &old_nspares) != 0) {
 		old_nspares = 0;
 	}
 
 	/* The new array starts with duplicates of the existing spares. */
 	new_spares = kmem_alloc(sizeof (nvlist_t *) *
 	    (draid_nspares + old_nspares), KM_SLEEP);
 
 	uint64_t n = 0;
 	while (n < old_nspares) {
 		new_spares[n] = fnvlist_dup(old_spares[n]);
 		n++;
 	}
 
 	/* Followed by the distributed spares of each dRAID child. */
 	for (uint64_t vdev_id = 0; vdev_id < vd->vdev_children; vdev_id++) {
 		vdev_t *cvd = vd->vdev_child[vdev_id];
 		char path[64];
 
 		if (cvd->vdev_ops != &vdev_draid_ops)
 			continue;
 
 		vdev_draid_config_t *vdc = cvd->vdev_tsd;
 		uint64_t nspares = vdc->vdc_nspares;
 		uint64_t nparity = vdc->vdc_nparity;
 
 		for (uint64_t spare_id = 0; spare_id < nspares; spare_id++) {
 			memset(path, 0, sizeof (path));
 			(void) snprintf(path, sizeof (path) - 1,
 			    "%s%llu-%llu-%llu", VDEV_TYPE_DRAID,
 			    (u_longlong_t)nparity,
 			    (u_longlong_t)next_vdev_id + vdev_id,
 			    (u_longlong_t)spare_id);
 
 			new_spares[n++] = vdev_draid_spare_nvlist_alloc(cvd,
 			    path, spare_id);
 		}
 	}
 
 	/* Swap the new array in for whatever was stored before. */
 	if (n > 0) {
 		(void) nvlist_remove_all(nvroot, ZPOOL_CONFIG_SPARES);
 		fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 		    (const nvlist_t **)new_spares, n);
 	}
 
 	/* Release the local entries and the temporary array itself. */
 	for (int i = 0; i < n; i++)
 		nvlist_free(new_spares[i]);
 
 	kmem_free(new_spares, sizeof (*new_spares) * n);
 
 	return (0);
 }
 
 /*
  * Determine if any portion of the provided block resides on a child vdev
  * with a dirty DTL and therefore needs to be resilvered.
  */
 static boolean_t
 vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
     uint64_t phys_birth)
 {
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t asize = vdev_draid_asize(vd, psize, 0);
 
 	/*
 	 * Sequential resilver.  Without a meaningful phys_birth the only
 	 * thing which can be determined is whether the block resides in a
 	 * degraded group, in which case it must be resilvered.
 	 */
 	if (phys_birth == TXG_UNKNOWN) {
 		ASSERT3U(vdev_draid_offset_to_group(vd, offset), ==,
 		    vdev_draid_offset_to_group(vd, offset + asize - 1));
 
 		return (vdev_draid_group_degraded(vd, offset));
 	}
 
 	/*
 	 * Healing resilver.  Blocks born in a TXG outside of DTL_PARTIAL
 	 * are intact, as are blocks in non-degraded groups.
 	 */
 	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
 		return (B_FALSE);
 
 	if (vdev_draid_group_missing(vd, offset, phys_birth, 1))
 		return (B_TRUE);
 
 	/*
 	 * The block may span two groups, if so the second one is checked too.
 	 * NOTE(review): the group test uses offset + asize - 1 while the
 	 * lookup below uses offset + asize; confirm that is intended.
 	 */
 	uint64_t first_group = vdev_draid_offset_to_group(vd, offset);
 	uint64_t last_group = vdev_draid_offset_to_group(vd,
 	    offset + asize - 1);
 
 	if (first_group != last_group &&
 	    vdev_draid_group_missing(vd, offset + asize, phys_birth, 1)) {
 		return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Returns B_TRUE when 'vd' is a leaf with a non-zero vdev_rebuild_txg, or
  * when that holds for any vdev reachable through its children.
  */
 static boolean_t
 vdev_draid_rebuilding(vdev_t *vd)
 {
 	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg)
 		return (B_TRUE);
 
 	/* Walk the children and stop at the first one which is rebuilding. */
 	int c = 0;
 	while (c < vd->vdev_children) {
 		vdev_t *child = vd->vdev_child[c++];
 
 		if (vdev_draid_rebuilding(child))
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Debug-only check (compiled in under ZFS_DEBUG) that the offset and size
  * stored in column 'col' of row 'rr' match what vdev_xlate() produces for
  * that column's child vdev when given the row's logical range.
  */
 static void
 vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
 {
 #ifdef ZFS_DEBUG
-	range_seg64_t logical_rs, physical_rs, remain_rs;
+	zfs_range_seg64_t logical_rs, physical_rs, remain_rs;
 	/* Logical range covered by the row: its offset plus allocated size. */
 	logical_rs.rs_start = rr->rr_offset;
 	logical_rs.rs_end = logical_rs.rs_start +
 	    vdev_draid_asize(vd, rr->rr_size, 0);
 
 	raidz_col_t *rc = &rr->rr_col[col];
 	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 
 	/* The whole range must translate with nothing left over. */
 	vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
 	ASSERT(vdev_xlate_is_empty(&remain_rs));
 	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
 	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
 	ASSERT3U(rc->rc_offset + rc->rc_size, ==, physical_rs.rs_end);
 #endif
 }
 
 /*
  * For write operations:
  * 1. Generate the parity data
  * 2. Create child zio write operations to each column's vdev, for both
  *    data and parity.  A gang ABD is allocated by vdev_draid_map_alloc()
  *    if a skip sector needs to be added to a column.
  */
 static void
 vdev_draid_io_start_write(zio_t *zio, raidz_row_t *rr)
 {
 	raidz_map_t *rm = zio->io_vsd;
 	vdev_t *vd = zio->io_vd;
 
 	/* Parity is generated before any column is written. */
 	vdev_raidz_generate_parity_row(rm, rr);
 
 	for (int col = 0; col < rr->rr_cols; col++) {
 		raidz_col_t *rc = &rr->rr_col[col];
 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 
 		/*
 		 * Empty columns are zero filled and included in the parity
 		 * calculation, so no column may have a zero size here.
 		 */
 		ASSERT3U(rc->rc_size, !=, 0);
 
 		/* Verify physical to logical translation */
 		vdev_draid_io_verify(vd, rr, col);
 
 		/* Issue the child write for this column. */
 		zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
 		    rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority,
 		    0, vdev_raidz_child_done, rc);
 		zio_nowait(cio);
 	}
 }
 
 /*
  * For read operations:
  * 1. The vdev_draid_map_alloc() function will create a minimal raidz
  *    mapping for the read based on the zio->io_flags.  There are two
  *    possible mappings either 1) a normal read, or 2) a scrub/resilver.
  * 2. Create the zio read operations.  This will include all parity
  *    columns and skip sectors for a scrub/resilver.
  */
 static void
 vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
 {
 	vdev_t *vd = zio->io_vd;
 
 	/* Sequential rebuild must do IO at redundancy group boundary. */
 	IMPLY(zio->io_priority == ZIO_PRIORITY_REBUILD, rr->rr_nempty == 0);
 
 	/*
 	 * First pass: classify every column.  Columns which cannot be read
 	 * or hold stale data are flagged with an error and counted as
 	 * missing data or missing parity; no IO is issued in this pass.
 	 *
 	 * Iterate over the columns in reverse order so that we hit the parity
 	 * last.  Any errors along the way will force us to read the parity.
 	 * For scrub/resilver IOs which verify skip sectors, a gang ABD will
 	 * have been allocated to store them and rc->rc_size is increased.
 	 */
 	for (int c = rr->rr_cols - 1; c >= 0; c--) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 
 		/* Unreadable child: mark the column as tried and skipped. */
 		if (!vdev_draid_readable(cvd, rc->rc_offset)) {
 			if (c >= rr->rr_firstdatacol)
 				rr->rr_missingdata++;
 			else
 				rr->rr_missingparity++;
 			rc->rc_error = SET_ERROR(ENXIO);
 			rc->rc_tried = 1;
 			rc->rc_skipped = 1;
 			continue;
 		}
 
 		/* Child is missing data for this txg: treat it as stale. */
 		if (vdev_draid_missing(cvd, rc->rc_offset, zio->io_txg, 1)) {
 			if (c >= rr->rr_firstdatacol)
 				rr->rr_missingdata++;
 			else
 				rr->rr_missingparity++;
 			rc->rc_error = SET_ERROR(ESTALE);
 			rc->rc_skipped = 1;
 			continue;
 		}
 
 		/*
 		 * Empty columns may be read during vdev_draid_io_done().
 		 * Only skip them after the readable and missing checks
 		 * verify they are available.
 		 */
 		if (rc->rc_size == 0) {
 			rc->rc_skipped = 1;
 			continue;
 		}
 
 		if (zio->io_flags & ZIO_FLAG_RESILVER) {
 			vdev_t *svd;
 
 			/*
 			 * Sequential rebuilds need to always consider the data
 			 * on the child being rebuilt to be stale.  This is
 			 * important when all columns are available to aid
 			 * known reconstruction in identifying which columns
 			 * contain incorrect data.
 			 *
 			 * Furthermore, all repairs need to be constrained to
 			 * the devices being rebuilt because without a checksum
 			 * we cannot verify the data is actually correct and
 			 * performing an incorrect repair could result in
 			 * locking in damage and making the data unrecoverable.
 			 */
 			if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
 				if (vdev_draid_rebuilding(cvd)) {
 					if (c >= rr->rr_firstdatacol)
 						rr->rr_missingdata++;
 					else
 						rr->rr_missingparity++;
 					rc->rc_error = SET_ERROR(ESTALE);
 					rc->rc_skipped = 1;
 					rc->rc_allow_repair = 1;
 					continue;
 				} else {
 					rc->rc_allow_repair = 0;
 				}
 			} else {
 				rc->rc_allow_repair = 1;
 			}
 
 			/*
 			 * If this child is a distributed spare then the
 			 * offset might reside on the vdev being replaced.
 			 * In which case this data must be written to the
 			 * new device.  Failure to do so would result in
 			 * checksum errors when the old device is detached
 			 * and the pool is scrubbed.
 			 */
 			if ((svd = vdev_draid_find_spare(cvd)) != NULL) {
 				svd = vdev_draid_spare_get_child(svd,
 				    rc->rc_offset);
 				if (svd && (svd->vdev_ops == &vdev_spare_ops ||
 				    svd->vdev_ops == &vdev_replacing_ops)) {
 					rc->rc_force_repair = 1;
 
 					if (vdev_draid_rebuilding(svd))
 						rc->rc_allow_repair = 1;
 				}
 			}
 
 			/*
 			 * Always issue a repair IO to this child when its
 			 * a spare or replacing vdev with an active rebuild.
 			 */
 			if ((cvd->vdev_ops == &vdev_spare_ops ||
 			    cvd->vdev_ops == &vdev_replacing_ops) &&
 			    vdev_draid_rebuilding(cvd)) {
 				rc->rc_force_repair = 1;
 				rc->rc_allow_repair = 1;
 			}
 		}
 	}
 
 	/*
 	 * Either a parity or data column is missing this means a repair
 	 * may be attempted by vdev_draid_io_done().  Expand the raid map
 	 * to read in empty columns which are needed along with the parity
 	 * during reconstruction.
 	 */
 	if ((rr->rr_missingdata > 0 || rr->rr_missingparity > 0) &&
 	    rr->rr_nempty > 0 && rr->rr_abd_empty == NULL) {
 		vdev_draid_map_alloc_empty(zio, rr);
 	}
 
 	/*
 	 * Second pass: issue the child reads.  Columns flagged with an error
 	 * above, or which have no data, are not read.  Data columns are
 	 * always read; parity columns are only read when data is missing or
 	 * for scrub/resilver IOs.
 	 */
 	for (int c = rr->rr_cols - 1; c >= 0; c--) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 
 		if (rc->rc_error || rc->rc_size == 0)
 			continue;
 
 		if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, rc));
 		}
 	}
 }
 
 /*
  * Start an IO operation to a dRAID vdev.
  */
 static void
 vdev_draid_io_start(zio_t *zio)
 {
 	vdev_t *vd __maybe_unused = zio->io_vd;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 	ASSERT3U(zio->io_offset, ==, vdev_draid_get_astart(vd, zio->io_offset));
 
 	/* Build the raidz map and hang it off of the zio. */
 	raidz_map_t *rm = vdev_draid_map_alloc(zio);
 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
 	zio->io_vsd = rm;
 
 	/* Each row of the map is started individually; only reads/writes. */
 	if (zio->io_type != ZIO_TYPE_WRITE) {
 		ASSERT(zio->io_type == ZIO_TYPE_READ);
 
 		for (int row = 0; row < rm->rm_nrows; row++)
 			vdev_draid_io_start_read(zio, rm->rm_row[row]);
 	} else {
 		for (int row = 0; row < rm->rm_nrows; row++)
 			vdev_draid_io_start_write(zio, rm->rm_row[row]);
 	}
 
 	zio_execute(zio);
 }
 
 /*
  * Complete an IO operation on a dRAID vdev.  The raidz logic can be applied
  * to dRAID since the layout is fully described by the raidz_map_t.
  */
 static void
 vdev_draid_io_done(zio_t *zio)
 {
 	/* No dRAID specific work is done here, the raidz handler does it all. */
 	vdev_raidz_io_done(zio);
 }
 
 /*
  * Set the state of the dRAID vdev from the number of faulted and degraded
  * children.  More faulted children than parity makes the vdev unopenable,
  * any faulted or degraded child makes it degraded, otherwise it is healthy.
  */
 static void
 vdev_draid_state_change(vdev_t *vd, int faulted, int degraded)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	ASSERT(vd->vdev_ops == &vdev_draid_ops);
 
 	if (faulted > vdc->vdc_nparity) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_NO_REPLICAS);
 		return;
 	}
 
 	if (degraded + faulted != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
 		return;
 	}
 
 	vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
 }
 
 /*
  * Translate a logical range on the parent dRAID vdev into the physical
  * range it occupies on the child 'cvd'.
  *
  * physical_rs receives the child's range, which is empty (start == end)
  * when the child is not part of the group or the start is unaligned.
  * remain_rs receives the part of logical_rs which was not translated by
  * this call, since at most one group is handled at a time.
  */
 static void
-vdev_draid_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
-    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
+vdev_draid_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs,
+    zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs)
 {
 	vdev_t *raidvd = cvd->vdev_parent;
 	ASSERT(raidvd->vdev_ops == &vdev_draid_ops);
 
 	vdev_draid_config_t *vdc = raidvd->vdev_tsd;
 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
 
 	/* Make sure the offsets are block-aligned */
 	ASSERT0(logical_rs->rs_start % (1 << ashift));
 	ASSERT0(logical_rs->rs_end % (1 << ashift));
 
 	uint64_t logical_start = logical_rs->rs_start;
 	uint64_t logical_end = logical_rs->rs_end;
 
 	/*
 	 * Unaligned ranges must be skipped. All metaslabs are correctly
 	 * aligned so this should not happen, but this case is handled in
 	 * case it's needed by future callers.
 	 */
 	uint64_t astart = vdev_draid_get_astart(raidvd, logical_start);
 	if (astart != logical_start) {
 		/* Empty physical range; resume from the aligned start. */
 		physical_rs->rs_start = logical_start;
 		physical_rs->rs_end = logical_start;
 		remain_rs->rs_start = MIN(astart, logical_end);
 		remain_rs->rs_end = logical_end;
 		return;
 	}
 
 	/*
 	 * Unlike with mirrors and raidz a dRAID logical range can map
 	 * to multiple non-contiguous physical ranges. This is handled by
 	 * limiting the size of the logical range to a single group and
 	 * setting the remain argument such that it describes the remaining
 	 * unmapped logical range. This is stricter than absolutely
 	 * necessary but helps simplify the logic below.
 	 */
 	uint64_t group = vdev_draid_offset_to_group(raidvd, logical_start);
 	uint64_t nextstart = vdev_draid_group_to_offset(raidvd, group + 1);
 	if (logical_end > nextstart)
 		logical_end = nextstart;
 
 	/* Find the starting offset for each vdev in the group */
 	uint64_t perm, groupstart;
 	uint64_t start = vdev_draid_logical_to_physical(raidvd,
 	    logical_start, &perm, &groupstart);
 	uint64_t end = start;
 
 	uint8_t *base;
 	uint64_t iter, id;
 	vdev_draid_get_perm(vdc, perm, &base, &iter);
 
 	/*
 	 * Check if the passed child falls within the group.  If it does
 	 * update the start and end to reflect the physical range.
 	 * Otherwise, leave them unmodified which will result in an empty
 	 * (zero-length) physical range being returned.
 	 */
 	for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
 		uint64_t c = (groupstart + i) % vdc->vdc_ndisks;
 
 		if (c == 0 && i != 0) {
 			/* the group wrapped, increment the start */
 			start += VDEV_DRAID_ROWHEIGHT;
 			end = start;
 		}
 
 		id = vdev_draid_permute_id(vdc, base, iter, c);
 		if (id == cvd->vdev_id) {
 			/* Logical length in blocks of 2^ashift bytes. */
 			uint64_t b_size = (logical_end >> ashift) -
 			    (logical_start >> ashift);
 			ASSERT3U(b_size, >, 0);
 			/* This child's share: ceil(b_size / groupwidth) blocks. */
 			end = start + ((((b_size - 1) /
 			    vdc->vdc_groupwidth) + 1) << ashift);
 			break;
 		}
 	}
 	physical_rs->rs_start = start;
 	physical_rs->rs_end = end;
 
 	/*
 	 * Only top-level vdevs are allowed to set remain_rs because
 	 * when .vdev_op_xlate() is called for their children the full
 	 * logical range is not provided by vdev_xlate().
 	 */
 	remain_rs->rs_start = logical_end;
 	remain_rs->rs_end = logical_rs->rs_end;
 
 	ASSERT3U(physical_rs->rs_start, <=, logical_start);
 	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
 	    logical_end - logical_start);
 }
 
 /*
  * Add dRAID specific fields to the config nvlist.
  */
 static void
 vdev_draid_config_generate(vdev_t *vd, nvlist_t *nv)
 {
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 
 	/* The dRAID geometry, listed in the order it is added to the nvlist. */
 	const struct {
 		const char	*key;
 		uint64_t	value;
 	} fields[] = {
 		{ ZPOOL_CONFIG_NPARITY,		vdc->vdc_nparity },
 		{ ZPOOL_CONFIG_DRAID_NDATA,	vdc->vdc_ndata },
 		{ ZPOOL_CONFIG_DRAID_NSPARES,	vdc->vdc_nspares },
 		{ ZPOOL_CONFIG_DRAID_NGROUPS,	vdc->vdc_ngroups },
 	};
 
 	for (size_t i = 0; i < sizeof (fields) / sizeof (fields[0]); i++)
 		fnvlist_add_uint64(nv, fields[i].key, fields[i].value);
 }
 
 /*
  * Initialize private dRAID specific fields from the nvlist.
  */
 /*
  * Parse and validate the dRAID geometry stored in the vdev config 'nv' and
  * allocate the private vdev_draid_config_t which describes it.
  *
  * On success 0 is returned and *tsd points to the new configuration, which
  * is released by vdev_draid_fini().  EINVAL is returned, and *tsd is left
  * untouched, when a required key is missing, a value is out of range, or
  * the permutation map cannot be found or generated.
  */
 static int
 vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd)
 {
 	(void) spa;
 	uint64_t ndata, nparity, nspares, ngroups;
 	int error;
 
 	/*
 	 * A group without data columns is never valid, and code elsewhere
 	 * in this file divides by vdc_ndata.
 	 */
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, &ndata) ||
 	    ndata == 0) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) ||
 	    nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	uint_t children;
 	nvlist_t **child;
 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 	    &child, &children) != 0 || children == 0 ||
 	    children > VDEV_DRAID_MAX_CHILDREN) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Bound ndata by the number of children before it is used in any
 	 * arithmetic.  Checking ndata on its own first guarantees that
 	 * ndata + nparity cannot wrap, which in turn keeps the subtraction
 	 * in the nspares check below from underflowing.
 	 */
 	if (ndata > children || (ndata + nparity) > children)
 		return (SET_ERROR(EINVAL));
 
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, &nspares) ||
 	    nspares > 100 || nspares > (children - (ndata + nparity))) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, &ngroups) ||
 	    ngroups == 0 || ngroups > VDEV_DRAID_MAX_CHILDREN) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Validate the minimum number of children exist per group for the
 	 * specified parity level (draid1 >= 2, draid2 >= 3, draid3 >= 4).
 	 */
 	if (children < (ndata + nparity + nspares))
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Create the dRAID configuration using the pool nvlist configuration
 	 * and the fixed mapping for the correct number of children.
 	 */
 	vdev_draid_config_t *vdc;
 	const draid_map_t *map;
 
 	error = vdev_draid_lookup_map(children, &map);
 	if (error)
 		return (SET_ERROR(EINVAL));
 
 	vdc = kmem_zalloc(sizeof (*vdc), KM_SLEEP);
 	vdc->vdc_ndata = ndata;
 	vdc->vdc_nparity = nparity;
 	vdc->vdc_nspares = nspares;
 	vdc->vdc_children = children;
 	vdc->vdc_ngroups = ngroups;
 	vdc->vdc_nperms = map->dm_nperms;
 
 	/* Nothing else is owned by vdc yet, so only vdc needs to be freed. */
 	error = vdev_draid_generate_perms(map, &vdc->vdc_perms);
 	if (error) {
 		kmem_free(vdc, sizeof (*vdc));
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Derived constants.
 	 */
 	vdc->vdc_groupwidth = vdc->vdc_ndata + vdc->vdc_nparity;
 	vdc->vdc_ndisks = vdc->vdc_children - vdc->vdc_nspares;
 	vdc->vdc_groupsz = vdc->vdc_groupwidth * VDEV_DRAID_ROWHEIGHT;
 	vdc->vdc_devslicesz = (vdc->vdc_groupsz * vdc->vdc_ngroups) /
 	    vdc->vdc_ndisks;
 
 	ASSERT3U(vdc->vdc_groupwidth, >=, 2);
 	ASSERT3U(vdc->vdc_groupwidth, <=, vdc->vdc_ndisks);
 	ASSERT3U(vdc->vdc_groupsz, >=, 2 * VDEV_DRAID_ROWHEIGHT);
 	ASSERT3U(vdc->vdc_devslicesz, >=, VDEV_DRAID_ROWHEIGHT);
 	ASSERT3U(vdc->vdc_devslicesz % VDEV_DRAID_ROWHEIGHT, ==, 0);
 	ASSERT3U((vdc->vdc_groupwidth * vdc->vdc_ngroups) %
 	    vdc->vdc_ndisks, ==, 0);
 
 	*tsd = vdc;
 
 	return (0);
 }
 
 /*
  * Release the permutation array and the private dRAID configuration.
  */
 static void
 vdev_draid_fini(vdev_t *vd)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 
 	/* One uint8_t per child for each of the permutations. */
 	size_t perms_size = sizeof (uint8_t) *
 	    vdc->vdc_children * vdc->vdc_nperms;
 
 	vmem_free(vdc->vdc_perms, perms_size);
 	kmem_free(vdc, sizeof (*vdc));
 }
 
 /* Returns the parity level stored in the dRAID configuration. */
 static uint64_t
 vdev_draid_nparity(vdev_t *vd)
 {
 	return (((vdev_draid_config_t *)vd->vdev_tsd)->vdc_nparity);
 }
 
 /* Returns vdc_ndisks, which is the child count less the spares. */
 static uint64_t
 vdev_draid_ndisks(vdev_t *vd)
 {
 	return (((vdev_draid_config_t *)vd->vdev_tsd)->vdc_ndisks);
 }
 
 /*
  * Operations vector for the dRAID vdev type.  It is registered as a
  * non-leaf vdev; the hold, rele and remap callbacks are not provided.
  */
 vdev_ops_t vdev_draid_ops = {
 	.vdev_op_init = vdev_draid_init,
 	.vdev_op_fini = vdev_draid_fini,
 	.vdev_op_open = vdev_draid_open,
 	.vdev_op_close = vdev_draid_close,
 	.vdev_op_asize = vdev_draid_asize,
 	.vdev_op_min_asize = vdev_draid_min_asize,
 	.vdev_op_min_alloc = vdev_draid_min_alloc,
 	.vdev_op_io_start = vdev_draid_io_start,
 	.vdev_op_io_done = vdev_draid_io_done,
 	.vdev_op_state_change = vdev_draid_state_change,
 	.vdev_op_need_resilver = vdev_draid_need_resilver,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_draid_xlate,
 	.vdev_op_rebuild_asize = vdev_draid_rebuild_asize,
 	.vdev_op_metaslab_init = vdev_draid_metaslab_init,
 	.vdev_op_config_generate = vdev_draid_config_generate,
 	.vdev_op_nparity = vdev_draid_nparity,
 	.vdev_op_ndisks = vdev_draid_ndisks,
 	.vdev_op_type = VDEV_TYPE_DRAID,
 	.vdev_op_leaf = B_FALSE,
 };
 
 
 /*
  * A dRAID distributed spare is a virtual leaf vdev which is included in the
  * parent dRAID configuration.  The last N columns of the dRAID permutation
  * table are used to determine on which dRAID children a specific offset
  * should be written.  These spare leaf vdevs can only be used to replace
  * faulted children in the same dRAID configuration.
  */
 
 /*
  * Distributed spare state.  The top-level guid and spare id are assigned
  * by vdev_draid_spare_init().  The parent dRAID vdev pointer is set by
  * vdev_draid_spare_open() and reset to NULL by vdev_draid_spare_close(),
  * so it is only valid while the distributed spare is open.
  */
 typedef struct {
 	vdev_t *vds_draid_vdev;		/* top-level parent dRAID vdev */
 	uint64_t vds_top_guid;		/* top-level parent dRAID guid */
 	uint64_t vds_spare_id;		/* spare id (0 - vdc->vdc_nspares-1) */
 } vdev_draid_spare_t;
 
 /*
  * Returns the parent dRAID vdev to which the distributed spare belongs.
  * This may be safely called even when the vdev is not open.
  */
 vdev_t *
 vdev_draid_spare_get_parent(vdev_t *vd)
 {
 	vdev_draid_spare_t *vds = vd->vdev_tsd;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
 
 	/* Use the pointer cached at open time when there is one. */
 	vdev_t *tvd = vds->vds_draid_vdev;
 
 	/* Otherwise look the parent up by guid starting at the root vdev. */
 	if (tvd == NULL) {
 		tvd = vdev_lookup_by_guid(vd->vdev_spa->spa_root_vdev,
 		    vds->vds_top_guid);
 	}
 
 	return (tvd);
 }
 
 /*
  * A dRAID spare is active when it's the child of a vdev using the
  * vdev_spare_ops, vdev_replacing_ops or vdev_draid_ops.
  */
 static boolean_t
 vdev_draid_spare_is_active(vdev_t *vd)
 {
 	const vdev_t *parent = vd->vdev_parent;
 
 	/* Without a parent the spare cannot be in use. */
 	if (parent == NULL)
 		return (B_FALSE);
 
 	if (parent->vdev_ops == &vdev_spare_ops ||
 	    parent->vdev_ops == &vdev_replacing_ops ||
 	    parent->vdev_ops == &vdev_draid_ops) {
 		return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Given a dRAID distribute spare vdev, returns the physical child vdev
  * on which the provided offset resides.  This may involve recursing through
  * multiple layers of distributed spares.  Note that offset is relative to
  * this vdev.
  */
 vdev_t *
 vdev_draid_spare_get_child(vdev_t *vd, uint64_t physical_offset)
 {
 	/*
 	 * Each pass resolves one layer of distributed spare.  The loop ends
 	 * when a closed spare is found or when the resolved child is no
 	 * longer a distributed spare itself.
 	 */
 	for (;;) {
 		vdev_draid_spare_t *vds = vd->vdev_tsd;
 
 		ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
 
 		/* The vdev is closed */
 		if (vds->vds_draid_vdev == NULL)
 			return (NULL);
 
 		vdev_t *tvd = vds->vds_draid_vdev;
 		vdev_draid_config_t *vdc = tvd->vdev_tsd;
 
 		ASSERT3P(tvd->vdev_ops, ==, &vdev_draid_ops);
 		ASSERT3U(vds->vds_spare_id, <, vdc->vdc_nspares);
 
 		/* The permutation is selected by the device slice index. */
 		uint8_t *base;
 		uint64_t iter;
 		vdev_draid_get_perm(vdc,
 		    physical_offset / vdc->vdc_devslicesz, &base, &iter);
 
 		/* Spares are indexed back from the last child of the dRAID. */
 		uint64_t cid = vdev_draid_permute_id(vdc, base, iter,
 		    (tvd->vdev_children - 1) - vds->vds_spare_id);
 		vdev_t *cvd = tvd->vdev_child[cid];
 
 		if (cvd->vdev_ops != &vdev_draid_spare_ops)
 			return (cvd);
 
 		/* Landed on another distributed spare, resolve that next. */
 		vd = cvd;
 	}
 }
 
 /*
  * Closing a distributed spare only forgets the parent dRAID vdev pointer.
  */
 static void
 vdev_draid_spare_close(vdev_t *vd)
 {
 	((vdev_draid_spare_t *)vd->vdev_tsd)->vds_draid_vdev = NULL;
 }
 
 /*
  * Opening a dRAID spare device is done by looking up the associated dRAID
  * top-level vdev guid from the spare configuration.
  */
 static int
 vdev_draid_spare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	vdev_draid_spare_t *vds = vd->vdev_tsd;
 	vdev_t *tvd = vdev_lookup_by_guid(vd->vdev_spa->spa_root_vdev,
 	    vds->vds_top_guid);
 
 	if (tvd == NULL) {
 		/* A spare with a parent must be able to find its dRAID. */
 		if (vd->vdev_parent != NULL)
 			return (SET_ERROR(EINVAL));
 
 		/*
 		 * When spa_vdev_add() is labeling new spares the
 		 * associated dRAID is not attached to the root vdev
 		 * nor does this spare have a parent.  Simulate a valid
 		 * device in order to allow the label to be initialized
 		 * and the distributed spare added to the configuration.
 		 */
 		*psize = *max_psize = SPA_MINDEVSIZE;
 		*logical_ashift = *physical_ashift = ASHIFT_MIN;
 		return (0);
 	}
 
 	/* The guid must name a configured dRAID which has this spare id. */
 	vdev_draid_config_t *vdc = tvd->vdev_tsd;
 	if (tvd->vdev_ops != &vdev_draid_ops || vdc == NULL ||
 	    vds->vds_spare_id >= vdc->vdc_nspares) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Neither tvd->vdev_asize or tvd->vdev_max_asize can be used here
 	 * because the caller may be vdev_draid_open() in which case the
 	 * values are stale as they haven't yet been updated by vdev_open().
 	 * To avoid this always recalculate the dRAID asize and max_asize.
 	 */
 	uint64_t asize, max_asize;
 	vdev_draid_calculate_asize(tvd, &asize, &max_asize,
 	    logical_ashift, physical_ashift);
 
 	/* The reported sizes include room for the front and back labels. */
 	*psize = asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
 	*max_psize = max_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
 
 	vds->vds_draid_vdev = tvd;
 
 	return (0);
 }
 
 /*
  * Completed distributed spare IO.  Store the result in the parent zio
  * as if it had performed the operation itself.  Only the first error is
  * preserved if there are multiple errors.
  */
 static void
 vdev_draid_spare_child_done(zio_t *zio)
 {
 	zio_t *pio = zio->io_private;
 
 	/*
 	 * IOs are issued to non-writable vdevs in order to keep their
 	 * DTLs accurate.  However, we don't want to propagate the
 	 * error in to the distributed spare's DTL.  When resilvering
 	 * vdev_draid_need_resilver() will consult the relevant DTL
 	 * to determine if the data is missing and must be repaired.
 	 *
 	 * For writable vdevs the result is copied to the parent unless
 	 * the parent already holds an error.
 	 */
 	if (vdev_writeable(zio->io_vd) && pio->io_error == 0)
 		pio->io_error = zio->io_error;
 }
 
 /*
  * Returns a valid label nvlist for the distributed spare vdev.  This is
  * used to bypass the IO pipeline to avoid the complexity of constructing
  * a complete label with valid checksum to return when read.
  */
 nvlist_t *
 vdev_draid_read_config_spare(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	spa_aux_vdev_t *sav = &spa->spa_spares;
 
 	/* An in-use spare reports the active pool state. */
 	uint64_t pool_state = POOL_STATE_SPARE;
 	if (vdev_draid_spare_is_active(vd))
 		pool_state = POOL_STATE_ACTIVE;
 
 	nvlist_t *nv = fnvlist_alloc();
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_VERSION, spa_version(spa));
 	fnvlist_add_string(nv, ZPOOL_CONFIG_POOL_NAME, spa_name(spa));
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_STATE, pool_state);
 
 	/*
 	 * Prefer the guid of the distributed spare with the same path in
 	 * the pool's spare list, fall back to this vdev's own guid.
 	 */
 	uint64_t guid = vd->vdev_guid;
 	for (int i = 0; i < sav->sav_count; i++) {
 		vdev_t *svd = sav->sav_vdevs[i];
 
 		if (svd->vdev_ops != &vdev_draid_spare_ops)
 			continue;
 
 		if (strcmp(svd->vdev_path, vd->vdev_path) != 0)
 			continue;
 
 		guid = svd->vdev_guid;
 		break;
 	}
 
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, guid);
 
 	return (nv);
 }
 
 /*
  * Handle any flush requested of the distributed spare. All children must be
  * flushed.
  */
 static int
 vdev_draid_spare_flush(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	int c = 0;
 
 	/* Forward the request unchanged to every child of this vdev. */
 	while (c < vd->vdev_children) {
 		zio_t *cio = zio_vdev_child_io(zio, NULL,
 		    vd->vdev_child[c], zio->io_offset, zio->io_abd,
 		    zio->io_size, zio->io_type, zio->io_priority, 0,
 		    vdev_draid_spare_child_done, zio);
 
 		zio_nowait(cio);
 		c++;
 	}
 
 	/* Child errors are reported through vdev_draid_spare_child_done(). */
 	return (0);
 }
 
 /*
  * Initiate an IO to the distributed spare.  For normal IOs this entails using
  * the zio->io_offset and permutation table to calculate which child dRAID vdev
  * is responsible for the data.  Then passing along the zio to that child to
  * perform the actual IO.  The label ranges are not stored on disk and require
  * some special handling which is described below.
  */
 static void
 vdev_draid_spare_io_start(zio_t *zio)
 {
 	vdev_t *cvd = NULL, *vd = zio->io_vd;
 	vdev_draid_spare_t *vds = vd->vdev_tsd;
 	/* Offset passed to the child, with the leading label size removed. */
 	uint64_t offset = zio->io_offset - VDEV_LABEL_START_SIZE;
 
 	/*
 	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
 	 * Nothing to be done here but return failure.
 	 */
 	if (vds == NULL) {
 		zio->io_error = ENXIO;
 		zio_interrupt(zio);
 		return;
 	}
 
 	switch (zio->io_type) {
 	case ZIO_TYPE_FLUSH:
 		zio->io_error = vdev_draid_spare_flush(zio);
 		break;
 
 	case ZIO_TYPE_WRITE:
 		if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) {
 			/*
 			 * Accept probe IOs and config writers to simulate the
 			 * existence of an on disk label.  vdev_label_sync(),
 			 * vdev_uberblock_sync() and vdev_copy_uberblocks()
 			 * skip the distributed spares.  This only leaves
 			 * vdev_label_init() which is allowed to succeed to
 			 * avoid adding special cases to the function.
 			 */
 			if (zio->io_flags & ZIO_FLAG_PROBE ||
 			    zio->io_flags & ZIO_FLAG_CONFIG_WRITER) {
 				zio->io_error = 0;
 			} else {
 				zio->io_error = SET_ERROR(EIO);
 			}
 		} else {
 			/* Data write: hand it to the child backing this offset. */
 			cvd = vdev_draid_spare_get_child(vd, offset);
 
 			if (cvd == NULL) {
 				zio->io_error = SET_ERROR(ENXIO);
 			} else {
 				zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 				    offset, zio->io_abd, zio->io_size,
 				    zio->io_type, zio->io_priority, 0,
 				    vdev_draid_spare_child_done, zio));
 			}
 		}
 		break;
 
 	case ZIO_TYPE_READ:
 		if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) {
 			/*
 			 * Accept probe IOs to simulate the existence of a
 			 * label.  vdev_label_read_config() bypasses the
 			 * pipeline to read the label configuration and
 			 * vdev_uberblock_load() skips distributed spares
 			 * when attempting to locate the best uberblock.
 			 */
 			if (zio->io_flags & ZIO_FLAG_PROBE) {
 				zio->io_error = 0;
 			} else {
 				zio->io_error = SET_ERROR(EIO);
 			}
 		} else {
 			/* Data read: the backing child must also be readable. */
 			cvd = vdev_draid_spare_get_child(vd, offset);
 
 			if (cvd == NULL || !vdev_readable(cvd)) {
 				zio->io_error = SET_ERROR(ENXIO);
 			} else {
 				zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 				    offset, zio->io_abd, zio->io_size,
 				    zio->io_type, zio->io_priority, 0,
 				    vdev_draid_spare_child_done, zio));
 			}
 		}
 		break;
 
 	case ZIO_TYPE_TRIM:
 		/* The vdev label ranges are never trimmed */
 		ASSERT0(VDEV_OFFSET_IS_LABEL(vd, zio->io_offset));
 
 		cvd = vdev_draid_spare_get_child(vd, offset);
 
 		/* Fail with ENXIO when the backing child has no TRIM support. */
 		if (cvd == NULL || !cvd->vdev_has_trim) {
 			zio->io_error = SET_ERROR(ENXIO);
 		} else {
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    offset, zio->io_abd, zio->io_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_draid_spare_child_done, zio));
 		}
 		break;
 
 	default:
 		/* Any other IO type is not supported by a distributed spare. */
 		zio->io_error = SET_ERROR(ENOTSUP);
 		break;
 	}
 
 	zio_execute(zio);
 }
 
 /*
  * IO completion callback for the distributed spare.  Intentionally empty,
  * the zio argument is unused.
  */
 static void
 vdev_draid_spare_io_done(zio_t *zio)
 {
 	(void) zio;
 }
 
 /*
  * Lookup the full spare config in spa->spa_spares.sav_config and
  * return the top_guid and spare_id for the named spare.
  */
 static int
 vdev_draid_spare_lookup(spa_t *spa, nvlist_t *nv, uint64_t *top_guidp,
     uint64_t *spare_idp)
 {
 	nvlist_t *sav_config = spa->spa_spares.sav_config;
 	nvlist_t **spares;
 	uint_t nspares;
 	const char *spare_name;
 
 	/* Without a spare list there is nothing which could match. */
 	if (sav_config == NULL)
 		return (SET_ERROR(ENOENT));
 
 	if (nvlist_lookup_nvlist_array(sav_config, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares) != 0) {
 		return (SET_ERROR(ENOENT));
 	}
 
 	/* The spare being looked up is identified by its path. */
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &spare_name) != 0)
 		return (SET_ERROR(EINVAL));
 
 	for (int i = 0; i < nspares; i++) {
 		nvlist_t *spare = spares[i];
 		const char *type, *path;
 		uint64_t top_guid, spare_id;
 
 		/* Skip non-distributed spares */
 		if (nvlist_lookup_string(spare, ZPOOL_CONFIG_TYPE,
 		    &type) != 0 || strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0) {
 			continue;
 		}
 
 		/* Skip spares with the wrong name */
 		if (nvlist_lookup_string(spare, ZPOOL_CONFIG_PATH,
 		    &path) != 0 || strcmp(path, spare_name) != 0) {
 			continue;
 		}
 
 		/* Found the matching spare, it must carry both values. */
 		if (nvlist_lookup_uint64(spare, ZPOOL_CONFIG_TOP_GUID,
 		    &top_guid) != 0 ||
 		    nvlist_lookup_uint64(spare, ZPOOL_CONFIG_SPARE_ID,
 		    &spare_id) != 0) {
 			return (SET_ERROR(EINVAL));
 		}
 
 		*top_guidp = top_guid;
 		*spare_idp = spare_id;
 		return (0);
 	}
 
 	return (SET_ERROR(ENOENT));
 }
 
 /*
  * Initialize private dRAID spare specific fields from the nvlist.
  */
 static int
 vdev_draid_spare_init(spa_t *spa, nvlist_t *nv, void **tsd)
 {
 	uint64_t top_guid = 0;
 	uint64_t spare_id;
 
 	/*
 	 * The spare list stored in the spa is the preferred source for the
 	 * top_guid and spare_id.  When creating a new pool or adding vdevs
 	 * that list is not yet populated, in which case both values must
 	 * be present in the passed config.
 	 */
 	if (vdev_draid_spare_lookup(spa, nv, &top_guid, &spare_id) != 0) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_TOP_GUID,
 		    &top_guid) != 0 ||
 		    nvlist_lookup_uint64(nv, ZPOOL_CONFIG_SPARE_ID,
 		    &spare_id) != 0) {
 			return (SET_ERROR(EINVAL));
 		}
 	}
 
 	/* The parent dRAID pointer stays NULL until the spare is opened. */
 	vdev_draid_spare_t *vds = kmem_alloc(sizeof (*vds), KM_SLEEP);
 	vds->vds_spare_id = spare_id;
 	vds->vds_top_guid = top_guid;
 	vds->vds_draid_vdev = NULL;
 
 	*tsd = vds;
 
 	return (0);
 }
 
 /*
  * Free the private state allocated by vdev_draid_spare_init().
  */
 static void
 vdev_draid_spare_fini(vdev_t *vd)
 {
 	vdev_draid_spare_t *vds = vd->vdev_tsd;
 
 	kmem_free(vds, sizeof (*vds));
 }
 
 /*
  * Add the distributed spare specific fields to the config nvlist.
  */
 static void
 vdev_draid_spare_config_generate(vdev_t *vd, nvlist_t *nv)
 {
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
 
 	const vdev_draid_spare_t *spare_state = vd->vdev_tsd;
 	const uint64_t top_guid = spare_state->vds_top_guid;
 	const uint64_t spare_id = spare_state->vds_spare_id;
 
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, top_guid);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_SPARE_ID, spare_id);
 }
 
 /*
  * Operations vector for the dRAID distributed spare vdev type.  It is
  * registered as a leaf vdev and uses the default asize, min_asize and
  * xlate callbacks; callbacks set to NULL are not provided.
  */
 vdev_ops_t vdev_draid_spare_ops = {
 	.vdev_op_init = vdev_draid_spare_init,
 	.vdev_op_fini = vdev_draid_spare_fini,
 	.vdev_op_open = vdev_draid_spare_open,
 	.vdev_op_close = vdev_draid_spare_close,
 	.vdev_op_asize = vdev_default_asize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_draid_spare_io_start,
 	.vdev_op_io_done = vdev_draid_spare_io_done,
 	.vdev_op_state_change = NULL,
 	.vdev_op_need_resilver = NULL,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = vdev_draid_spare_config_generate,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_DRAID_SPARE,
 	.vdev_op_leaf = B_TRUE,
 };
diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c
index 008e014ecfdc..f6e2662bd40f 100644
--- a/module/zfs/vdev_initialize.c
+++ b/module/zfs/vdev_initialize.c
@@ -1,833 +1,833 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2016, 2024 by Delphix. All rights reserved.
  */
 
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/txg.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab_impl.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zap.h>
 #include <sys/dmu_tx.h>
 #include <sys/vdev_initialize.h>
 
 /*
  * Value that is written to disk during initialization.  This 64-bit pattern
  * is repeated through the whole write buffer by vdev_initialize_block_fill().
  */
 static uint64_t zfs_initialize_value = 0xdeadbeefdeadbeeeULL;
 
 /*
  * Maximum number of I/Os outstanding per leaf vdev; vdev_initialize_write()
  * waits while this many initializing writes are in flight.
  */
 static const int zfs_initialize_limit = 1;
 
 /*
  * Size of initializing writes; default 1MiB, see zfs_remove_max_segment.
  * This is both the size of the fill buffer and the largest single write
  * issued by vdev_initialize_ranges().
  */
 static uint64_t zfs_initialize_chunk_size = 1024 * 1024;
 
 /*
  * Decide whether initializing of this vdev has to stop: an exit was
  * requested, the vdev is no longer writeable or has been detached, or its
  * top-level vdev is being removed or is undergoing raidz expansion.
  */
 static boolean_t
 vdev_initialize_should_stop(vdev_t *vd)
 {
 	if (vd->vdev_initialize_exit_wanted)
 		return (B_TRUE);
 	if (!vdev_writeable(vd))
 		return (B_TRUE);
 	if (vd->vdev_detached)
 		return (B_TRUE);
 
 	vdev_t *tvd = vd->vdev_top;
 	if (tvd->vdev_removing || tvd->vdev_rz_expanding)
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * Sync task that writes a leaf vdev's initialize progress to its leaf ZAP:
  * the last offset recorded for this txg (if any), the action time (if set)
  * and the current initialize state.  arg is a heap-allocated copy of the
  * vdev guid, which is freed here.
  */
 static void
 vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
 {
 	/*
 	 * We pass in the guid instead of the vdev_t since the vdev may
 	 * have been freed prior to the sync task being processed. This
 	 * happens when a vdev is detached as we call spa_config_vdev_exit(),
 	 * stop the initializing thread, schedule the sync task, and free
 	 * the vdev. Later when the scheduled sync task is invoked, it would
 	 * find that the vdev has been freed.
 	 */
 	uint64_t guid = *(uint64_t *)arg;
 	uint64_t txg = dmu_tx_get_txg(tx);
 	kmem_free(arg, sizeof (uint64_t));
 
 	/* Nothing to record if the vdev is gone or is not eligible. */
 	vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
 	if (vd == NULL || vd->vdev_top->vdev_removing ||
 	    !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding)
 		return;
 
 	/* Consume the offset recorded for this txg and clear its slot. */
 	uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
 	vd->vdev_initialize_offset[txg & TXG_MASK] = 0;
 
 	VERIFY(vd->vdev_leaf_zap != 0);
 
 	objset_t *mos = vd->vdev_spa->spa_meta_objset;
 
 	/* A zero slot means no offset was recorded for this txg. */
 	if (last_offset > 0) {
 		vd->vdev_initialize_last_offset = last_offset;
 		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
 		    VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
 		    sizeof (last_offset), 1, &last_offset, tx));
 	}
 	if (vd->vdev_initialize_action_time > 0) {
 		uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
 		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
 		    VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
 		    1, &val, tx));
 	}
 
 	/* The state is always written, widened to a uint64_t. */
 	uint64_t initialize_state = vd->vdev_initialize_state;
 	VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
 	    VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
 	    &initialize_state, tx));
 }
 
 /*
  * Sync task that clears the in-core last offset and action time and removes
  * the initialize entries from the leaf ZAP.  arg is a heap-allocated copy of
  * the vdev guid, which is freed here.  Entries that are already absent
  * (ENOENT) are tolerated.
  */
 static void
 vdev_initialize_zap_remove_sync(void *arg, dmu_tx_t *tx)
 {
 	const char *const zap_keys[] = {
 		VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
 		VDEV_LEAF_ZAP_INITIALIZE_STATE,
 		VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
 	};
 	uint64_t *guidp = arg;
 	uint64_t guid = *guidp;
 
 	kmem_free(guidp, sizeof (*guidp));
 
 	vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
 	if (vd == NULL)
 		return;
 	if (vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
 		return;
 
 	ASSERT3S(vd->vdev_initialize_state, ==, VDEV_INITIALIZE_NONE);
 	ASSERT3U(vd->vdev_leaf_zap, !=, 0);
 
 	vd->vdev_initialize_last_offset = 0;
 	vd->vdev_initialize_action_time = 0;
 
 	objset_t *mos = vd->vdev_spa->spa_meta_objset;
 
 	for (size_t k = 0; k < sizeof (zap_keys) / sizeof (zap_keys[0]);
 	    k++) {
 		int error = zap_remove(mos, vd->vdev_leaf_zap, zap_keys[k],
 		    tx);
 		VERIFY(error == 0 || error == ENOENT);
 	}
 }
 
 /*
  * Move vd to new_state.  Updates the in-core state (and usually the action
  * time), schedules a sync task that either persists the state to the leaf
  * ZAP or removes it (for VDEV_INITIALIZE_NONE), logs the transition to the
  * pool history, and notifies waiters unless the new state is ACTIVE.
  * Does nothing if the state is unchanged.  The caller must hold
  * vdev_initialize_lock.
  */
 static void
 vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
 {
 	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
 	spa_t *spa = vd->vdev_spa;
 
 	if (new_state == vd->vdev_initialize_state)
 		return;
 
 	/*
 	 * Copy the vd's guid, this will be freed by the sync task.
 	 */
 	uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
 	*guid = vd->vdev_guid;
 
 	/*
 	 * Record the time of this state change, unless the vdev is currently
 	 * suspended, in which case the existing action time is preserved.
 	 */
 	if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
 		vd->vdev_initialize_action_time = gethrestime_sec();
 	}
 
 	vdev_initializing_state_t old_state = vd->vdev_initialize_state;
 	vd->vdev_initialize_state = new_state;
 
 	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 
 	/* NONE removes the on-disk entries; every other state updates them. */
 	if (new_state != VDEV_INITIALIZE_NONE) {
 		dsl_sync_task_nowait(spa_get_dsl(spa),
 		    vdev_initialize_zap_update_sync, guid, tx);
 	} else {
 		dsl_sync_task_nowait(spa_get_dsl(spa),
 		    vdev_initialize_zap_remove_sync, guid, tx);
 	}
 
 	switch (new_state) {
 	case VDEV_INITIALIZE_ACTIVE:
 		spa_history_log_internal(spa, "initialize", tx,
 		    "vdev=%s activated", vd->vdev_path);
 		break;
 	case VDEV_INITIALIZE_SUSPENDED:
 		spa_history_log_internal(spa, "initialize", tx,
 		    "vdev=%s suspended", vd->vdev_path);
 		break;
 	case VDEV_INITIALIZE_CANCELED:
 		/* Only log a cancel if there was something to cancel. */
 		if (old_state == VDEV_INITIALIZE_ACTIVE ||
 		    old_state == VDEV_INITIALIZE_SUSPENDED)
 			spa_history_log_internal(spa, "initialize", tx,
 			    "vdev=%s canceled", vd->vdev_path);
 		break;
 	case VDEV_INITIALIZE_COMPLETE:
 		spa_history_log_internal(spa, "initialize", tx,
 		    "vdev=%s complete", vd->vdev_path);
 		break;
 	case VDEV_INITIALIZE_NONE:
 		spa_history_log_internal(spa, "uninitialize", tx,
 		    "vdev=%s", vd->vdev_path);
 		break;
 	default:
 		panic("invalid state %llu", (unsigned long long)new_state);
 	}
 
 	dmu_tx_commit(tx);
 
 	if (new_state != VDEV_INITIALIZE_ACTIVE)
 		spa_notify_waiters(spa);
 }
 
 /*
  * Completion callback for an initializing write.  Accounts for the result,
  * drops the inflight count, wakes any waiter on vdev_initialize_io_cv and
  * releases the SCL_STATE_ALL hold taken by vdev_initialize_write().
  */
 static void
 vdev_initialize_cb(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 
 	mutex_enter(&vd->vdev_initialize_io_lock);
 	if (zio->io_error != ENXIO || vdev_writeable(vd)) {
 		/*
 		 * Since initializing is best-effort, we ignore I/O errors and
 		 * rely on vdev_probe to determine if the errors are more
 		 * critical.
 		 */
 		if (zio->io_error != 0)
 			vd->vdev_stat.vs_initialize_errors++;
 
 		vd->vdev_initialize_bytes_done += zio->io_orig_size;
 	} else {
 		/*
 		 * The I/O failed because the vdev was unavailable; roll the
 		 * last offset back. (This works because spa_sync waits on
 		 * spa_txg_zio before it runs sync tasks.)
 		 */
 		uint64_t *slot =
 		    &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
 		if (zio->io_offset < *slot)
 			*slot = zio->io_offset;
 	}
 	ASSERT3U(vd->vdev_initialize_inflight, >, 0);
 	vd->vdev_initialize_inflight--;
 	cv_broadcast(&vd->vdev_initialize_io_cv);
 	mutex_exit(&vd->vdev_initialize_io_lock);
 
 	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
 }
 
 /*
  * Takes care of physical writing and limiting # of concurrent ZIOs.
  *
  * Issues one asynchronous write of size bytes from data at offset start on
  * vd.  Returns 0 once the write has been issued, or EINTR (without issuing
  * anything) if vdev_initialize_should_stop() reports that initializing must
  * stop.
  */
 static int
 vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	/* Limit inflight initializing I/Os */
 	mutex_enter(&vd->vdev_initialize_io_lock);
 	while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
 		cv_wait(&vd->vdev_initialize_io_cv,
 		    &vd->vdev_initialize_io_lock);
 	}
 	vd->vdev_initialize_inflight++;
 	mutex_exit(&vd->vdev_initialize_io_lock);
 
 	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 	uint64_t txg = dmu_tx_get_txg(tx);
 
 	spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
 	mutex_enter(&vd->vdev_initialize_lock);
 
 	if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
 		uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
 		*guid = vd->vdev_guid;
 
 		/* This is the first write of this txg. */
 		dsl_sync_task_nowait(spa_get_dsl(spa),
 		    vdev_initialize_zap_update_sync, guid, tx);
 	}
 
 	/*
 	 * We know the vdev struct will still be around since all
 	 * consumers of vdev_free must stop the initialization first.
 	 */
 	if (vdev_initialize_should_stop(vd)) {
 		/* Undo the inflight count taken above; no I/O is issued. */
 		mutex_enter(&vd->vdev_initialize_io_lock);
 		ASSERT3U(vd->vdev_initialize_inflight, >, 0);
 		vd->vdev_initialize_inflight--;
 		mutex_exit(&vd->vdev_initialize_io_lock);
 		spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
 		mutex_exit(&vd->vdev_initialize_lock);
 		dmu_tx_commit(tx);
 		return (SET_ERROR(EINTR));
 	}
 	mutex_exit(&vd->vdev_initialize_lock);
 
 	/* Record the end of this write as the progress for this txg. */
 	vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
 	zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
 	    size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
 	    ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
 	/* vdev_initialize_cb releases SCL_STATE_ALL */
 
 	dmu_tx_commit(tx);
 
 	return (0);
 }
 
 /*
  * abd_iterate_func() callback that stamps zfs_initialize_value into every
  * 64-bit word of one ABD chunk. len must be a multiple of sizeof (uint64_t)
  * and buf must be 8-byte aligned; the ABD allocation guarantees both.
  * Always returns 0.
  */
 static int
 vdev_initialize_block_fill(void *buf, size_t len, void *unused)
 {
 	(void) unused;
 
 	ASSERT0(len % sizeof (uint64_t));
 
 	char *cursor = buf;
 	char *limit = cursor + len;
 
 	while (cursor < limit) {
 		*(uint64_t *)cursor = zfs_initialize_value;
 		cursor += sizeof (uint64_t);
 	}
 	return (0);
 }
 
 static abd_t *
 vdev_initialize_block_alloc(void)
 {
 	/* Allocate ABD for filler data */
 	abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);
 
 	ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
 	(void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
 	    vdev_initialize_block_fill, NULL);
 
 	return (data);
 }
 
 /*
  * Free the filler ABD returned by vdev_initialize_block_alloc().
  */
 static void
 vdev_initialize_block_free(abd_t *data)
 {
 	abd_free(data);
 }
 
 /*
  * Issue initializing writes for every segment in vd->vdev_initialize_tree,
  * using data as the source buffer.  Each segment is split into writes of at
  * most zfs_initialize_chunk_size bytes, and VDEV_LABEL_START_SIZE is added
  * to each segment's start to form the write offset.  Returns 0 on success,
  * or the first error returned by vdev_initialize_write().
  *
  * NOTE(review): data was sized from zfs_initialize_chunk_size when it was
  * allocated, while this loop re-reads the tunable on every write -- confirm
  * that changing the tunable at runtime cannot make a write larger than data.
  */
 static int
 vdev_initialize_ranges(vdev_t *vd, abd_t *data)
 {
 	zfs_range_tree_t *rt = vd->vdev_initialize_tree;
 	zfs_btree_t *bt = &rt->rt_root;
 	zfs_btree_index_t where;
 
 	for (zfs_range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL;
 	    rs = zfs_btree_next(bt, &where, &where)) {
 		uint64_t size = zfs_rs_get_end(rs, rt) -
 		    zfs_rs_get_start(rs, rt);
 
 		/* Split range into legally-sized physical chunks */
 		uint64_t writes_required =
 		    ((size - 1) / zfs_initialize_chunk_size) + 1;
 
 		for (uint64_t w = 0; w < writes_required; w++) {
 			int error;
 
 			/* The final chunk may be shorter than the rest. */
 			error = vdev_initialize_write(vd,
 			    VDEV_LABEL_START_SIZE + zfs_rs_get_start(rs, rt) +
 			    (w * zfs_initialize_chunk_size),
 			    MIN(size - (w * zfs_initialize_chunk_size),
 			    zfs_initialize_chunk_size), data);
 			if (error != 0)
 				return (error);
 		}
 	}
 	return (0);
 }
 
 /*
  * vdev_xlate_walk() callback.  arg points to a uint64_t which is raised to
  * the end of physical_rs whenever that end is larger.
  */
 static void
-vdev_initialize_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
+vdev_initialize_xlate_last_rs_end(void *arg, zfs_range_seg64_t *physical_rs)
 {
 	uint64_t *last_rs_end = (uint64_t *)arg;
 
 	if (physical_rs->rs_end > *last_rs_end)
 		*last_rs_end = physical_rs->rs_end;
 }
 
 /*
  * vdev_xlate_walk() callback; arg is the vdev.  Adds the size of physical_rs
  * to the vdev's estimated byte count, and adds to its done byte count the
  * part of physical_rs that lies below vdev_initialize_last_offset.
  */
 static void
-vdev_initialize_xlate_progress(void *arg, range_seg64_t *physical_rs)
+vdev_initialize_xlate_progress(void *arg, zfs_range_seg64_t *physical_rs)
 {
 	vdev_t *vd = (vdev_t *)arg;
 
 	uint64_t size = physical_rs->rs_end - physical_rs->rs_start;
 	vd->vdev_initialize_bytes_est += size;
 
 	if (vd->vdev_initialize_last_offset > physical_rs->rs_end) {
 		/* The last offset is past this whole segment. */
 		vd->vdev_initialize_bytes_done += size;
 	} else if (vd->vdev_initialize_last_offset > physical_rs->rs_start &&
 	    vd->vdev_initialize_last_offset < physical_rs->rs_end) {
 		/* The last offset falls inside this segment. */
 		vd->vdev_initialize_bytes_done +=
 		    vd->vdev_initialize_last_offset - physical_rs->rs_start;
 	}
 }
 
 /*
  * Recompute vdev_initialize_bytes_est and vdev_initialize_bytes_done for vd
  * from scratch by walking every metaslab of its top-level vdev and comparing
  * it against vdev_initialize_last_offset.  The caller must hold SCL_CONFIG
  * as reader or writer.
  */
 static void
 vdev_initialize_calculate_progress(vdev_t *vd)
 {
 	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
 	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
 	ASSERT(vd->vdev_leaf_zap != 0);
 
 	vd->vdev_initialize_bytes_est = 0;
 	vd->vdev_initialize_bytes_done = 0;
 
 	for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
 		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
 		mutex_enter(&msp->ms_lock);
 
 		/* Free space of this metaslab divided by vdev_get_ndisks(). */
 		uint64_t ms_free = (msp->ms_size -
 		    metaslab_allocated_space(msp)) /
 		    vdev_get_ndisks(vd->vdev_top);
 
 		/*
 		 * Convert the metaslab range to a physical range
 		 * on our vdev. We use this to determine if we are
 		 * in the middle of this metaslab range.
 		 */
-		range_seg64_t logical_rs, physical_rs, remain_rs;
+		zfs_range_seg64_t logical_rs, physical_rs, remain_rs;
 		logical_rs.rs_start = msp->ms_start;
 		logical_rs.rs_end = msp->ms_start + msp->ms_size;
 
 		/* Metaslab space after this offset has not been initialized */
 		vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
 		if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
 			vd->vdev_initialize_bytes_est += ms_free;
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		/* Metaslab space before this offset has been initialized */
 		uint64_t last_rs_end = physical_rs.rs_end;
 		if (!vdev_xlate_is_empty(&remain_rs)) {
 			vdev_xlate_walk(vd, &remain_rs,
 			    vdev_initialize_xlate_last_rs_end, &last_rs_end);
 		}
 
 		if (vd->vdev_initialize_last_offset > last_rs_end) {
 			vd->vdev_initialize_bytes_done += ms_free;
 			vd->vdev_initialize_bytes_est += ms_free;
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		/*
 		 * If we get here, we're in the middle of initializing this
 		 * metaslab. Load it and walk the free tree for more accurate
 		 * progress estimation.
 		 */
 		VERIFY0(metaslab_load(msp));
 
 		zfs_btree_index_t where;
 		zfs_range_tree_t *rt = msp->ms_allocatable;
 		for (zfs_range_seg_t *rs =
 		    zfs_btree_first(&rt->rt_root, &where); rs;
 		    rs = zfs_btree_next(&rt->rt_root, &where,
 		    &where)) {
 			logical_rs.rs_start = zfs_rs_get_start(rs, rt);
 			logical_rs.rs_end = zfs_rs_get_end(rs, rt);
 
 			/* Both counters are updated by the callback. */
 			vdev_xlate_walk(vd, &logical_rs,
 			    vdev_initialize_xlate_progress, vd);
 		}
 		mutex_exit(&msp->ms_lock);
 	}
 }
 
 /*
  * Load the persisted last offset for an active or suspended initialize (a
  * missing entry counts as offset 0) and then recompute the progress
  * counters.  Returns 0 or the zap_lookup() error.  The caller must hold
  * SCL_CONFIG as reader or writer.
  */
 static int
 vdev_initialize_load(vdev_t *vd)
 {
 	int err = 0;
 
 	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
 	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
 	ASSERT(vd->vdev_leaf_zap != 0);
 
 	switch (vd->vdev_initialize_state) {
 	case VDEV_INITIALIZE_ACTIVE:
 	case VDEV_INITIALIZE_SUSPENDED:
 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
 		    sizeof (vd->vdev_initialize_last_offset), 1,
 		    &vd->vdev_initialize_last_offset);
 		if (err == ENOENT) {
 			/* Nothing was recorded yet; start from the top. */
 			vd->vdev_initialize_last_offset = 0;
 			err = 0;
 		}
 		break;
 	default:
 		break;
 	}
 
 	vdev_initialize_calculate_progress(vd);
 	return (err);
 }
 
 /*
  * vdev_xlate_walk() callback; arg is the vdev.  Adds physical_rs to
  * vd->vdev_initialize_tree, skipping segments that end at or before
  * vdev_initialize_last_offset and trimming the start of a segment that
  * straddles it.  Note that physical_rs may be modified.
  */
 static void
-vdev_initialize_xlate_range_add(void *arg, range_seg64_t *physical_rs)
+vdev_initialize_xlate_range_add(void *arg, zfs_range_seg64_t *physical_rs)
 {
 	vdev_t *vd = arg;
 
 	/* Only add segments that we have not visited yet */
 	if (physical_rs->rs_end <= vd->vdev_initialize_last_offset)
 		return;
 
 	/* Pick up where we left off mid-range. */
 	if (vd->vdev_initialize_last_offset > physical_rs->rs_start) {
 		zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
 		    "(%llu, %llu)", vd->vdev_path,
 		    (u_longlong_t)physical_rs->rs_start,
 		    (u_longlong_t)physical_rs->rs_end,
 		    (u_longlong_t)vd->vdev_initialize_last_offset,
 		    (u_longlong_t)physical_rs->rs_end);
 		ASSERT3U(physical_rs->rs_end, >,
 		    vd->vdev_initialize_last_offset);
 		physical_rs->rs_start = vd->vdev_initialize_last_offset;
 	}
 
 	ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);
 
 	zfs_range_tree_add(vd->vdev_initialize_tree, physical_rs->rs_start,
 	    physical_rs->rs_end - physical_rs->rs_start);
 }
 
 /*
  * Convert the logical range [start, start + size) into physical ranges and
  * add them to our range tree (vd->vdev_initialize_tree).  arg is the leaf
  * vdev.
  */
 static void
 vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
 {
 	vdev_t *vd = arg;
-	range_seg64_t logical_rs;
+	zfs_range_seg64_t logical_rs;
 	logical_rs.rs_start = start;
 	logical_rs.rs_end = start + size;
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 	vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg);
 }
 
 /*
  * Body of the per-vdev initializing thread; arg is the leaf vdev.  For each
  * metaslab of the top-level vdev it collects the allocatable ranges, writes
  * the fill pattern over them, and stops early on error or detach.  It then
  * waits for outstanding writes, updates the initialize state, waits for a
  * txg sync, clears vdev_initialize_thread and exits.
  */
 static __attribute__((noreturn)) void
 vdev_initialize_thread(void *arg)
 {
 	vdev_t *vd = arg;
 	spa_t *spa = vd->vdev_spa;
 	int error = 0;
 	uint64_t ms_count = 0;
 
 	ASSERT(vdev_is_concrete(vd));
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	/* Pick up any previously persisted progress. */
 	vd->vdev_initialize_last_offset = 0;
 	VERIFY0(vdev_initialize_load(vd));
 
 	abd_t *deadbeef = vdev_initialize_block_alloc();
 
 	vd->vdev_initialize_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
 	    NULL, 0, 0);
 
 	for (uint64_t i = 0; !vd->vdev_detached &&
 	    i < vd->vdev_top->vdev_ms_count; i++) {
 		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
 		boolean_t unload_when_done = B_FALSE;
 
 		/*
 		 * If we've expanded the top-level vdev or it's our
 		 * first pass, calculate our progress.
 		 */
 		if (vd->vdev_top->vdev_ms_count != ms_count) {
 			vdev_initialize_calculate_progress(vd);
 			ms_count = vd->vdev_top->vdev_ms_count;
 		}
 
 		/* SCL_CONFIG is dropped while this metaslab is processed. */
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		metaslab_disable(msp);
 		mutex_enter(&msp->ms_lock);
 		/* Remember whether the load below is done on our behalf. */
 		if (!msp->ms_loaded && !msp->ms_loading)
 			unload_when_done = B_TRUE;
 		VERIFY0(metaslab_load(msp));
 
 		zfs_range_tree_walk(msp->ms_allocatable,
 		    vdev_initialize_range_add, vd);
 		mutex_exit(&msp->ms_lock);
 
 		error = vdev_initialize_ranges(vd, deadbeef);
 		metaslab_enable(msp, B_TRUE, unload_when_done);
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 		/* Empty the tree before the next metaslab (or before exit). */
 		zfs_range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
 		if (error != 0)
 			break;
 	}
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 	/* Wait for every outstanding initializing write to complete. */
 	mutex_enter(&vd->vdev_initialize_io_lock);
 	while (vd->vdev_initialize_inflight > 0) {
 		cv_wait(&vd->vdev_initialize_io_cv,
 		    &vd->vdev_initialize_io_lock);
 	}
 	mutex_exit(&vd->vdev_initialize_io_lock);
 
 	zfs_range_tree_destroy(vd->vdev_initialize_tree);
 	vdev_initialize_block_free(deadbeef);
 	vd->vdev_initialize_tree = NULL;
 
 	mutex_enter(&vd->vdev_initialize_lock);
 	/* When an exit was requested, the state is left as it is. */
 	if (!vd->vdev_initialize_exit_wanted) {
 		if (vdev_writeable(vd)) {
 			vdev_initialize_change_state(vd,
 			    VDEV_INITIALIZE_COMPLETE);
 		} else if (vd->vdev_faulted) {
 			vdev_initialize_change_state(vd,
 			    VDEV_INITIALIZE_CANCELED);
 		}
 	}
 	ASSERT(vd->vdev_initialize_thread != NULL ||
 	    vd->vdev_initialize_inflight == 0);
 
 	/*
 	 * Drop the vdev_initialize_lock while we sync out the
 	 * txg since it's possible that a device might be trying to
 	 * come online and must check to see if it needs to restart an
 	 * initialization. That thread will be holding the spa_config_lock
 	 * which would prevent the txg_wait_synced from completing.
 	 */
 	mutex_exit(&vd->vdev_initialize_lock);
 	txg_wait_synced(spa_get_dsl(spa), 0);
 	mutex_enter(&vd->vdev_initialize_lock);
 
 	/* Signal anyone blocked in vdev_initialize_stop_wait_impl(). */
 	vd->vdev_initialize_thread = NULL;
 	cv_broadcast(&vd->vdev_initialize_cv);
 	mutex_exit(&vd->vdev_initialize_lock);
 
 	thread_exit();
 }
 
 /*
  * Starts initializing a device: marks it VDEV_INITIALIZE_ACTIVE and creates
  * its initializing thread. Caller must hold vdev_initialize_lock.
  * Device must be a leaf and not already be initializing.
  */
 void
 vdev_initialize(vdev_t *vd)
 {
 	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
 	ASSERT(!vd->vdev_detached);
 	ASSERT(!vd->vdev_initialize_exit_wanted);
 	ASSERT(!vd->vdev_top->vdev_removing);
 	ASSERT(!vd->vdev_top->vdev_rz_expanding);
 
 	vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
 	vd->vdev_initialize_thread = thread_create(NULL, 0,
 	    vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
 }
 
 /*
  * Uninitializes a device by moving it to VDEV_INITIALIZE_NONE, which also
  * schedules removal of its persisted initialize entries. Caller must hold
  * vdev_initialize_lock.
  * Device must be a leaf and must not have an initializing thread running.
  */
 void
 vdev_uninitialize(vdev_t *vd)
 {
 	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
 	ASSERT(!vd->vdev_detached);
 	ASSERT(!vd->vdev_initialize_exit_wanted);
 	ASSERT(!vd->vdev_top->vdev_removing);
 
 	vdev_initialize_change_state(vd, VDEV_INITIALIZE_NONE);
 }
 
 /*
  * Block until the initialize thread of vd has terminated (cancelled or
  * stopped), then clear the exit request.  The caller must hold
  * vdev_initialize_lock.
  */
 static void
 vdev_initialize_stop_wait_impl(vdev_t *vd)
 {
 	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
 
 	for (;;) {
 		if (vd->vdev_initialize_thread == NULL)
 			break;
 		cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);
 	}
 
 	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
 	vd->vdev_initialize_exit_wanted = B_FALSE;
 }
 
 /*
  * Wait for the initialize threads of all vdevs on vd_list to exit cleanly.
  * The list is emptied as each vdev is waited on.
  */
 void
 vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list)
 {
 	(void) spa;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 	    spa->spa_export_thread == curthread);
 
 	for (vdev_t *vd = list_remove_head(vd_list); vd != NULL;
 	    vd = list_remove_head(vd_list)) {
 		mutex_enter(&vd->vdev_initialize_lock);
 		vdev_initialize_stop_wait_impl(vd);
 		mutex_exit(&vd->vdev_initialize_lock);
 	}
 }
 
 /*
  * Stop initializing a device and leave it in tgt_state.  Pass NULL for
  * vd_list to block until the initializing thread has exited.  When a list_t
  * is provided the vdev is appended to it instead, and the caller must later
  * call vdev_initialize_stop_wait() to wait for all of the listed threads.
  * The caller must hold vdev_initialize_lock and must not be writing to the
  * spa config, as the initializing thread may try to enter the config as a
  * reader before exiting.
  */
 void
 vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state,
     list_t *vd_list)
 {
 	ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
 	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 	ASSERT(vdev_is_concrete(vd));
 
 	/*
 	 * With no thread running there is nothing to stop, but a cancel
 	 * request is still allowed to proceed.
 	 */
 	boolean_t no_thread = (vd->vdev_initialize_thread == NULL);
 	if (no_thread && tgt_state != VDEV_INITIALIZE_CANCELED)
 		return;
 
 	vdev_initialize_change_state(vd, tgt_state);
 	vd->vdev_initialize_exit_wanted = B_TRUE;
 
 	if (vd_list != NULL) {
 		ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 		    vd->vdev_spa->spa_export_thread == curthread);
 		list_insert_tail(vd_list, vd);
 		return;
 	}
 
 	vdev_initialize_stop_wait_impl(vd);
 }
 
 /*
  * Walk the vdev tree rooted at vd and request a stop, with target state
  * tgt_state, on every concrete leaf.  Stopping vdevs are collected on
  * vd_list.
  */
 static void
 vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state,
     list_t *vd_list)
 {
 	if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
 		for (uint64_t c = 0; c < vd->vdev_children; c++) {
 			vdev_initialize_stop_all_impl(vd->vdev_child[c],
 			    tgt_state, vd_list);
 		}
 		return;
 	}
 
 	mutex_enter(&vd->vdev_initialize_lock);
 	vdev_initialize_stop(vd, tgt_state, vd_list);
 	mutex_exit(&vd->vdev_initialize_lock);
 }
 
 /*
  * Convenience function that stops initializing for a whole vdev tree,
  * waits for every initialize thread to exit (leaving the thread pointers
  * NULL) and, if the pool is syncing, waits for the state to reach disk.
  */
 void
 vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
 {
 	spa_t *spa = vd->vdev_spa;
 	list_t stopping;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 	    spa->spa_export_thread == curthread);
 
 	list_create(&stopping, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_initialize_node));
 
 	vdev_initialize_stop_all_impl(vd, tgt_state, &stopping);
 	vdev_initialize_stop_wait(spa, &stopping);
 
 	/* Make sure that our state has been synced to disk */
 	if (spa->spa_sync_on)
 		txg_wait_synced(spa_get_dsl(spa), 0);
 
 	list_destroy(&stopping);
 }
 
 /*
  * Reload the persisted initialize state and action time for vd and all of
  * its descendants.  A vdev that is suspended or offline only has its
  * progress loaded for reporting; a vdev that was active, is writeable and
  * has no thread running gets its initializing restarted.  A vdev without a
  * leaf ZAP is skipped, but its children are still visited.
  */
 void
 vdev_initialize_restart(vdev_t *vd)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 	    vd->vdev_spa->spa_load_thread == curthread);
 	ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
 
 	if (vd->vdev_leaf_zap != 0) {
 		mutex_enter(&vd->vdev_initialize_lock);
 		/* A missing entry leaves the default of NONE in place. */
 		uint64_t initialize_state = VDEV_INITIALIZE_NONE;
 		int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
 		    sizeof (initialize_state), 1, &initialize_state);
 		ASSERT(err == 0 || err == ENOENT);
 		vd->vdev_initialize_state = initialize_state;
 
 		/* Likewise, a missing action time is treated as 0. */
 		uint64_t timestamp = 0;
 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
 		    sizeof (timestamp), 1, &timestamp);
 		ASSERT(err == 0 || err == ENOENT);
 		vd->vdev_initialize_action_time = timestamp;
 
 		if ((vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
 		    vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) {
 			/* load progress for reporting, but don't resume */
 			VERIFY0(vdev_initialize_load(vd));
 		} else if (vd->vdev_initialize_state ==
 		    VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) &&
 		    !vd->vdev_top->vdev_removing &&
 		    !vd->vdev_top->vdev_rz_expanding &&
 		    vd->vdev_initialize_thread == NULL) {
 			vdev_initialize(vd);
 		}
 
 		mutex_exit(&vd->vdev_initialize_lock);
 	}
 
 	for (uint64_t i = 0; i < vd->vdev_children; i++) {
 		vdev_initialize_restart(vd->vdev_child[i]);
 	}
 }
 
 /* Entry points of this file that are exported to other modules. */
 EXPORT_SYMBOL(vdev_initialize);
 EXPORT_SYMBOL(vdev_uninitialize);
 EXPORT_SYMBOL(vdev_initialize_stop);
 EXPORT_SYMBOL(vdev_initialize_stop_all);
 EXPORT_SYMBOL(vdev_initialize_stop_wait);
 EXPORT_SYMBOL(vdev_initialize_restart);
 
 /* Module parameters, both registered with ZMOD_RW. */
 ZFS_MODULE_PARAM(zfs, zfs_, initialize_value, U64, ZMOD_RW,
 	"Value written during zpool initialize");
 
 ZFS_MODULE_PARAM(zfs, zfs_, initialize_chunk_size, U64, ZMOD_RW,
 	"Size in bytes of writes by zpool initialize");
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 9d12bc2eb0a2..2c4e0c1c4848 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -1,2166 +1,2167 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  */
 
 /*
  * Virtual Device Labels
  * ---------------------
  *
  * The vdev label serves several distinct purposes:
  *
  *	1. Uniquely identify this device as part of a ZFS pool and confirm its
  *	   identity within the pool.
  *
  *	2. Verify that all the devices given in a configuration are present
  *         within the pool.
  *
  *	3. Determine the uberblock for the pool.
  *
  *	4. In case of an import operation, determine the configuration of the
  *         toplevel vdev of which it is a part.
  *
  *	5. If an import operation cannot find all the devices in the pool,
  *         provide enough information to the administrator to determine which
  *         devices are missing.
  *
  * It is important to note that while the kernel is responsible for writing the
  * label, it only consumes the information in the first three cases.  The
  * latter information is only consumed in userland when determining the
  * configuration to import a pool.
  *
  *
  * Label Organization
  * ------------------
  *
  * Before describing the contents of the label, it's important to understand how
  * the labels are written and updated with respect to the uberblock.
  *
  * When the pool configuration is altered, either because it was newly created
  * or a device was added, we want to update all the labels such that we can deal
  * with fatal failure at any point.  To this end, each disk has two labels which
  * are updated before and after the uberblock is synced.  Assuming we have
  * labels and an uberblock with the following transaction groups:
  *
  *              L1          UB          L2
  *           +------+    +------+    +------+
  *           |      |    |      |    |      |
  *           | t10  |    | t10  |    | t10  |
  *           |      |    |      |    |      |
  *           +------+    +------+    +------+
  *
  * In this stable state, the labels and the uberblock were all updated within
  * the same transaction group (10).  Each label is mirrored and checksummed, so
  * that we can detect when we fail partway through writing the label.
  *
  * In order to identify which labels are valid, the labels are written in the
  * following manner:
  *
  *	1. For each vdev, update 'L1' to the new label
  *	2. Update the uberblock
  *	3. For each vdev, update 'L2' to the new label
  *
  * Given arbitrary failure, we can determine the correct label to use based on
  * the transaction group.  If we fail after updating L1 but before updating the
  * UB, we will notice that L1's transaction group is greater than the uberblock,
  * so L2 must be valid.  If we fail after writing the uberblock but before
  * writing L2, we will notice that L2's transaction group is less than L1, and
  * therefore L1 is valid.
  *
  * Another added complexity is that not every label is updated when the config
  * is synced.  If we add a single device, we do not want to have to re-write
  * every label for every device in the pool.  This means that both L1 and L2 may
  * be older than the pool uberblock, because the necessary information is stored
  * on another vdev.
  *
  *
  * On-disk Format
  * --------------
  *
  * The vdev label consists of two distinct parts, and is wrapped within the
  * vdev_label_t structure.  The label includes 8k of padding to permit legacy
  * VTOC disk labels; this padding is otherwise ignored.
  *
  * The first half of the label is a packed nvlist which contains pool wide
  * properties, per-vdev properties, and configuration information.  It is
  * described in more detail below.
  *
  * The latter half of the label consists of a redundant array of uberblocks.
  * These uberblocks are updated whenever a transaction group is committed,
  * or when the configuration is updated.  When a pool is loaded, we scan each
  * vdev for the 'best' uberblock.
  *
  *
  * Configuration Information
  * -------------------------
  *
  * The nvlist describing the pool and vdev contains the following elements:
  *
  *	version		ZFS on-disk version
  *	name		Pool name
  *	state		Pool state
  *	txg		Transaction group in which this label was written
  *	pool_guid	Unique identifier for this pool
  *	vdev_tree	An nvlist describing vdev tree.
  *	features_for_read
  *			An nvlist of the features necessary for reading the MOS.
  *
  * Each leaf device label also contains the following:
  *
  *	top_guid	Unique ID for top-level vdev in which this is contained
  *	guid		Unique ID for the leaf vdev
  *
  * The 'vs' configuration follows the format described in 'spa_config.c'.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_raidz.h>
 #include <sys/vdev_draid.h>
 #include <sys/uberblock_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/zio.h>
 #include <sys/dsl_scan.h>
 #include <sys/abd.h>
 #include <sys/fs/zfs.h>
 #include <sys/byteorder.h>
 #include <sys/zfs_bootenv.h>
 
 /*
  * Basic routines to read and write from a vdev label.
  * Used throughout the rest of this file.
  */
 /*
  * Translate 'offset' within label number 'l' into an absolute device offset
  * for a device whose physical size is 'psize'.
  *
  * Labels with l < VDEV_LABELS / 2 are laid out back to back starting at
  * device offset 0.  The remaining labels are shifted by
  * psize - VDEV_LABELS * sizeof (vdev_label_t), so the last label ends
  * exactly at 'psize'.
  */
 uint64_t
 vdev_label_offset(uint64_t psize, int l, uint64_t offset)
 {
 	/* The offset must fall inside a single label. */
 	ASSERT(offset < sizeof (vdev_label_t));
 	/* NOTE(review): presumably verifies psize is label-size aligned. */
 	ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0);
 
 	return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
 	    0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
 }
 
 /*
  * Return the index of the vdev label that contains the device offset
  * 'offset' on a device of physical size 'psize', or -1 if the computed
  * index is not below VDEV_LABELS.
  */
 int
 vdev_label_number(uint64_t psize, uint64_t offset)
 {
 	int l;
 
 	/*
 	 * Offsets in the trailing VDEV_LABEL_END_SIZE bytes of the device are
 	 * rebased so that they map onto label indices starting at
 	 * VDEV_LABELS / 2.
 	 */
 	if (offset >= psize - VDEV_LABEL_END_SIZE) {
 		offset -= psize - VDEV_LABEL_END_SIZE;
 		offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t);
 	}
 	l = offset / sizeof (vdev_label_t);
 	return (l < VDEV_LABELS ? l : -1);
 }
 
 /*
  * Issue a read of 'size' bytes at 'offset' within label 'l' of 'vd' into
  * 'buf'.  The read is created with zio_read_phys() under 'zio' and started
  * with zio_nowait(), so this function does not wait for completion; 'done'
  * and 'private' are passed through to zio_read_phys().
  *
  * The caller must hold SCL_STATE (as reader or writer) and must include
  * ZIO_FLAG_CONFIG_WRITER in 'flags'; both are asserted.
  */
 static void
 vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
     uint64_t size, zio_done_func_t *done, void *private, int flags)
 {
 	ASSERT(
 	    spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE ||
 	    spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE);
 	ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
 
 	/* Convert the label-relative offset to a device offset. */
 	zio_nowait(zio_read_phys(zio, vd,
 	    vdev_label_offset(vd->vdev_psize, l, offset),
 	    size, buf, ZIO_CHECKSUM_LABEL, done, private,
 	    ZIO_PRIORITY_SYNC_READ, flags, B_TRUE));
 }
 
 /*
  * Issue a write of 'size' bytes from 'buf' at 'offset' within label 'l' of
  * 'vd'.  The write is created with zio_write_phys() under 'zio' and started
  * with zio_nowait(), so this function does not wait for completion; 'done'
  * and 'private' are passed through to zio_write_phys().
  *
  * The caller must hold SCL_STATE (as reader or writer) and must include
  * ZIO_FLAG_CONFIG_WRITER in 'flags'; both are asserted.
  */
 void
 vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
     uint64_t size, zio_done_func_t *done, void *private, int flags)
 {
 	ASSERT(
 	    spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE ||
 	    spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE);
 	ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
 
 	/* Convert the label-relative offset to a device offset. */
 	zio_nowait(zio_write_phys(zio, vd,
 	    vdev_label_offset(vd->vdev_psize, l, offset),
 	    size, buf, ZIO_CHECKSUM_LABEL, done, private,
 	    ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE));
 }
 
 /*
  * Generate the nvlist representing this vdev's stats
  *
  * The basic stats (vdev_stat_t) are added to 'nv' under
  * ZPOOL_CONFIG_VDEV_STATS as a flat uint64_t array.  The extended stats
  * (vdev_stat_ex_t) are collected in a temporary nvlist which is added to
  * 'nv' under ZPOOL_CONFIG_VDEV_STATS_EX and then freed.
  */
 void
 vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
 {
 	nvlist_t *nvx;
 	vdev_stat_t *vs;
 	vdev_stat_ex_t *vsx;
 
 	/* Both stat structures are heap allocated and freed before return. */
 	vs = kmem_alloc(sizeof (*vs), KM_SLEEP);
 	vsx = kmem_alloc(sizeof (*vsx), KM_SLEEP);
 
 	vdev_get_stats_ex(vd, vs, vsx);
 	/* Export the whole vdev_stat_t as an array of uint64_t words. */
 	fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
 	    (uint64_t *)vs, sizeof (*vs) / sizeof (uint64_t));
 
 	/*
 	 * Add extended stats into a special extended stats nvlist.  This keeps
 	 * all the extended stats nicely grouped together.  The extended stats
 	 * nvlist is then added to the main nvlist.
 	 */
 	nvx = fnvlist_alloc();
 
 	/* ZIOs in flight to disk */
 	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE,
 	    vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_READ]);
 
 	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE,
 	    vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_WRITE]);
 
 	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE,
 	    vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_READ]);
 
 	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE,
 	    vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_WRITE]);
 
 	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,
 	    vsx->vsx_active_queue[ZIO_PRIORITY_SCRUB]);
 
 	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE,
 	    vsx->vsx_active_queue[ZIO_PRIORITY_TRIM]);
 
 	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE,
 	    vsx->vsx_active_queue[ZIO_PRIORITY_REBUILD]);
 
 	/* ZIOs pending */
 	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE,
 	    vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_READ]);
 
 	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE,
 	    vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_WRITE]);
 
 	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE,
 	    vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_READ]);
 
 	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE,
 	    vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_WRITE]);
 
 	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE,
 	    vsx->vsx_pend_queue[ZIO_PRIORITY_SCRUB]);
 
 	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE,
 	    vsx->vsx_pend_queue[ZIO_PRIORITY_TRIM]);
 
 	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE,
 	    vsx->vsx_pend_queue[ZIO_PRIORITY_REBUILD]);
 
 	/*
 	 * Histograms: total and disk latency are indexed by I/O type,
 	 * queue latency by I/O priority.
 	 */
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
 	    vsx->vsx_total_histo[ZIO_TYPE_READ],
 	    ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_READ]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
 	    vsx->vsx_total_histo[ZIO_TYPE_WRITE],
 	    ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_WRITE]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,
 	    vsx->vsx_disk_histo[ZIO_TYPE_READ],
 	    ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_READ]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,
 	    vsx->vsx_disk_histo[ZIO_TYPE_WRITE],
 	    ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_WRITE]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO,
 	    vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ],
 	    ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO,
 	    vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE],
 	    ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO,
 	    vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ],
 	    ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO,
 	    vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE],
 	    ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,
 	    vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB],
 	    ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,
 	    vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM],
 	    ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO,
 	    vsx->vsx_queue_histo[ZIO_PRIORITY_REBUILD],
 	    ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_REBUILD]));
 
 	/*
 	 * Request sizes: vsx_ind_histo first, then vsx_agg_histo, both
 	 * indexed by I/O priority.
 	 */
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO,
 	    vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ],
 	    ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO,
 	    vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE],
 	    ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO,
 	    vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ],
 	    ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO,
 	    vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE],
 	    ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO,
 	    vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB],
 	    ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO,
 	    vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM],
 	    ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_REBUILD_HISTO,
 	    vsx->vsx_ind_histo[ZIO_PRIORITY_REBUILD],
 	    ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_REBUILD]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO,
 	    vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ],
 	    ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO,
 	    vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE],
 	    ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO,
 	    vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ],
 	    ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO,
 	    vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE],
 	    ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO,
 	    vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB],
 	    ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO,
 	    vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM],
 	    ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM]));
 
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_REBUILD_HISTO,
 	    vsx->vsx_agg_histo[ZIO_PRIORITY_REBUILD],
 	    ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_REBUILD]));
 
 	/* IO delays (taken from the basic stats, not the extended ones) */
 	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios);
 
 	/* Direct I/O write verify errors */
 	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_DIO_VERIFY_ERRORS,
 	    vs->vs_dio_verify_errors);
 
 	/* Add extended stats nvlist to main nvlist */
 	fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx);
 
 	/* Release the temporary nvlist and both stat buffers. */
 	fnvlist_free(nvx);
 	kmem_free(vs, sizeof (*vs));
 	kmem_free(vsx, sizeof (*vsx));
 }
 
 /*
  * Add pool-wide progress information (scan, removal, checkpoint and raidz
  * expansion stats) to 'nvl'.  Does nothing unless 'vd' is the pool's root
  * vdev.  Each stats structure is added, as a flat uint64_t array, only when
  * its getter returns 0.
  */
 static void
 root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	if (vd != spa->spa_root_vdev)
 		return;
 
 	/* provide either current or previous scan information */
 	pool_scan_stat_t ps;
 	if (spa_scan_get_stats(spa, &ps) == 0) {
 		fnvlist_add_uint64_array(nvl,
 		    ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
 		    sizeof (pool_scan_stat_t) / sizeof (uint64_t));
 	}
 
 	/* device removal progress */
 	pool_removal_stat_t prs;
 	if (spa_removal_get_stats(spa, &prs) == 0) {
 		fnvlist_add_uint64_array(nvl,
 		    ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs,
 		    sizeof (prs) / sizeof (uint64_t));
 	}
 
 	/* pool checkpoint stats */
 	pool_checkpoint_stat_t pcs;
 	if (spa_checkpoint_get_stats(spa, &pcs) == 0) {
 		fnvlist_add_uint64_array(nvl,
 		    ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs,
 		    sizeof (pcs) / sizeof (uint64_t));
 	}
 
 	/* raidz expansion progress */
 	pool_raidz_expand_stat_t pres;
 	if (spa_raidz_expand_get_stats(spa, &pres) == 0) {
 		fnvlist_add_uint64_array(nvl,
 		    ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t *)&pres,
 		    sizeof (pres) / sizeof (uint64_t));
 	}
 }
 
 /*
  * Add rebuild stats to 'nvl' under ZPOOL_CONFIG_REBUILD_STATS, as a flat
  * uint64_t array.  Only done when 'vd' is a top-level vdev and
  * vdev_rebuild_get_stats() returns 0; otherwise 'nvl' is left untouched.
  */
 static void
 top_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
 {
 	if (vd == vd->vdev_top) {
 		vdev_rebuild_stat_t vrs;
 		if (vdev_rebuild_get_stats(vd, &vrs) == 0) {
 			fnvlist_add_uint64_array(nvl,
 			    ZPOOL_CONFIG_REBUILD_STATS, (uint64_t *)&vrs,
 			    sizeof (vrs) / sizeof (uint64_t));
 		}
 	}
 }
 
 /*
  * Generate the nvlist representing this vdev's config.
  *
  * A new nvlist is allocated, populated from 'vd' and returned.  For
  * non-leaf vdevs the function recurses into every child and stores the
  * results under ZPOOL_CONFIG_CHILDREN.
  *
  *	spa		pool the vdev belongs to
  *	vd		vdev to describe
  *	getstats	when true, also add stats, progress information and
  *			the indirect mapping size
  *	flags		VDEV_CONFIG_* flags selecting which keys are emitted
  *			(SPARE, L2CACHE, MISSING and MOS are examined here)
  */
 nvlist_t *
 vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
     vdev_config_flag_t flags)
 {
 	nvlist_t *nv = NULL;
 	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 
 	nv = fnvlist_alloc();
 
 	fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type);
 	/* The vdev id is left out of spare and l2cache configs. */
 	if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid);
 
 	/* Optional identification strings; each is added only if set. */
 	if (vd->vdev_path != NULL)
 		fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path);
 
 	if (vd->vdev_devid != NULL)
 		fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid);
 
 	if (vd->vdev_physpath != NULL)
 		fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
 		    vd->vdev_physpath);
 
 	if (vd->vdev_enc_sysfs_path != NULL)
 		fnvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
 		    vd->vdev_enc_sysfs_path);
 
 	if (vd->vdev_fru != NULL)
 		fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);
 
 	/* Let the vdev type add its own keys, if it provides a hook. */
 	if (vd->vdev_ops->vdev_op_config_generate != NULL)
 		vd->vdev_ops->vdev_op_config_generate(vd, nv);
 
 	/* NOTE(review): -1ULL looks like the "not set" sentinel -- confirm. */
 	if (vd->vdev_wholedisk != -1ULL) {
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 		    vd->vdev_wholedisk);
 	}
 
 	if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING))
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
 
 	if (vd->vdev_isspare)
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
 
 	if (flags & VDEV_CONFIG_L2CACHE)
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
 
 	/* Top-level vdev properties; skipped for spare/l2cache configs. */
 	if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
 	    vd == vd->vdev_top) {
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
 		    vd->vdev_ms_array);
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
 		    vd->vdev_ms_shift);
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
 		    vd->vdev_asize);
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog);
 		if (vd->vdev_noalloc) {
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING,
 			    vd->vdev_noalloc);
 		}
 
 		/*
 		 * Slog devices are removed synchronously so don't
 		 * persist the vdev_removing flag to the label.
 		 */
 		if (vd->vdev_removing && !vd->vdev_islog) {
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
 			    vd->vdev_removing);
 		}
 
 		/* zpool command expects alloc class data */
 		if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) {
 			const char *bias = NULL;
 
 			switch (vd->vdev_alloc_bias) {
 			case VDEV_BIAS_LOG:
 				bias = VDEV_ALLOC_BIAS_LOG;
 				break;
 			case VDEV_BIAS_SPECIAL:
 				bias = VDEV_ALLOC_BIAS_SPECIAL;
 				break;
 			case VDEV_BIAS_DEDUP:
 				bias = VDEV_ALLOC_BIAS_DEDUP;
 				break;
 			default:
 				/*
 				 * VDEV_BIAS_NONE was excluded by the enclosing
 				 * condition, so reaching this case trips the
 				 * assertion for any unhandled bias value.
 				 */
 				ASSERT3U(vd->vdev_alloc_bias, ==,
 				    VDEV_BIAS_NONE);
 			}
 			fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
 			    bias);
 		}
 	}
 
 	/* Object number of the DTL space map, when one is open. */
 	if (vd->vdev_dtl_sm != NULL) {
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
 		    space_map_object(vd->vdev_dtl_sm));
 	}
 
 	/* Indirect vdev bookkeeping; each key is added only when set. */
 	if (vic->vic_mapping_object != 0) {
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
 		    vic->vic_mapping_object);
 	}
 
 	if (vic->vic_births_object != 0) {
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
 		    vic->vic_births_object);
 	}
 
 	if (vic->vic_prev_indirect_vdev != UINT64_MAX) {
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
 		    vic->vic_prev_indirect_vdev);
 	}
 
 	if (vd->vdev_crtxg)
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
 
 	if (vd->vdev_expansion_time)
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_EXPANSION_TIME,
 		    vd->vdev_expansion_time);
 
 	/* Keys that are emitted only when VDEV_CONFIG_MOS is requested. */
 	if (flags & VDEV_CONFIG_MOS) {
 		if (vd->vdev_leaf_zap != 0) {
 			ASSERT(vd->vdev_ops->vdev_op_leaf);
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP,
 			    vd->vdev_leaf_zap);
 		}
 
 		if (vd->vdev_top_zap != 0) {
 			ASSERT(vd == vd->vdev_top);
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
 			    vd->vdev_top_zap);
 		}
 
 		/* The root ZAP requires the AVZ_V2 feature to be active. */
 		if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap != 0 &&
 		    spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) {
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP,
 			    vd->vdev_root_zap);
 		}
 
 		if (vd->vdev_resilver_deferred) {
 			ASSERT(vd->vdev_ops->vdev_op_leaf);
 			ASSERT(spa->spa_resilver_deferred);
 			fnvlist_add_boolean(nv, ZPOOL_CONFIG_RESILVER_DEFER);
 		}
 	}
 
 	if (getstats) {
 		vdev_config_generate_stats(vd, nv);
 
 		/* Each helper checks whether 'vd' is a root/top-level vdev. */
 		root_vdev_actions_getprogress(vd, nv);
 		top_vdev_actions_getprogress(vd, nv);
 
 		/*
 		 * Note: this can be called from open context
 		 * (spa_get_stats()), so we need the rwlock to prevent
 		 * the mapping from being changed by condensing.
 		 */
 		rw_enter(&vd->vdev_indirect_rwlock, RW_READER);
 		if (vd->vdev_indirect_mapping != NULL) {
 			ASSERT(vd->vdev_indirect_births != NULL);
 			vdev_indirect_mapping_t *vim =
 			    vd->vdev_indirect_mapping;
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
 			    vdev_indirect_mapping_size(vim));
 		}
 		rw_exit(&vd->vdev_indirect_rwlock);
 		if (vd->vdev_mg != NULL &&
 		    vd->vdev_mg->mg_fragmentation != ZFS_FRAG_INVALID) {
 			/*
 			 * Compute approximately how much memory would be used
 			 * for the indirect mapping if this device were to
 			 * be removed.
 			 *
 			 * Note: If the frag metric is invalid, then not
 			 * enough metaslabs have been converted to have
 			 * histograms.
 			 */
 			uint64_t seg_count = 0;
 			uint64_t to_alloc = vd->vdev_stat.vs_alloc;
 
 			/*
 			 * There are the same number of allocated segments
 			 * as free segments, so we will have at least one
 			 * entry per free segment.  However, small free
 			 * segments (smaller than vdev_removal_max_span)
 			 * will be combined with adjacent allocated segments
 			 * as a single mapping.
 			 */
-			for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+			for (int i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE;
+			    i++) {
 				if (i + 1 < highbit64(vdev_removal_max_span)
 				    - 1) {
 					to_alloc +=
 					    vd->vdev_mg->mg_histogram[i] <<
 					    (i + 1);
 				} else {
 					seg_count +=
 					    vd->vdev_mg->mg_histogram[i];
 				}
 			}
 
 			/*
 			 * The maximum length of a mapping is
 			 * zfs_remove_max_segment, so we need at least one entry
 			 * per zfs_remove_max_segment of allocated data.
 			 */
 			seg_count += to_alloc / spa_remove_max_segment(spa);
 
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
 			    seg_count *
 			    sizeof (vdev_indirect_mapping_entry_phys_t));
 		}
 	}
 
 	if (!vd->vdev_ops->vdev_op_leaf) {
 		nvlist_t **child;
 		uint64_t c;
 
 		ASSERT(!vd->vdev_ishole);
 
 		/* Temporary array holding one generated config per child. */
 		child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
 		    KM_SLEEP);
 
 		for (c = 0; c < vd->vdev_children; c++) {
 			child[c] = vdev_config_generate(spa, vd->vdev_child[c],
 			    getstats, flags);
 		}
 
 		fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 		    (const nvlist_t * const *)child, vd->vdev_children);
 
 		/* The per-child nvlists are released once they were added. */
 		for (c = 0; c < vd->vdev_children; c++)
 			nvlist_free(child[c]);
 
 		kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
 
 	} else {
 		/* Leaf vdev: record per-device state flags. */
 		const char *aux = NULL;
 
 		if (vd->vdev_offline && !vd->vdev_tmpoffline)
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE);
 		if (vd->vdev_resilver_txg != 0)
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
 			    vd->vdev_resilver_txg);
 		if (vd->vdev_rebuild_txg != 0)
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
 			    vd->vdev_rebuild_txg);
 		if (vd->vdev_faulted)
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE);
 		if (vd->vdev_degraded)
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE);
 		if (vd->vdev_removed)
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE);
 		if (vd->vdev_unspare)
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE);
 		if (vd->vdev_ishole)
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE);
 
 		/* Set the reason why we're FAULTED/DEGRADED. */
 		switch (vd->vdev_stat.vs_aux) {
 		case VDEV_AUX_ERR_EXCEEDED:
 			aux = "err_exceeded";
 			break;
 
 		case VDEV_AUX_EXTERNAL:
 			aux = "external";
 			break;
 		}
 
 		if (aux != NULL && !vd->vdev_tmpoffline) {
 			fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux);
 		} else {
 			/*
 			 * We're healthy - clear any previous AUX_STATE values.
 			 */
 			if (nvlist_exists(nv, ZPOOL_CONFIG_AUX_STATE))
 				nvlist_remove_all(nv, ZPOOL_CONFIG_AUX_STATE);
 		}
 
 		if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) {
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
 			    vd->vdev_orig_guid);
 		}
 	}
 
 	return (nv);
 }
 
 /*
  * Generate a view of the top-level vdevs.  If we currently have holes
  * in the namespace, then generate an array which contains a list of holey
  * vdevs.  Additionally, add the number of top-level children that currently
  * exist.
  *
  * Both keys are added to 'config'; ZPOOL_CONFIG_HOLE_ARRAY is added only
  * when at least one hole was found.
  */
 void
 vdev_top_config_generate(spa_t *spa, nvlist_t *config)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t *array;
 	uint_t c, idx;
 
 	/* Sized for the worst case in which every child is a hole. */
 	array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);
 
 	/* Collect the child indices of all hole vdevs. */
 	for (c = 0, idx = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 
 		if (tvd->vdev_ishole) {
 			array[idx++] = c;
 		}
 	}
 
 	if (idx) {
 		VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
 		    array, idx) == 0);
 	}
 
 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
 	    rvd->vdev_children) == 0);
 
 	kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
 }
 
 /*
  * Returns the configuration from the label of the given vdev. For vdevs
  * which don't have a txg value stored on their label (i.e. spares/cache)
  * or have not been completely initialized (txg = 0) just return
  * the configuration from the first valid label we find. Otherwise,
  * find the most up-to-date label that does not exceed the specified
  * 'txg' value.
  *
  * Returns NULL if the vdev is not readable or no acceptable label was
  * found.  A non-NULL result is an nvlist that the caller must free.
  */
 nvlist_t *
 vdev_label_read_config(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	nvlist_t *config = NULL;
 	vdev_phys_t *vp[VDEV_LABELS];
 	abd_t *vp_abd[VDEV_LABELS];
 	zio_t *zio[VDEV_LABELS];
 	uint64_t best_txg = 0;
 	uint64_t label_txg = 0;
 	int error = 0;
 	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_SPECULATIVE;
 
 	ASSERT(vd->vdev_validate_thread == curthread ||
 	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	if (!vdev_readable(vd))
 		return (NULL);
 
 	/*
 	 * The label for a dRAID distributed spare is not stored on disk.
 	 * Instead it is generated when needed which allows us to bypass
 	 * the pipeline when reading the config from the label.
 	 */
 	if (vd->vdev_ops == &vdev_draid_spare_ops)
 		return (vdev_draid_read_config_spare(vd));
 
 	/* One linear buffer per label; reused if the reads are retried. */
 	for (int l = 0; l < VDEV_LABELS; l++) {
 		vp_abd[l] = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
 		vp[l] = abd_to_buf(vp_abd[l]);
 	}
 
 retry:
 	/* Start the reads of all labels before waiting on any of them. */
 	for (int l = 0; l < VDEV_LABELS; l++) {
 		zio[l] = zio_root(spa, NULL, NULL, flags);
 
 		vdev_label_read(zio[l], vd, l, vp_abd[l],
 		    offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t),
 		    NULL, NULL, flags);
 	}
 	for (int l = 0; l < VDEV_LABELS; l++) {
 		nvlist_t *label = NULL;
 
 		if (zio_wait(zio[l]) == 0 &&
 		    nvlist_unpack(vp[l]->vp_nvlist, sizeof (vp[l]->vp_nvlist),
 		    &label, 0) == 0) {
 			/*
 			 * Auxiliary vdevs won't have txg values in their
 			 * labels and newly added vdevs may not have been
 			 * completely initialized so just return the
 			 * configuration from the first valid label we
 			 * encounter.
 			 */
 			error = nvlist_lookup_uint64(label,
 			    ZPOOL_CONFIG_POOL_TXG, &label_txg);
 			if ((error || label_txg == 0) && !config) {
 				/*
 				 * 'label' itself becomes the result; the
 				 * break below skips the nvlist_free(label).
 				 * The remaining zios are still waited on
 				 * before leaving the loop.
 				 */
 				config = label;
 				for (l++; l < VDEV_LABELS; l++)
 					zio_wait(zio[l]);
 				break;
 			} else if (label_txg <= txg && label_txg > best_txg) {
 				/*
 				 * Newest acceptable label so far: keep a
 				 * private copy, since 'label' is freed below.
 				 */
 				best_txg = label_txg;
 				nvlist_free(config);
 				config = fnvlist_dup(label);
 			}
 		}
 
 		if (label != NULL) {
 			nvlist_free(label);
 			label = NULL;
 		}
 	}
 
 	/* Nothing usable: retry the reads once with ZIO_FLAG_TRYHARD. */
 	if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) {
 		flags |= ZIO_FLAG_TRYHARD;
 		goto retry;
 	}
 
 	/*
 	 * We found a valid label but it didn't pass txg restrictions.
 	 */
 	if (config == NULL && label_txg != 0) {
 		vdev_dbgmsg(vd, "label discarded as txg is too large "
 		    "(%llu > %llu)", (u_longlong_t)label_txg,
 		    (u_longlong_t)txg);
 	}
 
 	for (int l = 0; l < VDEV_LABELS; l++) {
 		abd_free(vp_abd[l]);
 	}
 
 	return (config);
 }
 
 /*
  * Determine if a device is in use.  The 'spare_guid' parameter will be filled
  * in with the device guid if this spare is active elsewhere on the system.
  */
 static boolean_t
 vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
     uint64_t *spare_guid, uint64_t *l2cache_guid)
 {
 	spa_t *spa = vd->vdev_spa;
 	uint64_t state, pool_guid, device_guid, txg, spare_pool;
 	uint64_t vdtxg = 0;
 	nvlist_t *label;
 
 	if (spare_guid)
 		*spare_guid = 0ULL;
 	if (l2cache_guid)
 		*l2cache_guid = 0ULL;
 
 	/*
 	 * Read the label, if any, and perform some basic sanity checks.
 	 */
 	if ((label = vdev_label_read_config(vd, -1ULL)) == NULL)
 		return (B_FALSE);
 
 	(void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
 	    &vdtxg);
 
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
 	    &state) != 0 ||
 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
 	    &device_guid) != 0) {
 		nvlist_free(label);
 		return (B_FALSE);
 	}
 
 	if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
 	    (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
 	    &pool_guid) != 0 ||
 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
 	    &txg) != 0)) {
 		nvlist_free(label);
 		return (B_FALSE);
 	}
 
 	nvlist_free(label);
 
 	/*
 	 * Check to see if this device indeed belongs to the pool it claims to
 	 * be a part of.  The only way this is allowed is if the device is a hot
 	 * spare (which we check for later on).
 	 */
 	if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
 	    !spa_guid_exists(pool_guid, device_guid) &&
 	    !spa_spare_exists(device_guid, NULL, NULL) &&
 	    !spa_l2cache_exists(device_guid, NULL))
 		return (B_FALSE);
 
 	/*
 	 * If the transaction group is zero, then this an initialized (but
 	 * unused) label.  This is only an error if the create transaction
 	 * on-disk is the same as the one we're using now, in which case the
 	 * user has attempted to add the same vdev multiple times in the same
 	 * transaction.
 	 */
 	if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
 	    txg == 0 && vdtxg == crtxg)
 		return (B_TRUE);
 
 	/*
 	 * Check to see if this is a spare device.  We do an explicit check for
 	 * spa_has_spare() here because it may be on our pending list of spares
 	 * to add.
 	 */
 	if (spa_spare_exists(device_guid, &spare_pool, NULL) ||
 	    spa_has_spare(spa, device_guid)) {
 		if (spare_guid)
 			*spare_guid = device_guid;
 
 		switch (reason) {
 		case VDEV_LABEL_CREATE:
 			return (B_TRUE);
 
 		case VDEV_LABEL_REPLACE:
 			return (!spa_has_spare(spa, device_guid) ||
 			    spare_pool != 0ULL);
 
 		case VDEV_LABEL_SPARE:
 			return (spa_has_spare(spa, device_guid));
 		default:
 			break;
 		}
 	}
 
 	/*
 	 * Check to see if this is an l2cache device.
 	 */
 	if (spa_l2cache_exists(device_guid, NULL) ||
 	    spa_has_l2cache(spa, device_guid)) {
 		if (l2cache_guid)
 			*l2cache_guid = device_guid;
 
 		switch (reason) {
 		case VDEV_LABEL_CREATE:
 			return (B_TRUE);
 
 		case VDEV_LABEL_REPLACE:
 			return (!spa_has_l2cache(spa, device_guid));
 
 		case VDEV_LABEL_L2CACHE:
 			return (spa_has_l2cache(spa, device_guid));
 		default:
 			break;
 		}
 	}
 
 	/*
 	 * We can't rely on a pool's state if it's been imported
 	 * read-only.  Instead we look to see if the pool is marked
 	 * read-only in the namespace and set the state to active.
 	 */
 	if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
 	    (spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
 	    spa_mode(spa) == SPA_MODE_READ)
 		state = POOL_STATE_ACTIVE;
 
 	/*
 	 * If the device is marked ACTIVE, then this device is in use by another
 	 * pool on the system.
 	 */
 	return (state == POOL_STATE_ACTIVE);
 }
 
 /*
  * Build the reduced label used for inactive hot spares and level 2 ARC
  * devices.  Such a device is shared between pools, so its label carries
  * only the pool version, an aux pool state (SPARE or L2CACHE, chosen by
  * reason_spare) and the vdev guid, plus the optional hints added below.
  * The label is written when an aux vdev is added or removed (on removal
  * the labels are reverted to this form).
  *
  * Returns a newly allocated nvlist; the callers in this file release it
  * with nvlist_free().
  */
 static nvlist_t *
 vdev_aux_label_generate(vdev_t *vd, boolean_t reason_spare)
 {
 	uint64_t aux_state;
 	nvlist_t *nvl;
 
 	if (reason_spare)
 		aux_state = POOL_STATE_SPARE;
 	else
 		aux_state = POOL_STATE_L2CACHE;
 
 	nvl = fnvlist_alloc();
 	fnvlist_add_uint64(nvl, ZPOOL_CONFIG_VERSION,
 	    spa_version(vd->vdev_spa));
 	fnvlist_add_uint64(nvl, ZPOOL_CONFIG_POOL_STATE, aux_state);
 	fnvlist_add_uint64(nvl, ZPOOL_CONFIG_GUID, vd->vdev_guid);
 
 	/*
 	 * Cache devices also record their ashift, purely so that zdb can
 	 * report it.  vdev_alloc() does not read it from here; it takes
 	 * the ashift from spa->spa_l2cache->sav_config, which is
 	 * populated in spa_ld_open_aux_vdevs().
 	 */
 	if (!reason_spare)
 		fnvlist_add_uint64(nvl, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
 
 	/*
 	 * Whatever location hints are known are stored as well, to help
 	 * find the device during pool import.
 	 */
 	if (vd->vdev_path != NULL)
 		fnvlist_add_string(nvl, ZPOOL_CONFIG_PATH, vd->vdev_path);
 	if (vd->vdev_devid != NULL)
 		fnvlist_add_string(nvl, ZPOOL_CONFIG_DEVID, vd->vdev_devid);
 	if (vd->vdev_physpath != NULL) {
 		fnvlist_add_string(nvl, ZPOOL_CONFIG_PHYS_PATH,
 		    vd->vdev_physpath);
 	}
 
 	return (nvl);
 }
 
 /*
  * Initialize a vdev label.  We check to make sure each leaf device is not in
  * use, and writable.  We put down an initial label which we will later
  * overwrite with a complete label.  Note that it's important to do this
  * sequentially, not in parallel, so that we catch cases of multiple use of the
  * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with
  * itself.
  *
  * vd is the root of the vdev (sub)tree to label.  crtxg is stored in
  * vdev_crtxg of each vdev visited and, for non-aux labels, written as
  * ZPOOL_CONFIG_CREATE_TXG.  reason selects the kind of label that is written
  * and which in-use checks apply.
  *
  * Returns 0 on success (this includes interior vdevs and pools that are not
  * writeable, for which nothing is written), the first non-zero error from a
  * child, EIO for a dead leaf, EBUSY when vdev_inuse() reports the leaf busy,
  * ENAMETOOLONG or EINVAL when the label nvlist cannot be packed, or the
  * error from the label write itself.
  */
 int
 vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
 {
 	spa_t *spa = vd->vdev_spa;
 	nvlist_t *label;
 	vdev_phys_t *vp;
 	abd_t *vp_abd;
 	abd_t *bootenv;
 	uberblock_t *ub;
 	abd_t *ub_abd;
 	zio_t *zio;
 	char *buf;
 	size_t buflen;
 	int error;
 	uint64_t spare_guid = 0, l2cache_guid = 0;
 	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
 	/*
 	 * A REMOVE request is treated as a spare or l2cache request when the
 	 * vdev is currently flagged as one, so that the aux label is the one
 	 * written back.
 	 */
 	boolean_t reason_spare = (reason == VDEV_LABEL_SPARE || (reason ==
 	    VDEV_LABEL_REMOVE && vd->vdev_isspare));
 	boolean_t reason_l2cache = (reason == VDEV_LABEL_L2CACHE || (reason ==
 	    VDEV_LABEL_REMOVE && vd->vdev_isl2cache));
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/* Label the children first; the first failure aborts the walk. */
 	for (int c = 0; c < vd->vdev_children; c++)
 		if ((error = vdev_label_init(vd->vdev_child[c],
 		    crtxg, reason)) != 0)
 			return (error);
 
 	/* Track the creation time for this vdev */
 	vd->vdev_crtxg = crtxg;
 
 	/*
 	 * Only leaves are labeled here, and nothing is written when the pool
 	 * is not writeable.
 	 */
 	if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa))
 		return (0);
 
 	/*
 	 * Dead vdevs cannot be initialized.
 	 */
 	if (vdev_is_dead(vd))
 		return (SET_ERROR(EIO));
 
 	/*
 	 * Determine if the vdev is in use.
 	 */
 	if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT &&
 	    vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
 		return (SET_ERROR(EBUSY));
 
 	/*
 	 * If this is a request to add or replace a spare or l2cache device
 	 * that is in use elsewhere on the system, then we must update the
 	 * guid (which was initialized to a random value) to reflect the
 	 * actual GUID (which is shared between multiple pools).
 	 */
 	if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE &&
 	    spare_guid != 0ULL) {
 		uint64_t guid_delta = spare_guid - vd->vdev_guid;
 
 		vd->vdev_guid += guid_delta;
 
 		/* Keep the guid sums of vd and all its ancestors in step. */
 		for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
 			pvd->vdev_guid_sum += guid_delta;
 
 		/*
 		 * If this is a replacement, then we want to fallthrough to the
 		 * rest of the code.  If we're adding a spare, then it's already
 		 * labeled appropriately and we can just return.
 		 */
 		if (reason == VDEV_LABEL_SPARE)
 			return (0);
 		ASSERT(reason == VDEV_LABEL_REPLACE ||
 		    reason == VDEV_LABEL_SPLIT);
 	}
 
 	if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE &&
 	    l2cache_guid != 0ULL) {
 		uint64_t guid_delta = l2cache_guid - vd->vdev_guid;
 
 		vd->vdev_guid += guid_delta;
 
 		/* Keep the guid sums of vd and all its ancestors in step. */
 		for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
 			pvd->vdev_guid_sum += guid_delta;
 
 		/*
 		 * If this is a replacement, then we want to fallthrough to the
 		 * rest of the code.  If we're adding an l2cache, then it's
 		 * already labeled appropriately and we can just return.
 		 */
 		if (reason == VDEV_LABEL_L2CACHE)
 			return (0);
 		ASSERT(reason == VDEV_LABEL_REPLACE);
 	}
 
 	/*
 	 * Initialize its label.
 	 */
 	vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
 	abd_zero(vp_abd, sizeof (vdev_phys_t));
 	vp = abd_to_buf(vp_abd);
 
 	/*
 	 * Generate a label describing the pool and our top-level vdev.
 	 * We mark it as being from txg 0 to indicate that it's not
 	 * really part of an active pool just yet.  The labels will
 	 * be written again with a meaningful txg by spa_sync().
 	 */
 	if (reason_spare || reason_l2cache) {
 		label = vdev_aux_label_generate(vd, reason_spare);
 
 		/*
 		 * When spare or l2cache (aux) vdev is added during pool
 		 * creation, spa->spa_uberblock is not written until this
 		 * point. Write it on next config sync.
 		 */
 		if (uberblock_verify(&spa->spa_uberblock))
 			spa->spa_aux_sync_uber = B_TRUE;
 	} else {
 		uint64_t txg = 0ULL;
 
 		/* A split labels the vdev with the current uberblock's txg. */
 		if (reason == VDEV_LABEL_SPLIT)
 			txg = spa->spa_uberblock.ub_txg;
 		label = spa_config_generate(spa, vd, txg, B_FALSE);
 
 		/*
 		 * Add our creation time.  This allows us to detect multiple
 		 * vdev uses as described above, and automatically expires if we
 		 * fail.
 		 */
 		VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
 		    crtxg) == 0);
 	}
 
 	/* Pack the nvlist directly into the vdev_phys_t's vp_nvlist area. */
 	buf = vp->vp_nvlist;
 	buflen = sizeof (vp->vp_nvlist);
 
 	error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
 	if (error != 0) {
 		nvlist_free(label);
 		abd_free(vp_abd);
 		/* EFAULT means nvlist_pack ran out of room */
 		return (SET_ERROR(error == EFAULT ? ENAMETOOLONG : EINVAL));
 	}
 
 	/*
 	 * Initialize uberblock template.
 	 */
 	ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE);
 	abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t));
 	abd_zero_off(ub_abd, sizeof (uberblock_t),
 	    VDEV_UBERBLOCK_RING - sizeof (uberblock_t));
 	ub = abd_to_buf(ub_abd);
 	/* The template copy is stamped with txg 0, like the label. */
 	ub->ub_txg = 0;
 
 	/* Initialize the 2nd padding area. */
 	bootenv = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
 	abd_zero(bootenv, VDEV_PAD_SIZE);
 
 	/*
 	 * Write everything in parallel.
 	 */
 retry:
 	zio = zio_root(spa, NULL, NULL, flags);
 
 	for (int l = 0; l < VDEV_LABELS; l++) {
 
 		vdev_label_write(zio, vd, l, vp_abd,
 		    offsetof(vdev_label_t, vl_vdev_phys),
 		    sizeof (vdev_phys_t), NULL, NULL, flags);
 
 		/*
 		 * Skip the 1st padding area.
 		 * Zero out the 2nd padding area where it might have
 		 * left over data from previous filesystem format.
 		 */
 		vdev_label_write(zio, vd, l, bootenv,
 		    offsetof(vdev_label_t, vl_be),
 		    VDEV_PAD_SIZE, NULL, NULL, flags);
 
 		vdev_label_write(zio, vd, l, ub_abd,
 		    offsetof(vdev_label_t, vl_uberblock),
 		    VDEV_UBERBLOCK_RING, NULL, NULL, flags);
 	}
 
 	error = zio_wait(zio);
 
 	/*
 	 * The first attempt is issued without ZIO_FLAG_TRYHARD; a failure is
 	 * retried exactly once with the flag set.
 	 */
 	if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
 		flags |= ZIO_FLAG_TRYHARD;
 		goto retry;
 	}
 
 	nvlist_free(label);
 	abd_free(bootenv);
 	abd_free(ub_abd);
 	abd_free(vp_abd);
 
 	/*
 	 * If this vdev hasn't been previously identified as a spare, then we
 	 * mark it as such only if a) we are labeling it as a spare, or b) it
 	 * exists as a spare elsewhere in the system.  Do the same for
 	 * level 2 ARC devices.
 	 */
 	if (error == 0 && !vd->vdev_isspare &&
 	    (reason == VDEV_LABEL_SPARE ||
 	    spa_spare_exists(vd->vdev_guid, NULL, NULL)))
 		spa_spare_add(vd);
 
 	if (error == 0 && !vd->vdev_isl2cache &&
 	    (reason == VDEV_LABEL_L2CACHE ||
 	    spa_l2cache_exists(vd->vdev_guid, NULL)))
 		spa_l2cache_add(vd);
 
 	return (error);
 }
 
 /*
  * Completion callback for the reads queued by
  * vdev_label_read_bootenv_impl().  The root zio's private pointer refers
  * to an abd_t * slot.  The first read to complete without error parks its
  * buffer in that slot (vdev_label_read_bootenv() frees it later); every
  * other read, successful or not, releases its own buffer here.
  */
 static void
 vdev_label_read_bootenv_done(zio_t *zio)
 {
 	zio_t *root = zio->io_private;
 	abd_t **slot = root->io_private;
 	boolean_t claimed = B_FALSE;
 
 	ASSERT3U(zio->io_size, ==, VDEV_PAD_SIZE);
 
 	if (zio->io_error == 0) {
 		/* The slot is shared by all of the sibling reads. */
 		mutex_enter(&root->io_lock);
 		if (*slot == NULL) {
 			*slot = zio->io_abd;
 			claimed = B_TRUE;
 		}
 		mutex_exit(&root->io_lock);
 	}
 
 	if (!claimed)
 		abd_free(zio->io_abd);
 }
 
 /*
  * Walk the tree rooted at vd and, for every readable leaf, queue a read of
  * the boot environment area (vl_be) of each of its labels as a child of
  * zio.  Each read gets its own freshly allocated buffer, which is disposed
  * of by vdev_label_read_bootenv_done().
  */
 static void
 vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags)
 {
 	for (int i = 0; i < vd->vdev_children; i++)
 		vdev_label_read_bootenv_impl(zio, vd->vdev_child[i], flags);
 
 	if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd))
 		return;
 
 	/*
 	 * All labels are read and the first one with a correct checksum
 	 * is the one that gets used.  That is sufficient: the bootloader
 	 * should have rewritten them all to be the same on boot, and any
 	 * change made since boot has been applied to every label alike.
 	 */
 	for (int label = 0; label < VDEV_LABELS; label++) {
 		abd_t *pad = abd_alloc_linear(VDEV_PAD_SIZE, B_FALSE);
 
 		vdev_label_read(zio, vd, label, pad,
 		    offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE,
 		    vdev_label_read_bootenv_done, zio, flags);
 	}
 }
 
 /*
  * Read the boot environment block from the labels of the leaves under rvd
  * and translate it into the caller-supplied nvlist bootenv.
  *
  * The block starts with a big-endian version word:
  *   VB_RAW     the payload is text; it is returned under GRUB_ENVMAP
  *              together with BOOTENV_VERSION = VB_RAW.
  *   VB_NVLIST  the payload is a packed nvlist which is merged into bootenv.
  *   otherwise  (or when the nvlist cannot be unpacked) the raw block is
  *              checked for a legacy FreeBSD bootonce string, returned under
  *              FREEBSD_BOOTONCE; an empty block yields
  *              BOOTENV_VERSION = VB_NVLIST.
  *
  * The caller must hold SCL_ALL as writer.
  *
  * Returns 0 if any label copy could be read, otherwise the error from
  * waiting on the read zio tree.
  */
 int
 vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *bootenv)
 {
 	nvlist_t *config;
 	spa_t *spa = rvd->vdev_spa;
 	abd_t *abd = NULL;
 	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
 
 	ASSERT(bootenv);
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/* The done callbacks store the first good buffer in abd. */
 	zio_t *zio = zio_root(spa, NULL, &abd, flags);
 	vdev_label_read_bootenv_impl(zio, rvd, flags);
 	int err = zio_wait(zio);
 
 	if (abd != NULL) {
 		char *buf;
 		vdev_boot_envblock_t *vbe = abd_to_buf(abd);
 		/*
 		 * Decode the version into a local instead of byte swapping
 		 * it in place.  The legacy path below interprets the block
 		 * from its very first byte, so the buffer has to keep the
 		 * on-disk byte order; an in-place swap reverses the first
 		 * eight bytes on little-endian hosts.
 		 */
 		uint64_t version = ntohll(vbe->vbe_version);
 
 		switch (version) {
 		case VB_RAW:
 			/*
 			 * if we have textual data in vbe_bootenv, create nvlist
 			 * with key "envmap".
 			 */
 			fnvlist_add_uint64(bootenv, BOOTENV_VERSION, VB_RAW);
 			vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0';
 			fnvlist_add_string(bootenv, GRUB_ENVMAP,
 			    vbe->vbe_bootenv);
 			break;
 
 		case VB_NVLIST:
 			err = nvlist_unpack(vbe->vbe_bootenv,
 			    sizeof (vbe->vbe_bootenv), &config, 0);
 			if (err == 0) {
 				fnvlist_merge(bootenv, config);
 				nvlist_free(config);
 				break;
 			}
 			zfs_fallthrough;
 		default:
 			/* Check for FreeBSD zfs bootonce command string */
 			buf = abd_to_buf(abd);
 			if (*buf == '\0') {
 				fnvlist_add_uint64(bootenv, BOOTENV_VERSION,
 				    VB_NVLIST);
 				break;
 			}
 			/*
 			 * Nothing guarantees that the on-disk data contains
 			 * a NUL, so bound the string the same way the VB_RAW
 			 * case does.  NOTE(review): this relies on
 			 * vbe_bootenv directly following vbe_version in
 			 * vdev_boot_envblock_t, so that a string starting at
 			 * buf ends no later than the end of vbe_bootenv.
 			 */
 			vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0';
 			fnvlist_add_string(bootenv, FREEBSD_BOOTONCE, buf);
 		}
 
 		/*
 		 * abd was allocated in vdev_label_read_bootenv_impl()
 		 */
 		abd_free(abd);
 		/*
 		 * If we managed to read any successfully,
 		 * return success.
 		 */
 		return (0);
 	}
 	return (err);
 }
 
 /*
  * Write the boot environment described by env to the boot environment
  * area (vl_be) of every label of every live, writeable leaf under vd.
  *
  * env must contain BOOTENV_VERSION.  For VB_RAW the GRUB_ENVMAP string (if
  * present) is copied into the block; for VB_NVLIST the whole nvlist is
  * packed into it.  The version is stored big-endian.
  *
  * The caller must hold SCL_ALL as writer.
  *
  * Returns:
  *   - for an interior vdev, 0 if at least one child returned 0, else ENXIO;
  *   - for a leaf that is dead or not writeable, ENXIO;
  *   - E2BIG if the encoded nvlist does not fit in the block;
  *   - EINVAL for an unknown BOOTENV_VERSION;
  *   - the error from nvlist_size()/nvlist_pack(), or from the label write,
  *     which is retried once with ZIO_FLAG_TRYHARD.
  */
 int
 vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env)
 {
 	zio_t *zio;
 	spa_t *spa = vd->vdev_spa;
 	vdev_boot_envblock_t *bootenv;
 	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
 	int error;
 	size_t nvsize;
 	char *nvbuf;
 	const char *tmp;
 
 	error = nvlist_size(env, &nvsize, NV_ENCODE_XDR);
 	if (error != 0)
 		return (SET_ERROR(error));
 
 	if (nvsize >= sizeof (bootenv->vbe_bootenv)) {
 		return (SET_ERROR(E2BIG));
 	}
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	error = ENXIO;
 	for (int c = 0; c < vd->vdev_children; c++) {
 		int child_err;
 
 		child_err = vdev_label_write_bootenv(vd->vdev_child[c], env);
 		/*
 		 * As long as any of the disks managed to write all of their
 		 * labels successfully, return success.
 		 */
 		if (child_err == 0)
 			error = child_err;
 	}
 
 	if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd) ||
 	    !vdev_writeable(vd)) {
 		return (error);
 	}
 	ASSERT3U(sizeof (*bootenv), ==, VDEV_PAD_SIZE);
 	abd_t *abd = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
 	abd_zero(abd, VDEV_PAD_SIZE);
 
 	bootenv = abd_borrow_buf_copy(abd, VDEV_PAD_SIZE);
 	nvbuf = bootenv->vbe_bootenv;
 	nvsize = sizeof (bootenv->vbe_bootenv);
 
 	bootenv->vbe_version = fnvlist_lookup_uint64(env, BOOTENV_VERSION);
 	switch (bootenv->vbe_version) {
 	case VB_RAW:
 		if (nvlist_lookup_string(env, GRUB_ENVMAP, &tmp) == 0) {
 			(void) strlcpy(bootenv->vbe_bootenv, tmp, nvsize);
 		}
 		error = 0;
 		break;
 
 	case VB_NVLIST:
 		error = nvlist_pack(env, &nvbuf, &nvsize, NV_ENCODE_XDR,
 		    KM_SLEEP);
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	if (error == 0)
 		bootenv->vbe_version = htonll(bootenv->vbe_version);
 
 	/*
 	 * Hand the borrowed buffer back on every path.  Freeing the abd
 	 * while the buffer is still borrowed would leak the copy made for
 	 * a non-linear abd and leave the borrow outstanding.
 	 */
 	abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE);
 
 	if (error != 0) {
 		abd_free(abd);
 		return (SET_ERROR(error));
 	}
 
 retry:
 	zio = zio_root(spa, NULL, NULL, flags);
 	for (int l = 0; l < VDEV_LABELS; l++) {
 		vdev_label_write(zio, vd, l, abd,
 		    offsetof(vdev_label_t, vl_be),
 		    VDEV_PAD_SIZE, NULL, NULL, flags);
 	}
 
 	error = zio_wait(zio);
 	/* A failed first attempt is repeated once with ZIO_FLAG_TRYHARD. */
 	if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
 		flags |= ZIO_FLAG_TRYHARD;
 		goto retry;
 	}
 
 	abd_free(abd);
 	return (error);
 }
 
 /*
  * ==========================================================================
  * uberblock load/sync
  * ==========================================================================
  */
 
 /*
  * Consider the following situation: txg is safely synced to disk.  We've
  * written the first uberblock for txg + 1, and then we lose power.  When we
  * come back up, we fail to see the uberblock for txg + 1 because, say,
  * it was on a mirrored device and the replica to which we wrote txg + 1
  * is now offline.  If we then make some changes and sync txg + 1, and then
  * the missing replica comes back, then for a few seconds we'll have two
  * conflicting uberblocks on disk with the same txg.  The solution is simple:
  * among uberblocks with equal txg, choose the one with the latest timestamp.
  */
 static int
 vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
 {
 	unsigned int mmp_seq1, mmp_seq2;
 	int order;
 
 	/* Primary key: the transaction group. */
 	order = TREE_CMP(ub1->ub_txg, ub2->ub_txg);
 	if (likely(order != 0))
 		return (order);
 
 	/* Secondary key: the timestamp. */
 	order = TREE_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
 	if (likely(order != 0))
 		return (order);
 
 	/*
 	 * Final tie-break: the MMP sequence number.
 	 *
 	 * MMP_VALID(ub) && MMP_SEQ_VALID(ub) means the writing host runs an
 	 * MMP-aware ZFS, e.g. OpenZFS >= 0.7.  When only one of the two
 	 * uberblocks carries MMP data they were written by different
 	 * hosts, which matters for MMP, so a missing MMP/SEQ counts as 0.
 	 *
 	 * With txg and timestamp equal, either uberblock is acceptable
 	 * for importing the pool.
 	 */
 	mmp_seq1 = (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1)) ? MMP_SEQ(ub1) : 0;
 	mmp_seq2 = (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2)) ? MMP_SEQ(ub2) : 0;
 
 	return (TREE_CMP(mmp_seq1, mmp_seq2));
 }
 
 /*
  * State shared by the uberblock read callbacks of one vdev_uberblock_load()
  * call.  It is reached through the root zio's io_private and updated by
  * vdev_uberblock_load_done() under that zio's io_lock.
  */
 struct ubl_cbdata {
 	uberblock_t	ubl_latest;	/* Most recent uberblock */
 	uberblock_t	*ubl_ubbest;	/* Best uberblock (w/r/t max_txg) */
 	vdev_t		*ubl_vd;	/* vdev associated with the above */
 };
 
 /*
  * Completion callback for one uberblock read queued by
  * vdev_uberblock_load_impl().  An uberblock that was read without error
  * and passes uberblock_verify() is compared, under the root zio's lock,
  * against the candidates collected so far in the shared ubl_cbdata.  The
  * read buffer is always released.
  */
 static void
 vdev_uberblock_load_done(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	spa_t *spa = zio->io_spa;
 	zio_t *root = zio->io_private;
 	uberblock_t *cand = abd_to_buf(zio->io_abd);
 	struct ubl_cbdata *state = root->io_private;
 
 	ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd));
 
 	if (zio->io_error != 0 || uberblock_verify(cand) != 0) {
 		abd_free(zio->io_abd);
 		return;
 	}
 
 	mutex_enter(&root->io_lock);
 
 	/* Newest uberblock seen, regardless of spa_load_max_txg. */
 	if (vdev_uberblock_compare(cand, &state->ubl_latest) > 0)
 		state->ubl_latest = *cand;
 
 	/*
 	 * Newest uberblock not beyond spa_load_max_txg.  The vdev it came
 	 * from is remembered as well, because the config nvlist that goes
 	 * with this uberblock is later read from that vdev.
 	 */
 	if (cand->ub_txg <= spa->spa_load_max_txg &&
 	    vdev_uberblock_compare(cand, state->ubl_ubbest) > 0) {
 		*state->ubl_ubbest = *cand;
 		state->ubl_vd = vd;
 	}
 
 	mutex_exit(&root->io_lock);
 
 	abd_free(zio->io_abd);
 }
 
 /*
  * Walk the tree rooted at vd and queue, as children of zio, a read of every
  * uberblock slot in every label of each readable leaf.  dRAID distributed
  * spares are skipped.  Results are collected by
  * vdev_uberblock_load_done(); cbp is only passed along the recursion.
  */
 static void
 vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
     struct ubl_cbdata *cbp)
 {
 	for (int i = 0; i < vd->vdev_children; i++)
 		vdev_uberblock_load_impl(zio, vd->vdev_child[i], flags, cbp);
 
 	if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd) ||
 	    vd->vdev_ops == &vdev_draid_spare_ops)
 		return;
 
 	for (int label = 0; label < VDEV_LABELS; label++) {
 		for (int slot = 0; slot < VDEV_UBERBLOCK_COUNT(vd); slot++) {
 			abd_t *ub_abd =
 			    abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
 
 			vdev_label_read(zio, vd, label, ub_abd,
 			    VDEV_UBERBLOCK_OFFSET(vd, slot),
 			    VDEV_UBERBLOCK_SIZE(vd),
 			    vdev_uberblock_load_done, zio, flags);
 		}
 	}
 }
 
 /*
  * Find the 'best' uberblock on disk and the configuration that belongs to
  * it.  The uberblock rings of all labels of all leaves under rvd are read,
  * and the newest uberblock whose txg does not exceed spa_load_max_txg is
  * stored in *ub.  The config is then read from the labels of the vdev that
  * supplied that uberblock and returned in *config.
  *
  * *ub is left zeroed when no usable uberblock is found, or when the best
  * uberblock's raidz_reflow_info differs from that of the latest uberblock
  * seen.  *config is NULL whenever no config could be read.
  */
 void
 vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
 {
 	spa_t *spa = rvd->vdev_spa;
 	struct ubl_cbdata cb;
 	vdev_t *best_vd;
 	zio_t *rio;
 	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
 
 	ASSERT(ub);
 	ASSERT(config);
 
 	memset(ub, 0, sizeof (uberblock_t));
 	memset(&cb, 0, sizeof (cb));
 	*config = NULL;
 
 	/* The callbacks fill in the caller's uberblock directly. */
 	cb.ubl_ubbest = ub;
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	rio = zio_root(spa, NULL, &cb, flags);
 	vdev_uberblock_load_impl(rio, rvd, flags, &cb);
 	(void) zio_wait(rio);
 
 	best_vd = cb.ubl_vd;
 	if (best_vd == NULL)
 		goto out;
 
 	vdev_dbgmsg(best_vd, "best uberblock found for spa %s. "
 	    "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg);
 
 	if (ub->ub_raidz_reflow_info != cb.ubl_latest.ub_raidz_reflow_info) {
 		vdev_dbgmsg(best_vd,
 		    "spa=%s best uberblock (txg=%llu info=0x%llx) "
 		    "has different raidz_reflow_info than latest "
 		    "uberblock (txg=%llu info=0x%llx)",
 		    spa->spa_name,
 		    (u_longlong_t)ub->ub_txg,
 		    (u_longlong_t)ub->ub_raidz_reflow_info,
 		    (u_longlong_t)cb.ubl_latest.ub_txg,
 		    (u_longlong_t)cb.ubl_latest.ub_raidz_reflow_info);
 		memset(ub, 0, sizeof (uberblock_t));
 		goto out;
 	}
 
 	/*
 	 * The label on which the best uberblock was found may hold a
 	 * configuration written in a future txg, so all labels of this
 	 * vdev are searched for the configuration matching the
 	 * uberblock's txg.
 	 */
 	*config = vdev_label_read_config(best_vd, ub->ub_txg);
 	if (*config == NULL && spa->spa_extreme_rewind) {
 		vdev_dbgmsg(best_vd, "failed to read label config. "
 		    "Trying again without txg restrictions.");
 		*config = vdev_label_read_config(best_vd, UINT64_MAX);
 	}
 	if (*config == NULL)
 		vdev_dbgmsg(best_vd, "failed to read label config");
 
 out:
 	spa_config_exit(spa, SCL_ALL, FTAG);
 }
 
 /*
  * For use when a leaf vdev is expanded.
  * The location of labels 2 and 3 changed, and at the new location the
  * uberblock rings are either empty or contain garbage.  The sync will write
  * new configs there because the vdev is dirty, but expansion also needs the
  * uberblock rings copied.  Read them from label 0 which did not move.
  *
  * Since the point is to populate labels {2,3} with valid uberblocks,
  * we zero uberblocks we fail to read or which are not valid.
  */
 
 static void
 vdev_copy_uberblocks(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	const int src_label = 0;
 	int locks = (SCL_L2ARC | SCL_ZIO);
 	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_SPECULATIVE;
 	abd_t *ub_abd;
 	zio_t *wio;
 
 	ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	/*
 	 * Distributed spares hold no uberblocks, so there is nothing to
 	 * copy when such a leaf is expanded.
 	 */
 	if (vd->vdev_ops == &vdev_draid_spare_ops)
 		return;
 
 	spa_config_enter(spa, locks, FTAG, RW_READER);
 
 	ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
 
 	/* All writes hang off a single root that is waited on at the end. */
 	wio = zio_root(spa, NULL, NULL, flags);
 
 	for (int slot = 0; slot < VDEV_UBERBLOCK_COUNT(vd); slot++) {
 		zio_t *rio = zio_root(spa, NULL, NULL, flags);
 
 		/* Each slot is read synchronously from the source label. */
 		vdev_label_read(rio, vd, src_label, ub_abd,
 		    VDEV_UBERBLOCK_OFFSET(vd, slot), VDEV_UBERBLOCK_SIZE(vd),
 		    NULL, NULL, flags);
 
 		/* An unreadable or invalid uberblock is copied as zeroes. */
 		if (zio_wait(rio) != 0 ||
 		    uberblock_verify(abd_to_buf(ub_abd)) != 0)
 			abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
 
 		for (int dst_label = 2; dst_label < VDEV_LABELS; dst_label++) {
 			vdev_label_write(wio, vd, dst_label, ub_abd,
 			    VDEV_UBERBLOCK_OFFSET(vd, slot),
 			    VDEV_UBERBLOCK_SIZE(vd), NULL, NULL,
 			    flags | ZIO_FLAG_DONT_PROPAGATE);
 		}
 	}
 
 	(void) zio_wait(wio);
 
 	spa_config_exit(spa, locks, FTAG);
 
 	abd_free(ub_abd);
 }
 
 /*
  * Completion callback for an uberblock write: a successful write bumps the
  * good-write counter handed in through io_private.  Only writes to
  * known-visible vdevs earn credit (see spa_vdev_add()), which is checked
  * here by the top-level vdev having a non-zero vdev_ms_array.
  */
 static void
 vdev_uberblock_sync_done(zio_t *zio)
 {
 	uint64_t *counter = zio->io_private;
 
 	if (zio->io_error != 0)
 		return;
 
 	if (zio->io_vd->vdev_top->vdev_ms_array == 0)
 		return;
 
 	atomic_inc_64(counter);
 }
 
 /*
  * Queue, as children of zio, a write of ub to one uberblock slot in every
  * label of every writeable leaf beneath vd.  Each write completes through
  * vdev_uberblock_sync_done(), which is handed good_writes.
  */
 static void
 vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes,
     uberblock_t *ub, vdev_t *vd, int flags)
 {
 	for (uint64_t i = 0; i < vd->vdev_children; i++) {
 		vdev_uberblock_sync(zio, good_writes, ub,
 		    vd->vdev_child[i], flags);
 	}
 
 	/* Uberblocks are only written to writeable leaves. */
 	if (!vd->vdev_ops->vdev_op_leaf || !vdev_writeable(vd))
 		return;
 
 	/*
 	 * A distributed spare needs no uberblocks of its own: they are
 	 * already stored on all the leaves of the parent dRAID.  For the
 	 * same reason vdev_uberblock_load_impl() does not read uberblocks
 	 * from distributed spares.
 	 */
 	if (vd->vdev_ops == &vdev_draid_spare_ops)
 		return;
 
 	/* After an expansion the uberblock rings must be copied first. */
 	if (vd->vdev_state == VDEV_STATE_HEALTHY &&
 	    vd->vdev_copy_uberblocks == B_TRUE) {
 		vdev_copy_uberblocks(vd);
 		vd->vdev_copy_uberblocks = B_FALSE;
 	}
 
 	/*
 	 * The slot is derived from the txg.  An uberblock carrying a
 	 * special RAIDZ expansion state is essentially an update of the
 	 * current uberblock and has the same txg, but the current state
 	 * is already committed, so the update must land in a different
 	 * slot.  Were the same slot overwritten, a power loss during the
 	 * write on a disk that does not overwrite a single sector
 	 * atomically (although it is required to, i.e. either the old or
 	 * the new uberblock should be visible) could lose this txg's
 	 * uberblock.  Rewinding to the previous txg's uberblock may then
 	 * be impossible, because RAIDZ expansion may already have
 	 * overwritten some of the data, so the progress indicator in the
 	 * uberblock is needed.
 	 */
 	int mmp_slots = spa_multihost(vd->vdev_spa) ?
 	    MMP_BLOCKS_PER_LABEL : 0;
 	int slot = (ub->ub_txg -
 	    (RRSS_GET_STATE(ub) == RRSS_SCRATCH_VALID)) %
 	    (VDEV_UBERBLOCK_COUNT(vd) - mmp_slots);
 
 	/* Stage the uberblock_t in an ABD, zero-filling the rest. */
 	abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
 	abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
 	abd_zero_off(ub_abd, sizeof (uberblock_t),
 	    VDEV_UBERBLOCK_SIZE(vd) - sizeof (uberblock_t));
 
 	for (int label = 0; label < VDEV_LABELS; label++) {
 		vdev_label_write(zio, vd, label, ub_abd,
 		    VDEV_UBERBLOCK_OFFSET(vd, slot), VDEV_UBERBLOCK_SIZE(vd),
 		    vdev_uberblock_sync_done, good_writes,
 		    flags | ZIO_FLAG_DONT_PROPAGATE);
 	}
 
 	abd_free(ub_abd);
 }
 
 /*
  * Sync the uberblock ub to all vdevs in svd[] and, when
  * spa_aux_sync_uber is set, to the pool's spares and l2cache devices as
  * well, then flush the devices that were written.
  *
  * Returns 0 if at least one uberblock write was counted as good, EIO
  * otherwise.
  */
 int
 vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
 {
 	spa_t *spa = svd[0]->vdev_spa;
 	spa_aux_vdev_t *aux[2] = { &spa->spa_spares, &spa->spa_l2cache };
 	uint64_t good_writes = 0;
 	zio_t *rio;
 
 	/* Phase one: write the uberblocks. */
 	rio = zio_root(spa, NULL, NULL, flags);
 
 	for (int i = 0; i < svdcount; i++)
 		vdev_uberblock_sync(rio, &good_writes, ub, svd[i], flags);
 
 	if (spa->spa_aux_sync_uber) {
 		for (int a = 0; a < 2; a++) {
 			for (int i = 0; i < aux[a]->sav_count; i++) {
 				vdev_uberblock_sync(rio, &good_writes, ub,
 				    aux[a]->sav_vdevs[i], flags);
 			}
 		}
 	}
 
 	(void) zio_wait(rio);
 
 	/*
 	 * Phase two: flush the uberblocks to disk.  Once the new
 	 * uberblocks and the even labels are safely on disk the odd
 	 * labels are no longer needed, so it becomes safe to overwrite
 	 * them.
 	 */
 	rio = zio_root(spa, NULL, NULL, flags);
 
 	for (int i = 0; i < svdcount; i++) {
 		if (vdev_writeable(svd[i]))
 			zio_flush(rio, svd[i]);
 	}
 
 	if (spa->spa_aux_sync_uber) {
 		/* The request to sync the aux devices is now consumed. */
 		spa->spa_aux_sync_uber = B_FALSE;
 		for (int a = 0; a < 2; a++) {
 			for (int i = 0; i < aux[a]->sav_count; i++) {
 				vdev_t *avd = aux[a]->sav_vdevs[i];
 
 				if (vdev_writeable(avd))
 					zio_flush(rio, avd);
 			}
 		}
 	}
 
 	(void) zio_wait(rio);
 
 	if (good_writes == 0)
 		return (EIO);
 
 	return (0);
 }
 
 /*
  * Completion callback for a label write: a write that finished without
  * error bumps the good-write counter of our top-level vdev, which is
  * handed in through io_private.
  */
 static void
 vdev_label_sync_done(zio_t *zio)
 {
 	uint64_t *counter = zio->io_private;
 
 	if (zio->io_error != 0)
 		return;
 
 	atomic_inc_64(counter);
 }
 
 /*
  * Completion callback for the per-vdev parent zio of a label sync.  When
  * not a single label write succeeded the zio is failed with EIO so that
  * the parent sees the failure.  The counter allocation is released here.
  */
 static void
 vdev_label_sync_top_done(zio_t *zio)
 {
 	uint64_t *counter = zio->io_private;
 	boolean_t none_good = (*counter == 0);
 
 	if (none_good)
 		zio->io_error = SET_ERROR(EIO);
 
 	kmem_free(counter, sizeof (uint64_t));
 }
 
 /*
  * Completion callback used for log and cache devices, whose label write
  * errors are ignored: only the counter allocation is released.
  */
 static void
 vdev_label_sync_ignore_done(zio_t *zio)
 {
 	uint64_t *counter = zio->io_private;
 
 	kmem_free(counter, sizeof (uint64_t));
 }
 
 /*
  * Queue, as children of zio, a write of a freshly generated label to every
  * second label, starting at label l (so all even or all odd labels), of
  * every writeable leaf beneath vd.  Each write completes through
  * vdev_label_sync_done(), which is handed good_writes.  Nothing is written
  * for a leaf whose label nvlist cannot be packed.
  */
 static void
 vdev_label_sync(zio_t *zio, uint64_t *good_writes,
     vdev_t *vd, int l, uint64_t txg, int flags)
 {
 	vdev_t *parent = vd->vdev_parent;
 	boolean_t spare_in_use;
 	boolean_t aux_label;
 	nvlist_t *nvl;
 	abd_t *phys_abd;
 	vdev_phys_t *phys;
 	char *packed;
 	size_t packed_len;
 
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_label_sync(zio, good_writes, vd->vdev_child[i],
 		    l, txg, flags);
 	}
 
 	if (!vd->vdev_ops->vdev_op_leaf || !vdev_writeable(vd))
 		return;
 
 	/*
 	 * A distributed spare never stores the top-level config.  On
 	 * read, vdev_dspare_label_read_config() generates the config for
 	 * vdev_label_read_config().
 	 */
 	if (vd->vdev_ops == &vdev_draid_spare_ops)
 		return;
 
 	/* A leaf whose parent is a spare vdev is a spare in active use. */
 	spare_in_use = (parent != NULL &&
 	    parent->vdev_ops == &vdev_spare_ops);
 	aux_label = (vd->vdev_isspare && !spare_in_use) ||
 	    vd->vdev_isl2cache;
 
 	/*
 	 * Inactive spares and cache devices get the aux label; everything
 	 * else gets a label describing the top-level config it belongs to.
 	 */
 	if (aux_label)
 		nvl = vdev_aux_label_generate(vd, vd->vdev_isspare);
 	else
 		nvl = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);
 
 	phys_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
 	abd_zero(phys_abd, sizeof (vdev_phys_t));
 	phys = abd_to_buf(phys_abd);
 
 	packed = phys->vp_nvlist;
 	packed_len = sizeof (phys->vp_nvlist);
 
 	if (nvlist_pack(nvl, &packed, &packed_len, NV_ENCODE_XDR,
 	    KM_SLEEP) == 0) {
 		while (l < VDEV_LABELS) {
 			vdev_label_write(zio, vd, l, phys_abd,
 			    offsetof(vdev_label_t, vl_vdev_phys),
 			    sizeof (vdev_phys_t),
 			    vdev_label_sync_done, good_writes,
 			    flags | ZIO_FLAG_DONT_PROPAGATE);
 			l += 2;
 		}
 	}
 
 	abd_free(phys_abd);
 	nvlist_free(nvl);
 }
 
 /*
  * Write the even (l == 0) or odd (l == 1) labels of every vdev on the
  * pool's config dirty list, and of the aux vdevs flagged with
  * sav_label_sync, then flush those devices.
  *
  * Returns the result of waiting on the label writes.  Failures on log and
  * aux vdevs are ignored; any other vdev without a single good label write
  * fails the sync.
  */
 static int
 vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
 {
 	spa_aux_vdev_t *aux[2] = {&spa->spa_spares, &spa->spa_l2cache};
 	list_t *dirty = &spa->spa_config_dirty_list;
 	vdev_t *vd;
 	zio_t *rio;
 	int error;
 
 	/*
 	 * Phase one: write the new labels to disk.
 	 */
 	rio = zio_root(spa, NULL, NULL, flags);
 
 	for (vd = list_head(dirty); vd != NULL; vd = list_next(dirty, vd)) {
 		boolean_t ignore_errors =
 		    (vd->vdev_islog || vd->vdev_aux != NULL);
 		uint64_t *good_writes;
 		zio_t *vio;
 
 		ASSERT(!vd->vdev_ishole);
 
 		/* The counter is freed by the done callback chosen below. */
 		good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
 		vio = zio_null(rio, spa, NULL,
 		    ignore_errors ?
 		    vdev_label_sync_ignore_done : vdev_label_sync_top_done,
 		    good_writes, flags);
 		vdev_label_sync(vio, good_writes, vd, l, txg, flags);
 		zio_nowait(vio);
 	}
 
 	/*
 	 * The path of an aux vdev may have changed during import, in
 	 * which case its labels are rewritten too.
 	 */
 	for (int a = 0; a < 2; a++) {
 		if (!aux[a]->sav_label_sync)
 			continue;
 		for (int i = 0; i < aux[a]->sav_count; i++) {
 			uint64_t *good_writes;
 			zio_t *vio;
 
 			good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
 			vio = zio_null(rio, spa, NULL,
 			    vdev_label_sync_ignore_done, good_writes, flags);
 			vdev_label_sync(vio, good_writes,
 			    aux[a]->sav_vdevs[i], l, txg, flags);
 			zio_nowait(vio);
 		}
 	}
 
 	error = zio_wait(rio);
 
 	/*
 	 * Phase two: flush the new labels to disk.
 	 */
 	rio = zio_root(spa, NULL, NULL, flags);
 
 	for (vd = list_head(dirty); vd != NULL; vd = list_next(dirty, vd))
 		zio_flush(rio, vd);
 
 	for (int a = 0; a < 2; a++) {
 		if (!aux[a]->sav_label_sync)
 			continue;
 		for (int i = 0; i < aux[a]->sav_count; i++)
 			zio_flush(rio, aux[a]->sav_vdevs[i]);
 		/* The request is consumed once the odd labels are done. */
 		if (l == 1)
 			aux[a]->sav_label_sync = B_FALSE;
 	}
 
 	(void) zio_wait(rio);
 
 	return (error);
 }
 
 /*
  * Sync the uberblock and any changes to the vdev configuration.
  *
  * The order of operations is carefully crafted to ensure that
  * if the system panics or loses power at any time, the state on disk
  * is still transactionally consistent.  The in-line comments below
  * describe the failure semantics at each stage.
  *
  * Moreover, vdev_config_sync() is designed to be idempotent: if it fails
  * at any time, you can just call it again, and it will resume its work.
  *
  * svd[] holds the svdcount vdevs whose uberblocks are rewritten, and txg is
  * the transaction group being synced.
  *
  * Returns 0 on success or when there is nothing to do.  When one of the
  * steps fails, the whole sequence is repeated once with ZIO_FLAG_TRYHARD;
  * if a step fails again, its error is returned.
  */
 int
 vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
 {
 	spa_t *spa = svd[0]->vdev_spa;
 	uberblock_t *ub = &spa->spa_uberblock;
 	int error = 0;
 	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
 
 	ASSERT(svdcount != 0);
 retry:
 	/*
 	 * Normally, we don't want to try too hard to write every label and
 	 * uberblock.  If there is a flaky disk, we don't want the rest of the
 	 * sync process to block while we retry.  But if we can't write a
 	 * single label out, we should retry with ZIO_FLAG_TRYHARD before
 	 * bailing out and declaring the pool faulted.
 	 */
 	if (error != 0) {
 		/* A failure with TRYHARD already set is final. */
 		if ((flags & ZIO_FLAG_TRYHARD) != 0)
 			return (error);
 		flags |= ZIO_FLAG_TRYHARD;
 	}
 
 	ASSERT(ub->ub_txg <= txg);
 
 	/*
 	 * If this isn't a resync due to I/O errors,
 	 * and nothing changed in this transaction group,
 	 * and multihost protection isn't enabled,
 	 * and the vdev configuration hasn't changed,
 	 * then there's nothing to do.
 	 */
 	if (ub->ub_txg < txg) {
 		boolean_t changed = uberblock_update(ub, spa->spa_root_vdev,
 		    txg, spa->spa_mmp.mmp_delay);
 
 		if (!changed && list_is_empty(&spa->spa_config_dirty_list) &&
 		    !spa_multihost(spa))
 			return (0);
 	}
 
 	/* Nothing is written for a txg beyond the pool's freeze txg. */
 	if (txg > spa_freeze_txg(spa))
 		return (0);
 
 	ASSERT(txg <= spa->spa_final_txg);
 
 	/*
 	 * Flush the write cache of every disk that's been written to
 	 * in this transaction group.  This ensures that all blocks
 	 * written in this txg will be committed to stable storage
 	 * before any uberblock that references them.
 	 */
 	zio_t *zio = zio_root(spa, NULL, NULL, flags);
 
 	for (vdev_t *vd =
 	    txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL;
 	    vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)))
 		zio_flush(zio, vd);
 
 	/* The result of this flush is deliberately not checked. */
 	(void) zio_wait(zio);
 
 	/*
 	 * Sync out the even labels (L0, L2) for every dirty vdev.  If the
 	 * system dies in the middle of this process, that's OK: all of the
 	 * even labels that made it to disk will be newer than any uberblock,
 	 * and will therefore be considered invalid.  The odd labels (L1, L3),
 	 * which have not yet been touched, will still be valid.  We flush
 	 * the new labels to disk to ensure that all even-label updates
 	 * are committed to stable storage before the uberblock update.
 	 */
 	if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) {
 		/* Only the failure of the TRYHARD pass is logged. */
 		if ((flags & ZIO_FLAG_TRYHARD) != 0) {
 			zfs_dbgmsg("vdev_label_sync_list() returned error %d "
 			    "for pool '%s' when syncing out the even labels "
 			    "of dirty vdevs", error, spa_name(spa));
 		}
 		goto retry;
 	}
 
 	/*
 	 * Sync the uberblocks to all vdevs in svd[].
 	 * If the system dies in the middle of this step, there are two cases
 	 * to consider, and the on-disk state is consistent either way:
 	 *
 	 * (1)	If none of the new uberblocks made it to disk, then the
 	 *	previous uberblock will be the newest, and the odd labels
 	 *	(which had not yet been touched) will be valid with respect
 	 *	to that uberblock.
 	 *
 	 * (2)	If one or more new uberblocks made it to disk, then they
 	 *	will be the newest, and the even labels (which had all
 	 *	been successfully committed) will be valid with respect
 	 *	to the new uberblocks.
 	 */
 	if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) {
 		if ((flags & ZIO_FLAG_TRYHARD) != 0) {
 			zfs_dbgmsg("vdev_uberblock_sync_list() returned error "
 			    "%d for pool '%s'", error, spa_name(spa));
 		}
 		goto retry;
 	}
 
 	/* With multihost enabled, MMP is handed the uberblock just synced. */
 	if (spa_multihost(spa))
 		mmp_update_uberblock(spa, ub);
 
 	/*
 	 * Sync out odd labels for every dirty vdev.  If the system dies
 	 * in the middle of this process, the even labels and the new
 	 * uberblocks will suffice to open the pool.  The next time
 	 * the pool is opened, the first thing we'll do -- before any
 	 * user data is modified -- is mark every vdev dirty so that
 	 * all labels will be brought up to date.  We flush the new labels
 	 * to disk to ensure that all odd-label updates are committed to
 	 * stable storage before the next transaction group begins.
 	 */
 	if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) {
 		if ((flags & ZIO_FLAG_TRYHARD) != 0) {
 			zfs_dbgmsg("vdev_label_sync_list() returned error %d "
 			    "for pool '%s' when syncing out the odd labels of "
 			    "dirty vdevs", error, spa_name(spa));
 		}
 		goto retry;
 	}
 
 	return (0);
 }
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 6bac2241c6d8..59225e766ba1 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -1,5123 +1,5123 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/zap.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/dmu_tx.h>
 #include <sys/abd.h>
 #include <sys/zfs_rlock.h>
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/vdev_raidz.h>
 #include <sys/vdev_raidz_impl.h>
 #include <sys/vdev_draid.h>
 #include <sys/uberblock_impl.h>
 #include <sys/dsl_scan.h>
 
 #ifdef ZFS_DEBUG
 #include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
 #endif
 
 /*
  * Virtual device vector for RAID-Z.
  *
  * This vdev supports single, double, and triple parity. For single parity,
  * we use a simple XOR of all the data columns. For double or triple parity,
  * we use a special case of Reed-Solomon coding. This extends the
  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
  * former is also based. The latter is designed to provide higher performance
  * for writes.
  *
  * Note that the Plank paper claimed to support arbitrary N+M, but was then
  * amended six years later identifying a critical flaw that invalidates its
  * claims. Nevertheless, the technique can be adapted to work for up to
  * triple parity. For additional parity, the amendment "Note: Correction to
  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
  * is viable, but the additional complexity means that write performance will
  * suffer.
  *
  * All of the methods above operate on a Galois field, defined over the
  * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
  * can be expressed with a single byte. Briefly, the operations on the
  * field are defined as follows:
  *
  *   o addition (+) is represented by a bitwise XOR
  *   o subtraction (-) is therefore identical to addition: A + B = A - B
  *   o multiplication of A by 2 is defined by the following bitwise expression:
  *
  *	(A * 2)_7 = A_6
  *	(A * 2)_6 = A_5
  *	(A * 2)_5 = A_4
  *	(A * 2)_4 = A_3 + A_7
  *	(A * 2)_3 = A_2 + A_7
  *	(A * 2)_2 = A_1 + A_7
  *	(A * 2)_1 = A_0
  *	(A * 2)_0 = A_7
  *
  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
  * As an aside, this multiplication is derived from the error correcting
  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
  *
  * Observe that any number in the field (except for 0) can be expressed as a
  * power of 2 -- a generator for the field. We store a table of the powers of
  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
  * than field addition). The inverse of a field element A (A^-1) is therefore
  * A ^ (255 - 1) = A^254.
  *
  * The up-to-three parity columns, P, Q, R over several data columns,
  * D_0, ... D_n-1, can be expressed by field operations:
  *
  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
  *
  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
  * independent coefficients. (There are no additional coefficients that have
  * this property which is why the uncorrected Plank method breaks down.)
  *
  * See the reconstruction code below for how P, Q and R can be used individually
  * or in concert to recover missing data columns.
  */
 
 /* Symbolic names for the P, Q and R parity columns. */
 #define	VDEV_RAIDZ_P		0
 #define	VDEV_RAIDZ_Q		1
 #define	VDEV_RAIDZ_R		2
 
 /*
  * Field multiplication of a single byte value by 2 and by 4.  A set top
  * bit (0x80) selects the XOR with 0x1d.  The result is not masked to eight
  * bits, so bit 8 may be set when the input has bit 7 set; the argument is
  * evaluated more than once and must not have side effects.
  */
 #define	VDEV_RAIDZ_MUL_2(x)	((((x) & 0x80) ? 0x1d : 0) ^ ((x) << 1))
 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
 
 /*
  * We provide a mechanism to perform the field multiplication operation on a
  * 64-bit value all at once rather than a byte at a time. This works by
  * creating a mask from the top bit in each byte and using that to
  * conditionally apply the XOR of 0x1d.
  *
  * VDEV_RAIDZ_64MUL_2(x, mask) multiplies each of the eight bytes packed in
  * (x) by 2, and VDEV_RAIDZ_64MUL_4(x, mask) by 4 (two multiplications by 2).
  * Both update (x) in place and overwrite (mask), which is only scratch
  * space; both arguments must be modifiable 64-bit unsigned lvalues and are
  * evaluated more than once, so they must not have side effects.
  *
  * The bodies are wrapped in do { } while (0) so that an invocation followed
  * by a semicolon forms exactly one statement and can safely be used as the
  * unbraced body of an if/else.
  */
 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
 do { \
 	(mask) = (x) & 0x8080808080808080ULL; \
 	(mask) = ((mask) << 1) - ((mask) >> 7); \
 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
 	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
 } while (0)
 
 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
 do { \
 	VDEV_RAIDZ_64MUL_2((x), mask); \
 	VDEV_RAIDZ_64MUL_2((x), mask); \
 } while (0)
 
 
 /*
  * Big Theory Statement for how a RAIDZ VDEV is expanded
  *
  * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
  * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs
  * that have been previously expanded can be expanded again.
  *
  * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
  * the VDEV) when an expansion starts.  And the expansion will pause if any
  * disk in the VDEV fails, and resume once the VDEV is healthy again. All other
  * operations on the pool can continue while an expansion is in progress (e.g.
  * read/write, snapshot, zpool add, etc), except for zpool checkpoint, zpool
  * trim, and zpool initialize, which can't be run during an expansion.
  * Following a reboot or export/import, the expansion resumes where it left off.
  *
  * == Reflowing the Data ==
  *
  * The expansion involves reflowing (copying) the data from the current set
  * of disks to spread it across the new set which now has one more disk. This
  * reflow operation is similar to reflowing text when the column width of a
  * text editor window is expanded. The text doesn’t change but the location of
  * the text changes to accommodate the new width. An example reflow result for
  * a 4-wide RAIDZ1 to a 5-wide is shown below.
  *
  *                            Reflow End State
  *            Each letter indicates a parity group (logical stripe)
  *
  *         Before expansion                         After Expansion
  *     D1     D2     D3     D4               D1     D2     D3     D4     D5
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  A   |  A   |  A   |  A   |         |  A   |  A   |  A   |  A   |  B   |
  *  |     1|     2|     3|     4|         |     1|     2|     3|     4|     5|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  B   |  B   |  C   |  C   |         |  B   |  C   |  C   |  C   |  C   |
  *  |     5|     6|     7|     8|         |     6|     7|     8|     9|    10|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  C   |  C   |  D   |  D   |         |  D   |  D   |  E   |  E   |  E   |
  *  |     9|    10|    11|    12|         |    11|    12|    13|    14|    15|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  E   |  E   |  E   |  E   |   -->   |  E   |  F   |  F   |  G   |  G   |
  *  |    13|    14|    15|    16|         |    16|    17|    18|p   19|    20|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  F   |  F   |  G   |  G   |         |  G   |  G   |  H   |  H   |  H   |
  *  |    17|    18|    19|    20|         |    21|    22|    23|    24|    25|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  G   |  G   |  H   |  H   |         |  H   |  I   |  I   |  J   |  J   |
  *  |    21|    22|    23|    24|         |    26|    27|    28|    29|    30|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  H   |  H   |  I   |  I   |         |  J   |  J   |      |      |  K   |
  *  |    25|    26|    27|    28|         |    31|    32|    33|    34|    35|
  *  +------+------+------+------+         +------+------+------+------+------+
  *
  * This reflow approach has several advantages. There is no need to read or
  * modify the block pointers or recompute any block checksums.  The reflow
  * doesn’t need to know where the parity sectors reside. We can read and write
  * data sequentially and the copy can occur in a background thread in open
  * context. The design also allows for fast discovery of what data to copy.
  *
  * The VDEV metaslabs are processed, one at a time, to copy the block data to
  * have it flow across all the disks. The metaslab is disabled for allocations
  * during the copy. As an optimization, we only copy the allocated data which
  * can be determined by looking at the metaslab range tree. During the copy we
  * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
  * need to be able to survive losing parity count disks).  This means we
  * cannot overwrite data during the reflow that would be needed if a disk is
  * lost.
  *
  * After the reflow completes, all newly-written blocks will have the new
  * layout, i.e., they will have the parity to data ratio implied by the new
  * number of disks in the RAIDZ group.  Even though the reflow copies all of
  * the allocated space (data and parity), it is only rearranged, not changed.
  *
  * This act of reflowing the data has a few implications about blocks
  * that were written before the reflow completes:
  *
  *  - Old blocks will still use the same amount of space (i.e., they will have
  *    the parity to data ratio implied by the old number of disks in the RAIDZ
  *    group).
  *  - Reading old blocks will be slightly slower than before the reflow, for
  *    two reasons. First, we will have to read from all disks in the RAIDZ
  *    VDEV, rather than being able to skip the children that contain only
  *    parity of this block (because the data of a single block is now spread
  *    out across all the disks).  Second, in most cases there will be an extra
  *    bcopy, needed to rearrange the data back to its original layout in memory.
  *
  * == Scratch Area ==
  *
  * As we copy the block data, we can only progress to the point that writes
  * will not overlap with blocks whose progress has not yet been recorded on
  * disk.  Since partially-copied rows are always read from the old location,
  * we need to stop one row before the sector-wise overlap, to prevent any
  * row-wise overlap. For example, in the diagram above, when we reflow sector
  * B6 it will overwrite the original location for B5.
  *
  * To get around this, a scratch space is used so that we can start copying
  * without risking data loss by overlapping the row. As an added benefit, it
  * improves performance at the beginning of the reflow, but that small perf
  * boost wouldn't be worth the complexity on its own.
  *
  * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
  * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
  * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
  * the widths will likely be single digits so we can get a substantial chunk
  * size using only a few MB of scratch per disk.
  *
  * The scratch area is persisted to disk which holds a large amount of reflowed
  * state. We can always read the partially written stripes when a disk fails or
  * the copy is interrupted (crash) during the initial copying phase and also
  * get past a small chunk size restriction.  At a minimum, the scratch space
  * must be large enough to get us to the point that one row does not overlap
  * itself when moved (i.e new_width^2).  But going larger is even better. We
  * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
  * as our scratch space to handle overwriting the initial part of the VDEV.
  *
  *	0     256K   512K                    4M
  *	+------+------+-----------------------+-----------------------------
  *	| VDEV | VDEV |   Boot Block (3.5M)   |  Allocatable space ...
  *	|  L0  |  L1  |       Reserved        |     (Metaslabs)
  *	+------+------+-----------------------+-------------------------------
  *                        Scratch Area
  *
  * == Reflow Progress Updates ==
  * After the initial scratch-based reflow, the expansion process works
  * similarly to device removal. We create a new open context thread which
  * reflows the data, and periodically kicks off sync tasks to update logical
  * state. In this case, state is the committed progress (offset of next data
  * to copy). We need to persist the completed offset on disk, so that if we
  * crash we know which format each VDEV offset is in.
  *
  * == Time Dependent Geometry ==
  *
  * In non-expanded RAIDZ, blocks are read from disk in a column by column
  * fashion. For a multi-row block, the second sector is in the first column
  * not in the second column. This allows us to issue full reads for each
  * column directly into the request buffer. The block data is thus laid out
  * sequentially in a column-by-column fashion.
  *
  * For example, in the before expansion diagram above, one logical block might
  * be sectors G19-H26. The parity is in G19,H23; and the data is in
  * G20,H24,G21,H25,G22,H26.
  *
  * After a block is reflowed, the sectors that were all in the original column
  * data can now reside in different columns. When reading from an expanded
  * VDEV, we need to know the logical stripe width for each block so we can
  * reconstitute the block’s data after the reads are completed. Likewise,
  * when we perform the combinatorial reconstruction we need to know the
  * original width so we can retry combinations from the past layouts.
  *
  * Time dependent geometry is what we call having blocks with different layouts
  * (stripe widths) in the same VDEV. This time-dependent geometry uses the
  * block’s birth time (+ the time expansion ended) to establish the correct
  * width for a given block. After an expansion completes, we record the time
  * for blocks written with a particular width (geometry).
  *
  * == On Disk Format Changes ==
  *
  * New pool feature flag, 'raidz_expansion' whose reference count is the number
  * of RAIDZ VDEVs that have been expanded.
  *
  * The blocks on expanded RAIDZ VDEV can have different logical stripe widths.
  *
  * The uberblock can point to arbitrary blocks, which might be on the
  * expanding RAIDZ, and might or might not have been expanded. We need to know
  * which way a block is laid out before reading it. This info is the next
  * offset that needs to be reflowed and we persist that in the uberblock, in
  * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
  * After the expansion is complete, we then use the raidz_expand_txgs array
  * (see below) to determine how to read a block and the ub_raidz_reflow_info
  * field is no longer required.
  *
  * The uberblock's ub_raidz_reflow_info field also holds the scratch space
  * state (i.e., active or not) which is also required before reading a block
  * during the initial phase of reflowing the data.
  *
  * The top-level RAIDZ VDEV has two new entries in the nvlist:
  *
  * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
  *                            and used after the expansion is complete to
  *                            determine how to read a raidz block
  * 'raidz_expanding' boolean: present during reflow and removed after completion
  *                            used during a spa import to resume an unfinished
  *                            expansion
  *
  * And finally the VDEVs top zap adds the following informational entries:
  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
  */
 
 /*
  * For testing only: pause the raidz expansion after reflowing this amount.
  * (accessed by ZTS and ztest)
  *
  * The definition is static only when building for the kernel (_KERNEL);
  * in other builds the symbol keeps external linkage.  Units are bytes, per
  * the variable name.  NOTE(review): the default of 0 presumably means "no
  * pause" -- confirm against the code that consumes this tunable.
  */
 #ifdef	_KERNEL
 static
 #endif	/* _KERNEL */
 unsigned long raidz_expand_max_reflow_bytes = 0;
 
 /*
  * For testing only: pause the raidz expansion at a certain point.
  * NOTE(review): the meaning of each non-zero value is defined by the
  * consumer of this variable, which is not visible here.
  */
 uint_t raidz_expand_pause_point = 0;
 
 /*
  * Maximum amount of copy io's outstanding at once, in bytes.  Builds that
  * define _ILP32 default to a single SPA_MAXBLOCKSIZE; all other builds
  * default to ten times that.
  */
 #ifdef _ILP32
 static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE;
 #else
 static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
 #endif
 
 /*
  * Apply raidz map abds aggregation if the number of rows in the map is equal
  * or greater than the value below.  The default threshold is 4 rows.
  */
 static unsigned long raidz_io_aggregate_rows = 4;
 
 /*
  * Automatically start a pool scrub when a RAIDZ expansion completes in
  * order to verify the checksums of all blocks which have been copied
  * during the expansion.  Automatic scrubbing is enabled by default and
  * is strongly recommended.
  */
 static int zfs_scrub_after_expand = 1;
 
 /*
  * Free a raidz row together with the buffers hanging off of it.
  *
  * For each of the first rr_cols columns, rc_abd is freed when the column
  * has a non-zero rc_size, and rc_orig_data is freed when it is set.  The
  * row's rr_abd_empty buffer is freed if present.  Finally the row itself
  * is freed; its allocation size is derived from rr_scols (not rr_cols),
  * matching how vdev_raidz_row_alloc() sizes it.
  */
 static void
 vdev_raidz_row_free(raidz_row_t *rr)
 {
 	for (int idx = 0; idx < rr->rr_cols; idx++) {
 		raidz_col_t *col = rr->rr_col + idx;
 
 		/* rc_abd is only released for columns with a non-zero size. */
 		if (col->rc_size != 0)
 			abd_free(col->rc_abd);
 
 		if (col->rc_orig_data != NULL)
 			abd_free(col->rc_orig_data);
 	}
 
 	if (rr->rr_abd_empty != NULL)
 		abd_free(rr->rr_abd_empty);
 
 	size_t rowsize = offsetof(raidz_row_t, rr_col[rr->rr_scols]);
 	kmem_free(rr, rowsize);
 }
 
 /*
  * Free a raidz map: every row in rm_row[], the rm_phys_col array and the
  * ABDs attached to it (when rm_nphys_cols is non-zero), and then the map
  * structure itself.  rm_lr is asserted to be NULL before the map is freed.
  */
 void
 vdev_raidz_map_free(raidz_map_t *rm)
 {
 	int row = 0;
 
 	while (row < rm->rm_nrows) {
 		vdev_raidz_row_free(rm->rm_row[row]);
 		row++;
 	}
 
 	if (rm->rm_nphys_cols != 0) {
 		for (int pc = 0; pc < rm->rm_nphys_cols; pc++) {
 			raidz_col_t *pcol = &rm->rm_phys_col[pc];
 
 			if (pcol->rc_abd != NULL)
 				abd_free(pcol->rc_abd);
 		}
 
 		kmem_free(rm->rm_phys_col,
 		    sizeof (raidz_col_t) * rm->rm_nphys_cols);
 	}
 
 	ASSERT3P(rm->rm_lr, ==, NULL);
 	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
 }
 
 /*
  * vsd_free callback (see vdev_raidz_vsd_ops): release the raidz map that
  * is stored in zio->io_vsd.
  */
 static void
 vdev_raidz_map_free_vsd(zio_t *zio)
 {
 	vdev_raidz_map_free(zio->io_vsd);
 }
 
 /*
  * Comparison callback that orders two reflow_node_t entries by their
  * re_txg field, returning the result of TREE_CMP() on the two values.
  */
 static int
 vdev_raidz_reflow_compare(const void *x1, const void *x2)
 {
 	return (TREE_CMP(((const reflow_node_t *)x1)->re_txg,
 	    ((const reflow_node_t *)x2)->re_txg));
 }
 
 /*
  * zio_vsd_ops_t for RAID-Z: vsd_free is wired to
  * vdev_raidz_map_free_vsd(), which frees the raidz map kept in zio->io_vsd.
  */
 const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 	.vsd_free = vdev_raidz_map_free_vsd,
 };
 
 /*
  * Allocate a zero-filled raidz row with room for `cols` columns.
  *
  * Both rr_cols and rr_scols start out equal to `cols`.  Every column has
  * its shadow device index and shadow offset set to INT_MAX and UINT64_MAX
  * respectively, and rc_allow_repair is set unless the zio carries
  * ZIO_FLAG_DIO_READ.  The allocation uses KM_SLEEP.
  */
 raidz_row_t *
 vdev_raidz_row_alloc(int cols, zio_t *zio)
 {
 	raidz_row_t *row;
 
 	row = kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);
 	row->rr_cols = cols;
 	row->rr_scols = cols;
 
 	for (int idx = 0; idx < cols; idx++) {
 		raidz_col_t *col = row->rr_col + idx;
 
 		col->rc_shadow_devidx = INT_MAX;
 		col->rc_shadow_offset = UINT64_MAX;
 
 		/*
 		 * Self healing must stay disabled for Direct I/O reads:
 		 * nothing prevents the buffer from being modified while
 		 * the I/O is in flight, so a buffer whose checksum was
 		 * verified could be altered afterwards and bad data could
 		 * then be written out by a repair.
 		 */
 		if ((zio->io_flags & ZIO_FLAG_DIO_READ) == 0)
 			col->rc_allow_repair = 1;
 	}
 
 	return (row);
 }
 
 /*
  * Attach buffers to the columns of a single-row write map.
  *
  * zio:    the write zio; its io_abd supplies the data column contents and
  *         its io_size must equal the sum of the data column sizes.
  * rm:     the map to populate; it must contain exactly one row.
  * ashift: sector-size shift; a skip sector is (1 << ashift) bytes.
  *
  * Parity columns get newly allocated linear ABDs, and the first `nwrapped`
  * of them are enlarged by one zeroed sector.  Data columns reference
  * slices of zio->io_abd; columns that also carry a skip sector become gang
  * ABDs made of the data slice followed by a zero-filled sector.  On exit
  * the number of skip sectors accounted for is asserted to equal
  * rm->rm_nskip.
  */
 static void
 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
 {
 	int c;
 	int nwrapped = 0;
 	uint64_t off = 0;
 	raidz_row_t *rr = rm->rm_row[0];
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 	ASSERT3U(rm->rm_nrows, ==, 1);
 
 	/*
 	 * Pad any parity columns with additional space to account for skip
 	 * sectors.
 	 */
 	if (rm->rm_skipstart < rr->rr_firstdatacol) {
 		ASSERT0(rm->rm_skipstart);
 		nwrapped = rm->rm_nskip;
 	} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
 		nwrapped =
 		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
 	}
 
 	/*
 	 * Optional single skip sectors (rc_size == 0) will be handled in
 	 * vdev_raidz_io_start_write().
 	 *
 	 * The skip count therefore starts at the number of columns beyond
 	 * rr_cols (rr_scols - rr_cols) and is incremented below for every
 	 * skip sector attached to a parity or data column.
 	 */
 	int skipped = rr->rr_scols - rr->rr_cols;
 
 	/* Allocate buffers for the parity columns */
 	for (c = 0; c < rr->rr_firstdatacol; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		/*
 		 * Parity columns will pad out a linear ABD to account for
 		 * the skip sector. A linear ABD is used here because
 		 * parity calculations use the ABD buffer directly to calculate
 		 * parity. This avoids doing a memcpy back to the ABD after the
 		 * parity has been calculated. By issuing the parity column
 		 * with the skip sector we can reduce contention on the child
 		 * VDEV queue locks (vq_lock).
 		 */
 		if (c < nwrapped) {
 			rc->rc_abd = abd_alloc_linear(
 			    rc->rc_size + (1ULL << ashift), B_FALSE);
 			/* Zero only the extra sector that follows rc_size. */
 			abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
 			skipped++;
 		} else {
 			rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
 		}
 	}
 
 	/* c continues from rr_firstdatacol: the remaining columns are data. */
 	for (off = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
 		    zio->io_abd, off, rc->rc_size);
 
 		/*
 		 * Generate I/O for skip sectors to improve aggregation
 		 * continuity. We will use gang ABD's to reduce contention
 		 * on the child VDEV queue locks (vq_lock) by issuing
 		 * a single I/O that contains the data and skip sector.
 		 *
 		 * It is important to make sure that rc_size is not updated
 		 * even though we are adding a skip sector to the ABD. When
 		 * calculating the parity in vdev_raidz_generate_parity_row()
 		 * the rc_size is used to iterate through the ABD's. We can
 		 * not have zero'd out skip sectors used for calculating
 		 * parity for raidz, because those same sectors are not used
 		 * during reconstruction.
 		 */
 		if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
 			rc->rc_abd = abd_alloc_gang();
 			abd_gang_add(rc->rc_abd, abd, B_TRUE);
 			abd_gang_add(rc->rc_abd,
 			    abd_get_zeros(1ULL << ashift), B_TRUE);
 			skipped++;
 		} else {
 			rc->rc_abd = abd;
 		}
 		off += rc->rc_size;
 	}
 
 	/* All of the zio's data and every skip sector must be accounted for. */
 	ASSERT3U(off, ==, zio->io_size);
 	ASSERT3S(skipped, ==, rm->rm_nskip);
 }
 
 /*
  * Attach buffers to the columns of a single-row read map.
  *
  * Columns before rr_firstdatacol (the parity columns) get newly allocated
  * linear ABDs of rc_size bytes.  The remaining columns up to rr_cols are
  * pointed at consecutive rc_size-sized slices of zio->io_abd, starting at
  * offset 0.
  */
 static void
 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
 {
 	raidz_row_t *row = rm->rm_row[0];
 	int idx = 0;
 
 	ASSERT3U(rm->rm_nrows, ==, 1);
 
 	/* Parity columns get newly allocated linear ABDs. */
 	while (idx < row->rr_firstdatacol) {
 		raidz_col_t *pcol = &row->rr_col[idx];
 
 		pcol->rc_abd = abd_alloc_linear(pcol->rc_size, B_FALSE);
 		idx++;
 	}
 
 	/* Data columns reference consecutive slices of the zio's buffer. */
 	uint64_t data_off = 0;
 	while (idx < row->rr_cols) {
 		raidz_col_t *dcol = &row->rr_col[idx];
 
 		dcol->rc_abd = abd_get_offset_struct(&dcol->rc_abdstruct,
 		    zio->io_abd, data_off, dcol->rc_size);
 		data_off += dcol->rc_size;
 		idx++;
 	}
 }
 
 /*
  * Divides the IO evenly across all child vdevs; usually, dcols is
  * the number of children in the target vdev.
  *
  * zio:     the I/O being mapped; io_offset and io_size are converted to
  *          sectors with ashift, and io_type selects the write or read
  *          buffer setup.
  * ashift:  sector-size shift (a sector is 1 << ashift bytes).
  * dcols:   number of columns the I/O is spread across.
  * nparity: number of parity columns; becomes rr_firstdatacol.
  *
  * Returns a newly allocated (KM_SLEEP) raidz_map_t holding exactly one row,
  * with column device indices, offsets, sizes and buffers filled in and
  * rm_ops set from vdev_raidz_math_get_ops().
  *
  * Avoid inlining the function to keep vdev_raidz_io_start(), which
  * is this function's only caller, as small as possible on the stack.
  */
 noinline raidz_map_t *
 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
     uint64_t nparity)
 {
 	raidz_row_t *rr;
 	/* The starting RAIDZ (parent) vdev sector of the block. */
 	uint64_t b = zio->io_offset >> ashift;
 	/* The zio's size in units of the vdev's minimum sector size. */
 	uint64_t s = zio->io_size >> ashift;
 	/* The first column for this stripe. */
 	uint64_t f = b % dcols;
 	/* The starting byte offset on each child vdev. */
 	uint64_t o = (b / dcols) << ashift;
 	uint64_t acols, scols;
 
 	raidz_map_t *rm =
 	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
 	rm->rm_nrows = 1;
 
 	/*
 	 * "Quotient": The number of data sectors for this stripe on all but
 	 * the "big column" child vdevs that also contain "remainder" data.
 	 */
 	uint64_t q = s / (dcols - nparity);
 
 	/*
 	 * "Remainder": The number of partial stripe data sectors in this I/O.
 	 * This will add a sector to some, but not all, child vdevs.
 	 */
 	uint64_t r = s - q * (dcols - nparity);
 
 	/* The number of "big columns" - those which contain remainder data. */
 	uint64_t bc = (r == 0 ? 0 : r + nparity);
 
 	/*
 	 * The total number of data and parity sectors associated with
 	 * this I/O.
 	 */
 	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
 
 	/*
 	 * acols: The columns that will be accessed.
 	 * scols: The columns that will be accessed or skipped.
 	 */
 	if (q == 0) {
 		/* Our I/O request doesn't span all child vdevs. */
 		acols = bc;
 		scols = MIN(dcols, roundup(bc, nparity + 1));
 	} else {
 		acols = dcols;
 		scols = dcols;
 	}
 
 	ASSERT3U(acols, <=, scols);
 	/* The row is allocated for scols columns; rr_cols is then cut to acols. */
 	rr = vdev_raidz_row_alloc(scols, zio);
 	rm->rm_row[0] = rr;
 	rr->rr_cols = acols;
 	rr->rr_bigcols = bc;
 	rr->rr_firstdatacol = nparity;
 #ifdef ZFS_DEBUG
 	rr->rr_offset = zio->io_offset;
 	rr->rr_size = zio->io_size;
 #endif
 
 	uint64_t asize = 0;
 
 	/*
 	 * Assign each column its child index and byte offset, starting at
 	 * child f.  Columns that run past the last child wrap around to the
 	 * first child and move one sector further into it.
 	 */
 	for (uint64_t c = 0; c < scols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		uint64_t col = f + c;
 		uint64_t coff = o;
 		if (col >= dcols) {
 			col -= dcols;
 			coff += 1ULL << ashift;
 		}
 		rc->rc_devidx = col;
 		rc->rc_offset = coff;
 
 		/*
 		 * Skipped columns (c >= acols) carry no data, the first bc
 		 * columns carry q + 1 sectors, and the rest carry q sectors.
 		 */
 		if (c >= acols)
 			rc->rc_size = 0;
 		else if (c < bc)
 			rc->rc_size = (q + 1) << ashift;
 		else
 			rc->rc_size = q << ashift;
 
 		asize += rc->rc_size;
 	}
 
 	ASSERT3U(asize, ==, tot << ashift);
 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 	rm->rm_skipstart = bc;
 
 	/*
 	 * If all data stored spans all columns, there's a danger that parity
 	 * will always be on the same device and, since parity isn't read
 	 * during normal operation, that device's I/O bandwidth won't be
 	 * used effectively. We therefore switch the parity every 1MB.
 	 *
 	 * ... at least that was, ostensibly, the theory. As a practical
 	 * matter unless we juggle the parity between all devices evenly, we
 	 * won't see any benefit. Further, occasional writes that aren't a
 	 * multiple of the LCM of the number of children and the minimum
 	 * stripe width are sufficient to avoid pessimal behavior.
 	 * Unfortunately, this decision created an implicit on-disk format
 	 * requirement that we need to support for all eternity, but only
 	 * for single-parity RAID-Z.
 	 *
 	 * If we intend to skip a sector in the zeroth column for padding
 	 * we must make sure to note this swap. We will never intend to
 	 * skip the first column since at least one data and one parity
 	 * column must appear in each row.
 	 */
 	ASSERT(rr->rr_cols >= 2);
 	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
 
 	/*
 	 * Single parity with bit 20 of the offset set: swap the device index
 	 * and offset of columns 0 and 1.  `o` is reused as the temporary.
 	 */
 	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
 		uint64_t devidx = rr->rr_col[0].rc_devidx;
 		o = rr->rr_col[0].rc_offset;
 		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
 		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
 		rr->rr_col[1].rc_devidx = devidx;
 		rr->rr_col[1].rc_offset = o;
 		if (rm->rm_skipstart == 0)
 			rm->rm_skipstart = 1;
 	}
 
 	/* Attach the column buffers; every non-write type takes the read path. */
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		vdev_raidz_map_alloc_write(zio, rm, ashift);
 	} else {
 		vdev_raidz_map_alloc_read(zio, rm);
 	}
 	/* init RAIDZ parity ops */
 	rm->rm_ops = vdev_raidz_math_get_ops();
 
 	return (rm);
 }
 
 /*
  * Everything before reflow_offset_synced should have been moved to the new
  * location (read and write completed).  However, this may not yet be reflected
  * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
  * uberblock has not yet been written). If reflow is not in progress,
  * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
  * entirely before reflow_offset_synced, it will come from the new location.
  * Otherwise this row will come from the old location.  Therefore, rows that
  * straddle the reflow_offset_synced will come from the old location.
  *
  * For writes, reflow_offset_next is the next offset to copy.  If a sector has
  * been copied, but not yet reflected in the on-disk progress
  * (reflow_offset_synced), it will also be written to the new (already copied)
  * offset.
  */
 noinline raidz_map_t *
 vdev_raidz_map_alloc_expanded(zio_t *zio,
     uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
     uint64_t nparity, uint64_t reflow_offset_synced,
     uint64_t reflow_offset_next, boolean_t use_scratch)
 {
 	abd_t *abd = zio->io_abd;
 	uint64_t offset = zio->io_offset;
 	uint64_t size = zio->io_size;
 
 	/* The zio's size in units of the vdev's minimum sector size. */
 	uint64_t s = size >> ashift;
 
 	/*
 	 * "Quotient": The number of data sectors for this stripe on all but
 	 * the "big column" child vdevs that also contain "remainder" data.
 	 * AKA "full rows"
 	 */
 	uint64_t q = s / (logical_cols - nparity);
 
 	/*
 	 * "Remainder": The number of partial stripe data sectors in this I/O.
 	 * This will add a sector to some, but not all, child vdevs.
 	 */
 	uint64_t r = s - q * (logical_cols - nparity);
 
 	/* The number of "big columns" - those which contain remainder data. */
 	uint64_t bc = (r == 0 ? 0 : r + nparity);
 
 	/*
 	 * The total number of data and parity sectors associated with
 	 * this I/O.
 	 */
 	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
 
 	/* How many rows contain data (not skip) */
 	uint64_t rows = howmany(tot, logical_cols);
 	int cols = MIN(tot, logical_cols);
 
 	raidz_map_t *rm =
 	    kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
 	    KM_SLEEP);
 	rm->rm_nrows = rows;
 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 	rm->rm_skipstart = bc;
 	uint64_t asize = 0;
 
 	for (uint64_t row = 0; row < rows; row++) {
 		boolean_t row_use_scratch = B_FALSE;
 		raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio);
 		rm->rm_row[row] = rr;
 
 		/* The starting RAIDZ (parent) vdev sector of the row. */
 		uint64_t b = (offset >> ashift) + row * logical_cols;
 
 		/*
 		 * If we are in the middle of a reflow, and the copying has
 		 * not yet completed for any part of this row, then use the
 		 * old location of this row.  Note that reflow_offset_synced
 		 * reflects the i/o that's been completed, because it's
 		 * updated by a synctask, after zio_wait(spa_txg_zio[]).
 		 * This is sufficient for our check, even if that progress
 		 * has not yet been recorded to disk (reflected in
 		 * spa_ubsync).  Also note that we consider the last row to
 		 * be "full width" (`cols`-wide rather than `bc`-wide) for
 		 * this calculation. This causes a tiny bit of unnecessary
 		 * double-writes but is safe and simpler to calculate.
 		 */
 		int row_phys_cols = physical_cols;
 		if (b + cols > reflow_offset_synced >> ashift)
 			row_phys_cols--;
 		else if (use_scratch)
 			row_use_scratch = B_TRUE;
 
 		/* starting child of this row */
 		uint64_t child_id = b % row_phys_cols;
 		/* The starting byte offset on each child vdev. */
 		uint64_t child_offset = (b / row_phys_cols) << ashift;
 
 		/*
 		 * Note, rr_cols is the entire width of the block, even
 		 * if this row is shorter.  This is needed because parity
 		 * generation (for Q and R) needs to know the entire width,
 		 * because it treats the short row as though it was
 		 * full-width (and the "phantom" sectors were zero-filled).
 		 *
 		 * Another approach to this would be to set cols shorter
 		 * (to just the number of columns that we might do i/o to)
 		 * and have another mechanism to tell the parity generation
 		 * about the "entire width".  Reconstruction (at least
 		 * vdev_raidz_reconstruct_general()) would also need to
 		 * know about the "entire width".
 		 */
 		rr->rr_firstdatacol = nparity;
 #ifdef ZFS_DEBUG
 		/*
 		 * note: rr_size is PSIZE, not ASIZE
 		 */
 		rr->rr_offset = b << ashift;
 		rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
 #endif
 
 		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
 			if (child_id >= row_phys_cols) {
 				child_id -= row_phys_cols;
 				child_offset += 1ULL << ashift;
 			}
 			raidz_col_t *rc = &rr->rr_col[c];
 			rc->rc_devidx = child_id;
 			rc->rc_offset = child_offset;
 
 			/*
 			 * Get this from the scratch space if appropriate.
 			 * This only happens if we crashed in the middle of
 			 * raidz_reflow_scratch_sync() (while it's running,
 			 * the rangelock prevents us from doing concurrent
 			 * io), and even then only during zpool import or
 			 * when the pool is imported readonly.
 			 */
 			if (row_use_scratch)
 				rc->rc_offset -= VDEV_BOOT_SIZE;
 
 			uint64_t dc = c - rr->rr_firstdatacol;
 			if (c < rr->rr_firstdatacol) {
 				rc->rc_size = 1ULL << ashift;
 
 				/*
 				 * Parity sectors' rc_abd's are set below
 				 * after determining if this is an aggregation.
 				 */
 			} else if (row == rows - 1 && bc != 0 && c >= bc) {
 				/*
 				 * Past the end of the block (even including
 				 * skip sectors).  This sector is part of the
 				 * map so that we have full rows for p/q parity
 				 * generation.
 				 */
 				rc->rc_size = 0;
 				rc->rc_abd = NULL;
 			} else {
 				/* "data column" (col excluding parity) */
 				uint64_t off;
 
 				if (c < bc || r == 0) {
 					off = dc * rows + row;
 				} else {
 					off = r * rows +
 					    (dc - r) * (rows - 1) + row;
 				}
 				rc->rc_size = 1ULL << ashift;
 				rc->rc_abd = abd_get_offset_struct(
 				    &rc->rc_abdstruct, abd, off << ashift,
 				    rc->rc_size);
 			}
 
 			if (rc->rc_size == 0)
 				continue;
 
 			/*
 			 * If any part of this row is in both old and new
 			 * locations, the primary location is the old
 			 * location. If this sector was already copied to the
 			 * new location, we need to also write to the new,
 			 * "shadow" location.
 			 *
 			 * Note, `row_phys_cols != physical_cols` indicates
 			 * that the primary location is the old location.
 			 * `b+c < reflow_offset_next` indicates that the copy
 			 * to the new location has been initiated. We know
 			 * that the copy has completed because we have the
 			 * rangelock, which is held exclusively while the
 			 * copy is in progress.
 			 */
 			if (row_use_scratch ||
 			    (row_phys_cols != physical_cols &&
 			    b + c < reflow_offset_next >> ashift)) {
 				rc->rc_shadow_devidx = (b + c) % physical_cols;
 				rc->rc_shadow_offset =
 				    ((b + c) / physical_cols) << ashift;
 				if (row_use_scratch)
 					rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
 			}
 
 			asize += rc->rc_size;
 		}
 
 		/*
 		 * See comment in vdev_raidz_map_alloc()
 		 */
 		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
 		    (offset & (1ULL << 20))) {
 			ASSERT(rr->rr_cols >= 2);
 			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
 
 			int devidx0 = rr->rr_col[0].rc_devidx;
 			uint64_t offset0 = rr->rr_col[0].rc_offset;
 			int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
 			uint64_t shadow_offset0 =
 			    rr->rr_col[0].rc_shadow_offset;
 
 			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
 			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
 			rr->rr_col[0].rc_shadow_devidx =
 			    rr->rr_col[1].rc_shadow_devidx;
 			rr->rr_col[0].rc_shadow_offset =
 			    rr->rr_col[1].rc_shadow_offset;
 
 			rr->rr_col[1].rc_devidx = devidx0;
 			rr->rr_col[1].rc_offset = offset0;
 			rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
 			rr->rr_col[1].rc_shadow_offset = shadow_offset0;
 		}
 	}
 	ASSERT3U(asize, ==, tot << ashift);
 
 	/*
 	 * Determine if the block is contiguous, in which case we can use
 	 * an aggregation.
 	 */
 	if (rows >= raidz_io_aggregate_rows) {
 		rm->rm_nphys_cols = physical_cols;
 		rm->rm_phys_col =
 		    kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
 		    KM_SLEEP);
 
 		/*
 		 * Determine the aggregate io's offset and size, and check
 		 * that the io is contiguous.
 		 */
 		for (int i = 0;
 		    i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 			for (int c = 0; c < rr->rr_cols; c++) {
 				raidz_col_t *rc = &rr->rr_col[c];
 				raidz_col_t *prc =
 				    &rm->rm_phys_col[rc->rc_devidx];
 
 				if (rc->rc_size == 0)
 					continue;
 
 				if (prc->rc_size == 0) {
 					ASSERT0(prc->rc_offset);
 					prc->rc_offset = rc->rc_offset;
 				} else if (prc->rc_offset + prc->rc_size !=
 				    rc->rc_offset) {
 					/*
 					 * This block is not contiguous and
 					 * therefore can't be aggregated.
 					 * This is expected to be rare, so
 					 * the cost of allocating and then
 					 * freeing rm_phys_col is not
 					 * significant.
 					 */
 					kmem_free(rm->rm_phys_col,
 					    sizeof (raidz_col_t) *
 					    rm->rm_nphys_cols);
 					rm->rm_phys_col = NULL;
 					rm->rm_nphys_cols = 0;
 					break;
 				}
 				prc->rc_size += rc->rc_size;
 			}
 		}
 	}
 	if (rm->rm_phys_col != NULL) {
 		/*
 		 * Allocate aggregate ABD's.
 		 */
 		for (int i = 0; i < rm->rm_nphys_cols; i++) {
 			raidz_col_t *prc = &rm->rm_phys_col[i];
 
 			prc->rc_devidx = i;
 
 			if (prc->rc_size == 0)
 				continue;
 
 			prc->rc_abd =
 			    abd_alloc_linear(rm->rm_phys_col[i].rc_size,
 			    B_FALSE);
 		}
 
 		/*
 		 * Point the parity abd's into the aggregate abd's.
 		 */
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 			for (int c = 0; c < rr->rr_firstdatacol; c++) {
 				raidz_col_t *rc = &rr->rr_col[c];
 				raidz_col_t *prc =
 				    &rm->rm_phys_col[rc->rc_devidx];
 				rc->rc_abd =
 				    abd_get_offset_struct(&rc->rc_abdstruct,
 				    prc->rc_abd,
 				    rc->rc_offset - prc->rc_offset,
 				    rc->rc_size);
 			}
 		}
 	} else {
 		/*
 		 * Allocate new abd's for the parity sectors.
 		 */
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 			for (int c = 0; c < rr->rr_firstdatacol; c++) {
 				raidz_col_t *rc = &rr->rr_col[c];
 				rc->rc_abd =
 				    abd_alloc_linear(rc->rc_size,
 				    B_TRUE);
 			}
 		}
 	}
 	/* init RAIDZ parity ops */
 	rm->rm_ops = vdev_raidz_math_get_ops();
 
 	return (rm);
 }
 
 /*
  * Cursor state handed (as the private argument) to the parity accumulation
  * callbacks vdev_raidz_p_func(), vdev_raidz_pq_func() and
  * vdev_raidz_pqr_func().  Each non-NULL pointer addresses the next 64-bit
  * word of the corresponding parity buffer; the callbacks advance the
  * pointers they use by one word per source word consumed.  The callbacks
  * assert that exactly the pointers for the parity levels they generate are
  * non-NULL.
  */
 struct pqr_struct {
 	uint64_t *p;	/* next word of P parity */
 	uint64_t *q;	/* next word of Q parity, or NULL */
 	uint64_t *r;	/* next word of R parity, or NULL */
 };
 
 /*
  * Iteration callback (used with abd_iterate_func()) that XORs one chunk of
  * a data column into the P parity buffer.
  *
  * buf/size describe the source chunk; only whole 64-bit words are consumed.
  * private is a struct pqr_struct whose p pointer is advanced past every
  * word that was folded in.  Always returns 0.
  */
 static int
 vdev_raidz_p_func(void *buf, size_t size, void *private)
 {
 	struct pqr_struct *pqr = private;
 	const uint64_t *word = buf;
 	int nwords = size / sizeof (word[0]);
 
 	ASSERT(pqr->p && !pqr->q && !pqr->r);
 
 	for (int left = nwords; left > 0; left--) {
 		*pqr->p ^= *word;
 		word++;
 		pqr->p++;
 	}
 
 	return (0);
 }
 
 /*
  * Iteration callback (used with abd_iterate_func()) that folds one chunk
  * of a data column into both the P and Q parity buffers.
  *
  * For every whole 64-bit source word: P ^= src, and Q is first run through
  * VDEV_RAIDZ_64MUL_2() and then XORed with src.  private is a
  * struct pqr_struct; its p and q pointers are advanced one word per source
  * word.  Always returns 0.
  */
 static int
 vdev_raidz_pq_func(void *buf, size_t size, void *private)
 {
 	struct pqr_struct *pqr = private;
 	const uint64_t *word = buf;
 	uint64_t mask;
 	int nwords = size / sizeof (word[0]);
 
 	ASSERT(pqr->p && pqr->q && !pqr->r);
 
 	for (int left = nwords; left > 0; left--) {
 		*pqr->p ^= *word;
 
 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
 		*pqr->q ^= *word;
 
 		word++;
 		pqr->p++;
 		pqr->q++;
 	}
 
 	return (0);
 }
 
 /*
  * Iteration callback (used with abd_iterate_func()) that folds one chunk
  * of a data column into the P, Q and R parity buffers.
  *
  * For every whole 64-bit source word: P ^= src; Q is run through
  * VDEV_RAIDZ_64MUL_2() and then XORed with src; R is run through
  * VDEV_RAIDZ_64MUL_4() and then XORed with src.  private is a
  * struct pqr_struct; all three of its pointers are advanced one word per
  * source word.  Always returns 0.
  */
 static int
 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
 {
 	struct pqr_struct *pqr = private;
 	const uint64_t *word = buf;
 	uint64_t mask;
 	int nwords = size / sizeof (word[0]);
 
 	ASSERT(pqr->p && pqr->q && pqr->r);
 
 	for (int left = nwords; left > 0; left--) {
 		*pqr->p ^= *word;
 
 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
 		*pqr->q ^= *word;
 
 		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
 		*pqr->r ^= *word;
 
 		word++;
 		pqr->p++;
 		pqr->q++;
 		pqr->r++;
 	}
 
 	return (0);
 }
 
 /*
  * Compute single (P) parity for one row: the P buffer is seeded with a
  * copy of the first data column and every following data column is XORed
  * into it through vdev_raidz_p_func().
  */
 static void
 vdev_raidz_generate_parity_p(raidz_row_t *rr)
 {
 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 
 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (c == rr->rr_firstdatacol) {
 			/* First data column: plain copy, nothing to XOR yet. */
 			abd_copy_to_buf(p, rc->rc_abd, rc->rc_size);
 			continue;
 		}
 
 		/* Fresh cursor per column so each one starts at word 0 of P. */
 		struct pqr_struct pqr = { p, NULL, NULL };
 		(void) abd_iterate_func(rc->rc_abd, 0, rc->rc_size,
 		    vdev_raidz_p_func, &pqr);
 	}
 }
 
 /*
  * Compute double (P and Q) parity for one row.
  *
  * The first data column seeds both parity buffers; each later column is
  * folded in through vdev_raidz_pq_func().  Columns shorter than the parity
  * columns are treated as if they were padded with zeros.
  */
 static void
 vdev_raidz_generate_parity_pq(raidz_row_t *rr)
 {
 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
 	/* Number of 64-bit words in a parity column. */
 	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
 
 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
 
 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		/* Number of 64-bit words in this data column. */
 		uint64_t ccnt = rc->rc_size / sizeof (p[0]);
 
 		if (c == rr->rr_firstdatacol) {
 			ASSERT(ccnt == pcnt || ccnt == 0);
 
 			/* Seed P from the data, then Q from P. */
 			abd_copy_to_buf(p, rc->rc_abd, rc->rc_size);
 			(void) memcpy(q, p, rc->rc_size);
 
 			/* Zero whatever the (possibly empty) column left. */
 			for (uint64_t i = ccnt; i < pcnt; i++) {
 				p[i] = 0;
 				q[i] = 0;
 			}
 			continue;
 		}
 
 		struct pqr_struct pqr = { p, q, NULL };
 
 		ASSERT(ccnt <= pcnt);
 		(void) abd_iterate_func(rc->rc_abd, 0, rc->rc_size,
 		    vdev_raidz_pq_func, &pqr);
 
 		/*
 		 * A short column counts as zero-filled past its end: P is
 		 * unaffected by XORing zeros, but Q still has to receive its
 		 * per-column multiplication.
 		 */
 		uint64_t mask;
 		for (uint64_t i = ccnt; i < pcnt; i++) {
 			VDEV_RAIDZ_64MUL_2(q[i], mask);
 		}
 	}
 }
 
 /*
  * Compute triple (P, Q and R) parity for one row.
  *
  * The first data column seeds all three parity buffers; each later column
  * is folded in through vdev_raidz_pqr_func().  Columns shorter than the
  * parity columns are treated as if they were padded with zeros.
  */
 static void
 vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
 {
 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
 	uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
 	/* Number of 64-bit words in a parity column. */
 	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
 
 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
 	    rr->rr_col[VDEV_RAIDZ_R].rc_size);
 
 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		/* Number of 64-bit words in this data column. */
 		uint64_t ccnt = rc->rc_size / sizeof (p[0]);
 
 		if (c == rr->rr_firstdatacol) {
 			ASSERT(ccnt == pcnt || ccnt == 0);
 
 			/* Seed P from the data, then Q and R from P. */
 			abd_copy_to_buf(p, rc->rc_abd, rc->rc_size);
 			(void) memcpy(q, p, rc->rc_size);
 			(void) memcpy(r, p, rc->rc_size);
 
 			/* Zero whatever the (possibly empty) column left. */
 			for (uint64_t i = ccnt; i < pcnt; i++) {
 				p[i] = 0;
 				q[i] = 0;
 				r[i] = 0;
 			}
 			continue;
 		}
 
 		struct pqr_struct pqr = { p, q, r };
 
 		ASSERT(ccnt <= pcnt);
 		(void) abd_iterate_func(rc->rc_abd, 0, rc->rc_size,
 		    vdev_raidz_pqr_func, &pqr);
 
 		/*
 		 * A short column counts as zero-filled past its end: P is
 		 * unaffected by XORing zeros, but Q and R still have to
 		 * receive their per-column multiplications.
 		 */
 		uint64_t mask;
 		for (uint64_t i = ccnt; i < pcnt; i++) {
 			VDEV_RAIDZ_64MUL_2(q[i], mask);
 			VDEV_RAIDZ_64MUL_4(r[i], mask);
 		}
 	}
 }
 
 /*
  * Generate the parity columns of a single row.  The number of parity
  * columns is given by rr_firstdatacol (1, 2 or 3); any other value panics.
  *
  * vdev_raidz_math_generate() is tried first; the scalar generators in this
  * file run only when it returns RAIDZ_ORIGINAL_IMPL.
  */
 void
 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
 {
 	/*
 	 * A row with no columns is a pad-only row: the block is being
 	 * handled one row at a time (its logical and physical widths differ
 	 * because of RAIDZ expansion) and there is no parity to produce.
 	 */
 	if (rr->rr_cols == 0)
 		return;
 
 	/* Done if the math implementation handled the row itself. */
 	if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
 		return;
 
 	if (rr->rr_firstdatacol == 1) {
 		vdev_raidz_generate_parity_p(rr);
 	} else if (rr->rr_firstdatacol == 2) {
 		vdev_raidz_generate_parity_pq(rr);
 	} else if (rr->rr_firstdatacol == 3) {
 		vdev_raidz_generate_parity_pqr(rr);
 	} else {
 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
 	}
 }
 
 /*
  * Generate parity for every row of the map, in row order.
  */
 void
 vdev_raidz_generate_parity(raidz_map_t *rm)
 {
 	for (int row = 0; row < rm->rm_nrows; row++)
 		vdev_raidz_generate_parity_row(rm, rm->rm_row[row]);
 }
 
 /*
  * Two-buffer iteration callback: XOR the source chunk into the destination
  * chunk, one 64-bit word at a time.  Only whole words are processed; any
  * trailing bytes beyond the last full word are left untouched.  The
  * private argument is unused.  Always returns 0.
  */
 static int
 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
 {
 	(void) private;
 	uint64_t *out = dbuf;
 	uint64_t *in = sbuf;
 	int nwords = size / sizeof (in[0]);
 
 	for (int left = nwords; left > 0; left--, out++, in++)
 		*out ^= *in;
 
 	return (0);
 }
 
 /*
  * Two-buffer iteration callback used while rebuilding a column from Q.
  * Every whole 64-bit destination word is run through VDEV_RAIDZ_64MUL_2()
  * and then XORed with the matching source word.  The private argument is
  * unused.  Always returns 0.
  */
 static int
 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
     void *private)
 {
 	(void) private;
 	uint64_t *out = dbuf;
 	uint64_t *in = sbuf;
 	uint64_t mask;
 	int nwords = size / sizeof (out[0]);
 
 	for (int w = 0; w < nwords; w++) {
 		VDEV_RAIDZ_64MUL_2(out[w], mask);
 		out[w] ^= in[w];
 	}
 
 	return (0);
 }
 
 /*
  * Single-buffer companion of vdev_raidz_reconst_q_pre_func() for the part
  * of the destination that has no matching source data: every whole 64-bit
  * word is run through VDEV_RAIDZ_64MUL_2() and nothing is XORed in.  The
  * private argument is unused.  Always returns 0.
  */
 static int
 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
 {
 	(void) private;
 	uint64_t *words = buf;
 	uint64_t mask;
 	int nwords = size / sizeof (words[0]);
 
 	for (int w = 0; w < nwords; w++) {
 		/* same step vdev_raidz_reconst_q_pre_func() applies to dst */
 		VDEV_RAIDZ_64MUL_2(words[w], mask);
 	}
 
 	return (0);
 }
 
 /*
  * Private state for vdev_raidz_reconst_q_post_func().
  */
 struct reconst_q_struct {
 	uint64_t *q;	/* next word of the Q buffer; advanced by the callback */
 	int exp;	/* exponent given to vdev_raidz_exp2() for every byte */
 };
 
 /*
  * Final pass of the Q-based reconstruction.  For every whole 64-bit word
  * of the destination: XOR in the next word of Q (taken from the
  * struct reconst_q_struct passed as private, whose q pointer is advanced),
  * then replace each of the word's eight bytes with
  * vdev_raidz_exp2(byte, exp).  Always returns 0.
  */
 static int
 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
 {
 	struct reconst_q_struct *rq = private;
 	uint64_t *words = buf;
 	int nwords = size / sizeof (words[0]);
 
 	for (int w = 0; w < nwords; w++) {
 		uint8_t *bytes = (uint8_t *)&words[w];
 
 		words[w] ^= *rq->q;
 		for (int k = 0; k < 8; k++)
 			bytes[k] = vdev_raidz_exp2(bytes[k], rq->exp);
 
 		rq->q++;
 	}
 
 	return (0);
 }
 
 /*
  * Private cursor state shared by vdev_raidz_reconst_pq_func() and
  * vdev_raidz_reconst_pq_tail_func().  All four byte pointers are advanced
  * by one for every byte those callbacks process.
  */
 struct reconst_pq_struct {
 	uint8_t *p;	/* stored P parity */
 	uint8_t *q;	/* stored Q parity */
 	uint8_t *pxy;	/* P regenerated with columns x and y as zeros */
 	uint8_t *qxy;	/* Q regenerated with columns x and y as zeros */
 	int aexp;	/* vdev_raidz_exp2() exponent applied to (P ^ Pxy) */
 	int bexp;	/* vdev_raidz_exp2() exponent applied to (Q ^ Qxy) */
 };
 
 /*
  * Two-buffer iteration callback that rebuilds one chunk of both missing
  * data columns, byte by byte, from the state in a struct reconst_pq_struct
  * (passed as private):
  *
  *	x = exp2(P ^ Pxy, aexp) ^ exp2(Q ^ Qxy, bexp)
  *	y = P ^ Pxy ^ x
  *
  * xbuf receives x and ybuf receives y; size is the chunk length in bytes.
  * The four parity cursors in the private state are advanced by size bytes.
  * Always returns 0.
  */
 static int
 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
 {
 	struct reconst_pq_struct *rpq = private;
 	uint8_t *xd = xbuf;
 	uint8_t *yd = ybuf;
 
 	/* size_t index: matches the type of size, no signed/unsigned mix. */
 	for (size_t i = 0; i < size;
 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
 		/* y depends on the x byte that was just written. */
 		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
 	}
 
 	return (0);
 }
 
 /*
  * Single-buffer companion of vdev_raidz_reconst_pq_func() for the part of
  * column x that extends past the end of column y: only x is computed,
  *
  *	x = exp2(P ^ Pxy, aexp) ^ exp2(Q ^ Qxy, bexp)
  *
  * using the struct reconst_pq_struct passed as private.  size is the chunk
  * length in bytes and the four parity cursors are advanced by that many
  * bytes.  Always returns 0.
  */
 static int
 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
 {
 	struct reconst_pq_struct *rpq = private;
 	uint8_t *xd = xbuf;
 
 	/* size_t index: matches the type of size, no signed/unsigned mix. */
 	for (size_t i = 0; i < size;
 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
 		/* same operation as vdev_raidz_reconst_pq_func() on xd */
 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
 	}
 
 	return (0);
 }
 
 /*
  * Rebuild one missing data column (tgts[0]) from P parity: the target is
  * loaded with P and then every other data column is XORed into it.
  * Exactly one target is expected (ntgts == 1).
  */
 static void
 vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
 {
 	int x = tgts[0];
 
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
 		zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
 
 	ASSERT3U(ntgts, ==, 1);
 	ASSERT3U(x, >=, rr->rr_firstdatacol);
 	ASSERT3U(x, <, rr->rr_cols);
 
 	ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
 
 	abd_t *parity = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
 	abd_t *target = rr->rr_col[x].rc_abd;
 
 	/* Start from P, limited to the length of the target column. */
 	abd_copy_from_buf(target, abd_to_buf(parity), rr->rr_col[x].rc_size);
 
 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		if (c == x)
 			continue;
 
 		/* Only the overlap of the two columns contributes. */
 		uint64_t len = MIN(rr->rr_col[x].rc_size,
 		    rr->rr_col[c].rc_size);
 
 		(void) abd_iterate_func2(target, rr->rr_col[c].rc_abd, 0, 0,
 		    len, vdev_raidz_reconst_p_func, NULL);
 	}
 }
 
 /*
  * Rebuild one missing data column (tgts[0]) of the row from the Q parity
  * column and the remaining data columns.  Exactly one target is expected.
  *
  * The loop accumulates, directly in the target column's buffer, a Q-style
  * sum over the data columns in which the target itself contributes nothing
  * (its contribution length is forced to 0).  The result is then XORed with
  * the stored Q and every byte is passed through vdev_raidz_exp2() with
  * exponent 255 - (rr_cols - 1 - x) -- presumably dividing out the target
  * column's Q coefficient; confirm against vdev_raidz_exp2().
  */
 static void
 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
 {
 	int x = tgts[0];
 	int c, exp;
 	abd_t *dst, *src;
 
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
 		zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
 
 	ASSERT(ntgts == 1);
 
 	ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
 
 	for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		/*
 		 * Bytes of column c that take part: none for the target
 		 * itself, otherwise the overlap with the target column.
 		 */
 		uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
 		    rr->rr_col[c].rc_size);
 
 		src = rr->rr_col[c].rc_abd;
 		dst = rr->rr_col[x].rc_abd;
 
 		if (c == rr->rr_firstdatacol) {
 			/*
 			 * First data column: initialize the accumulator with
 			 * its data and zero whatever it does not cover.
 			 */
 			abd_copy(dst, src, size);
 			if (rr->rr_col[x].rc_size > size) {
 				abd_zero_off(dst, size,
 				    rr->rr_col[x].rc_size - size);
 			}
 		} else {
 			ASSERT3U(size, <=, rr->rr_col[x].rc_size);
 			/* Covered part: multiply accumulator, XOR in data. */
 			(void) abd_iterate_func2(dst, src, 0, 0, size,
 			    vdev_raidz_reconst_q_pre_func, NULL);
 			/* Uncovered part: multiply only (data acts as 0). */
 			(void) abd_iterate_func(dst,
 			    size, rr->rr_col[x].rc_size - size,
 			    vdev_raidz_reconst_q_pre_tail_func, NULL);
 		}
 	}
 
 	src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
 	dst = rr->rr_col[x].rc_abd;
 	exp = 255 - (rr->rr_cols - 1 - x);
 
 	/* XOR with the stored Q, then scale every byte by exp. */
 	struct reconst_q_struct rq = { abd_to_buf(src), exp };
 	(void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
 	    vdev_raidz_reconst_q_post_func, &rq);
 }
 
 /*
  * Rebuild two missing data columns, x = tgts[0] and y = tgts[1] (x < y),
  * from the P and Q parity columns and the remaining data columns.
  *
  * While it runs, the function temporarily replaces the row's P and Q
  * buffers with scratch buffers and sets the sizes of columns x and y to
  * zero; all four are restored before it returns.
  */
 static void
 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
 {
 	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
 	abd_t *pdata, *qdata;
 	uint64_t xsize, ysize;
 	int x = tgts[0];
 	int y = tgts[1];
 	abd_t *xd, *yd;
 
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
 		zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);
 
 	ASSERT(ntgts == 2);
 	ASSERT(x < y);
 	ASSERT(x >= rr->rr_firstdatacol);
 	ASSERT(y < rr->rr_cols);
 
 	ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
 
 	/*
 	 * Move the parity data aside -- we're going to compute parity as
 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
 	 * reuse the parity generation mechanism without trashing the actual
 	 * parity so we make those columns appear to be full of zeros by
 	 * setting their lengths to zero.
 	 */
 	pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
 	qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
 	xsize = rr->rr_col[x].rc_size;
 	ysize = rr->rr_col[y].rc_size;
 
 	/* Scratch buffers that will receive Pxy and Qxy. */
 	rr->rr_col[VDEV_RAIDZ_P].rc_abd =
 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
 	rr->rr_col[x].rc_size = 0;
 	rr->rr_col[y].rc_size = 0;
 
 	vdev_raidz_generate_parity_pq(rr);
 
 	/* Put the real column sizes back before touching x and y. */
 	rr->rr_col[x].rc_size = xsize;
 	rr->rr_col[y].rc_size = ysize;
 
 	p = abd_to_buf(pdata);
 	q = abd_to_buf(qdata);
 	pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 	qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
 	xd = rr->rr_col[x].rc_abd;
 	yd = rr->rr_col[y].rc_abd;
 
 	/*
 	 * We now have:
 	 *	Pxy = P + D_x + D_y
 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
 	 *
 	 * We can then solve for D_x:
 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
 	 * where
 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
 	 *	B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
 	 *
 	 * With D_x in hand, we can easily solve for D_y:
 	 *	D_y = P + Pxy + D_x
 	 *
 	 * Below, ndevs is rr_cols, `a` is 2^(x - y), `b` is
 	 * 2^-(ndevs - 1 - x) and `tmp` is the exponent of (a + 1)^-1; all
 	 * exponents are taken modulo 255.
 	 */
 
 	a = vdev_raidz_pow2[255 + x - y];
 	b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
 	tmp = 255 - vdev_raidz_log2[a ^ 1];
 
 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
 
 	ASSERT3U(xsize, >=, ysize);
 	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
 
 	/*
 	 * The first ysize bytes yield both D_x and D_y; the remainder of x
 	 * (if it is longer than y) yields D_x only.  The cursors in rpq
 	 * carry over from the first call to the second.
 	 */
 	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
 	    vdev_raidz_reconst_pq_func, &rpq);
 	(void) abd_iterate_func(xd, ysize, xsize - ysize,
 	    vdev_raidz_reconst_pq_tail_func, &rpq);
 
 	abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 	abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
 
 	/*
 	 * Restore the saved parity data.
 	 */
 	rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
 }
 
 /*
  * In the general case of reconstruction, we must solve the system of linear
  * equations defined by the coefficients used to generate parity as well as
  * the contents of the data and parity disks. This can be expressed with
  * vectors for the original data (D) and the actual data (d) and parity (p)
  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
  *
  *            __   __                     __     __
  *            |     |         __     __   |  p_0  |
  *            |  V  |         |  D_0  |   | p_m-1 |
  *            |     |    x    |   :   | = |  d_0  |
  *            |  I  |         | D_n-1 |   |   :   |
  *            |     |         ~~     ~~   | d_n-1 |
  *            ~~   ~~                     ~~     ~~
  *
  * I is simply a square identity matrix of size n, and V is a vandermonde
  * matrix defined by the coefficients we chose for the various parity columns
  * (1, 2, 4). Note that these values were chosen for simplicity, speedy
  * computation, and linear separability.
  *
  *      __               __               __     __
  *      |   1   ..  1 1 1 |               |  p_0  |
  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
  *      |   :       : : : |   |   :   |   |  d_2  |
  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
  *      |   0   ..  0 0 1 |               | d_n-1 |
  *      ~~               ~~               ~~     ~~
  *
  * Note that I, V, d, and p are known. To compute D, we must invert the
  * matrix and use the known data and parity values to reconstruct the unknown
  * data values. We begin by removing the rows in V|I and d|p that correspond
  * to failed or missing columns; we then make V|I square (n x n) and d|p
  * sized n by removing rows corresponding to unused parity from the bottom up
  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
  * using Gauss-Jordan elimination. In the example below we use m=3 parity
  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
  *           __                               __
  *           |  1   1   1   1   1   1   1   1  |
  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
  *           |  19 205 116  29  64  16  4   1  |      / /
  *           |  1   0   0   0   0   0   0   0  |     / /
  *           |  0   1   0   0   0   0   0   0  | <--' /
  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
  *           |  0   0   0   1   0   0   0   0  |
  *           |  0   0   0   0   1   0   0   0  |
  *           |  0   0   0   0   0   1   0   0  |
  *           |  0   0   0   0   0   0   1   0  |
  *           |  0   0   0   0   0   0   0   1  |
  *           ~~                               ~~
  *           __                               __
  *           |  1   1   1   1   1   1   1   1  |
  *           |  19 205 116  29  64  16  4   1  |
  *           |  1   0   0   0   0   0   0   0  |
  *  (V|I)' = |  0   0   0   1   0   0   0   0  |
  *           |  0   0   0   0   1   0   0   0  |
  *           |  0   0   0   0   0   1   0   0  |
  *           |  0   0   0   0   0   0   1   0  |
  *           |  0   0   0   0   0   0   0   1  |
  *           ~~                               ~~
  *
  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
  * matrix is not singular.
  * __                                                                 __
  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  *                   __                               __
  *                   |  0   0   1   0   0   0   0   0  |
  *                   | 167 100  5   41 159 169 217 208 |
  *                   | 166 100  4   40 158 168 216 209 |
  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
  *                   |  0   0   0   0   1   0   0   0  |
  *                   |  0   0   0   0   0   1   0   0  |
  *                   |  0   0   0   0   0   0   1   0  |
  *                   |  0   0   0   0   0   0   0   1  |
  *                   ~~                               ~~
  *
  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
  * of the missing data.
  *
  * As is apparent from the example above, the only non-trivial rows in the
  * inverse matrix correspond to the data disks that we're trying to
  * reconstruct. Indeed, those are the only rows we need as the others would
  * only be useful for reconstructing data known or assumed to be valid. For
  * that reason, we only build the coefficients in the rows that correspond to
  * targeted columns.
  */
 
 /*
  * Fill in the rows of the dispersal matrix that are needed for a
  * reconstruction.
  *
  * n is the number of data columns in the row, map[0..nmap-1] names the
  * parity columns to use (each entry 0, 1 or 2) and rows[i] receives the n
  * coefficients for map[i].  Entry j of a row is vdev_raidz_pow2[] indexed
  * by map[i] * (n - 1 - j), reduced modulo 255.
  */
 static void
 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
     uint8_t **rows)
 {
 	ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
 
 	/*
 	 * Fill in the missing rows of interest.
 	 */
 	for (int i = 0; i < nmap; i++) {
 		uint8_t *row = rows[i];
 
 		ASSERT3S(0, <=, map[i]);
 		ASSERT3S(map[i], <=, 2);
 
 		/* Start one step above the first column's exponent. */
 		int pow = map[i] * n;
 		if (pow > 255)
 			pow -= 255;
 		ASSERT(pow <= 255);
 
 		/* Step the exponent down by map[i] per column, mod 255. */
 		for (int j = 0; j < n; j++) {
 			pow -= map[i];
 			if (pow < 0)
 				pow += 255;
 			row[j] = vdev_raidz_pow2[pow];
 		}
 	}
 }
 
 /*
  * Compute the rows of the inverse matrix that correspond to the missing
  * data columns, using Gauss-Jordan elimination.
  *
  *	n		width of every row in rows[] and invrows[]
  *	nmissing	number of rows in rows[] and invrows[]
  *	missing		for each row, the index (within the row) of the
  *			column being solved for
  *	rows		coefficient rows; reduced in place so that, on
  *			return, row i holds a single 1 at missing[i]
  *	invrows		output; receives the corresponding inverse rows
  *	used		column indices in use: the first nmissing entries
  *			are parity columns, the remainder are data columns
  *			(both properties are asserted below)
  */
 static void
 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
 {
 	int i, j, ii, jj;
 	uint8_t log;
 
 	/*
 	 * Assert that the first nmissing entries from the array of used
 	 * columns correspond to parity columns and that subsequent entries
 	 * correspond to data columns.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		ASSERT3S(used[i], <, rr->rr_firstdatacol);
 	}
 	for (; i < n; i++) {
 		ASSERT3S(used[i], >=, rr->rr_firstdatacol);
 	}
 
 	/*
 	 * First initialize the storage where we'll compute the inverse rows.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < n; j++) {
 			invrows[i][j] = (i == j) ? 1 : 0;
 		}
 	}
 
 	/*
 	 * Subtract all trivial rows from the rows of consequence.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = nmissing; j < n; j++) {
 			ASSERT3U(used[j], >=, rr->rr_firstdatacol);
 			/* Index of this data column within a row. */
 			jj = used[j] - rr->rr_firstdatacol;
 			ASSERT3S(jj, <, n);
 			/* Move the coefficient over to the inverse side. */
 			invrows[i][j] = rows[i][jj];
 			rows[i][jj] = 0;
 		}
 	}
 
 	/*
 	 * For each of the rows of interest, we must normalize it and subtract
 	 * a multiple of it from the other rows.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < missing[i]; j++) {
 			ASSERT0(rows[i][j]);
 		}
 		ASSERT3U(rows[i][missing[i]], !=, 0);
 
 		/*
 		 * Compute the inverse of the first element and multiply each
 		 * element in the row by that value.
 		 */
 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
 
 		for (j = 0; j < n; j++) {
 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
 		}
 
 		/* Eliminate column missing[i] from every other row. */
 		for (ii = 0; ii < nmissing; ii++) {
 			if (i == ii)
 				continue;
 
 			ASSERT3U(rows[ii][missing[i]], !=, 0);
 
 			log = vdev_raidz_log2[rows[ii][missing[i]]];
 
 			for (j = 0; j < n; j++) {
 				rows[ii][j] ^=
 				    vdev_raidz_exp2(rows[i][j], log);
 				invrows[ii][j] ^=
 				    vdev_raidz_exp2(invrows[i][j], log);
 			}
 		}
 	}
 
 	/*
 	 * Verify that the data that is left in the rows are properly part of
 	 * an identity matrix.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < n; j++) {
 			if (j == missing[i]) {
 				ASSERT3U(rows[i][j], ==, 1);
 			} else {
 				ASSERT0(rows[i][j]);
 			}
 		}
 	}
 }
 
 /*
  * Rebuild the missing data columns by multiplying the inverse rows with
  * the contents of the columns listed in used[].
  *
  *	n		number of entries in used[] / width of invrows[]
  *	nmissing	number of columns to rebuild
  *	missing		indices of the columns to rebuild, relative to
  *			rr_firstdatacol
  *	invrows		inverse rows, as produced by
  *			vdev_raidz_matrix_invert()
  *	used		indices (into rr_col[]) of the source columns
  *
  * For source column i and every byte position x, the byte is multiplied by
  * invrows[cc][i] (done by adding logarithms, modulo 255) and the product
  * is stored into (i == 0) or XORed into (i > 0) byte x of each missing
  * column cc that is long enough to contain it.
  */
 static void
 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
     int *missing, uint8_t **invrows, const uint8_t *used)
 {
 	int i, j, x, cc, c;
 	uint8_t *src;
 	uint64_t ccount;
 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
 	uint8_t log = 0;
 	uint8_t val;
 	int ll;
 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *p, *pp;
 	size_t psize;
 
 	/* One allocation holds all nmissing rows of n logarithms each. */
 	psize = sizeof (invlog[0][0]) * n * nmissing;
 	p = kmem_alloc(psize, KM_SLEEP);
 
 	for (pp = p, i = 0; i < nmissing; i++) {
 		invlog[i] = pp;
 		pp += n;
 	}
 
 	/* Precompute the logarithm of every inverse-matrix coefficient. */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < n; j++) {
 			ASSERT3U(invrows[i][j], !=, 0);
 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
 		}
 	}
 
 	for (i = 0; i < n; i++) {
 		c = used[i];
 		ASSERT3U(c, <, rr->rr_cols);
 
 		ccount = rr->rr_col[c].rc_size;
 		/*
 		 * NOTE(review): this indexes rr_col[] with missing[0]
 		 * directly, while the loop below adds rr_firstdatacol to
 		 * missing[j] -- confirm which column is intended.
 		 */
 		ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
 		if (ccount == 0)
 			continue;
 		src = abd_to_buf(rr->rr_col[c].rc_abd);
 		/* Look up buffer and length of every column being rebuilt. */
 		for (j = 0; j < nmissing; j++) {
 			cc = missing[j] + rr->rr_firstdatacol;
 			ASSERT3U(cc, >=, rr->rr_firstdatacol);
 			ASSERT3U(cc, <, rr->rr_cols);
 			ASSERT3U(cc, !=, c);
 
 			dcount[j] = rr->rr_col[cc].rc_size;
 			if (dcount[j] != 0)
 				dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
 		}
 
 		for (x = 0; x < ccount; x++, src++) {
 			/* Zero has no logarithm; handled separately below. */
 			if (*src != 0)
 				log = vdev_raidz_log2[*src];
 
 			for (cc = 0; cc < nmissing; cc++) {
 				/* Past the end of this (shorter) target. */
 				if (x >= dcount[cc])
 					continue;
 
 				if (*src == 0) {
 					val = 0;
 				} else {
 					/* Multiply by adding logs, mod 255. */
 					if ((ll = log + invlog[cc][i]) >= 255)
 						ll -= 255;
 					val = vdev_raidz_pow2[ll];
 				}
 
 				/* First source initializes, later ones add. */
 				if (i == 0)
 					dst[cc][x] = val;
 				else
 					dst[cc][x] ^= val;
 			}
 		}
 	}
 
 	kmem_free(p, psize);
 }
 
 /*
  * General-purpose reconstruction of the targeted data columns of a row.
  *
  *   rr    - row to reconstruct
  *   tgts  - column indices being reconstructed; the parity-selection loop
  *           below walks them in step with an ascending column counter
  *   ntgts - number of entries in tgts
  *
  * The work is done in three steps by vdev_raidz_matrix_init(),
  * vdev_raidz_matrix_invert() and vdev_raidz_matrix_reconstruct().  Those
  * helpers need flat buffers, so non-linear ABDs are temporarily replaced by
  * linear copies and copied back at the end.
  */
 static void
 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
 {
 	int i, c, t, tt;
 	unsigned int n;
 	unsigned int nmissing_rows;
 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
 	int parity_map[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *p, *pp;
 	size_t psize;
 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *used;
 
 	/* Saved original ABDs, indexed by column; NULL if no swap was done. */
 	abd_t **bufs = NULL;
 
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
 		zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
 	/*
 	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
 	 * temporary linear ABDs if any non-linear ABDs are found.
 	 */
 	for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
 		ASSERT(rr->rr_col[i].rc_abd != NULL);
 		if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
 			bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
 			    KM_PUSHPAGE);
 
 			/*
 			 * One non-linear column is enough: swap every data
 			 * column for a linear copy, then stop scanning.
 			 */
 			for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 				raidz_col_t *col = &rr->rr_col[c];
 
 				bufs[c] = col->rc_abd;
 				if (bufs[c] != NULL) {
 					col->rc_abd = abd_alloc_linear(
 					    col->rc_size, B_TRUE);
 					abd_copy(col->rc_abd, bufs[c],
 					    col->rc_size);
 				}
 			}
 
 			break;
 		}
 	}
 
 	/* n is the number of data columns in the row. */
 	n = rr->rr_cols - rr->rr_firstdatacol;
 
 	/*
 	 * Figure out which data columns are missing.
 	 */
 	nmissing_rows = 0;
 	for (t = 0; t < ntgts; t++) {
 		if (tgts[t] >= rr->rr_firstdatacol) {
 			missing_rows[nmissing_rows++] =
 			    tgts[t] - rr->rr_firstdatacol;
 		}
 	}
 
 	/*
 	 * Figure out which parity columns to use to help generate the missing
 	 * data columns.
 	 */
 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
 		ASSERT(tt < ntgts);
 		ASSERT(c < rr->rr_firstdatacol);
 
 		/*
 		 * Skip any targeted parity columns.
 		 */
 		if (c == tgts[tt]) {
 			tt++;
 			continue;
 		}
 
 		parity_map[i] = c;
 		i++;
 	}
 
 	/*
 	 * A single allocation is carved into nmissing_rows pairs of
 	 * (rows[i], invrows[i]), each n bytes long, followed by the n-byte
 	 * used[] array.
 	 */
 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
 	    nmissing_rows * n + sizeof (used[0]) * n;
 	p = kmem_alloc(psize, KM_SLEEP);
 
 	for (pp = p, i = 0; i < nmissing_rows; i++) {
 		rows[i] = pp;
 		pp += n;
 		invrows[i] = pp;
 		pp += n;
 	}
 	used = pp;
 
 	/* used[] starts with the chosen parity columns ... */
 	for (i = 0; i < nmissing_rows; i++) {
 		used[i] = parity_map[i];
 	}
 
 	/*
 	 * ... and continues (i carries over from the loop above) with every
 	 * data column that is not missing.
 	 */
 	for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		if (tt < nmissing_rows &&
 		    c == missing_rows[tt] + rr->rr_firstdatacol) {
 			tt++;
 			continue;
 		}
 
 		ASSERT3S(i, <, n);
 		used[i] = c;
 		i++;
 	}
 
 	/*
 	 * Initialize the interesting rows of the matrix.
 	 */
 	vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
 
 	/*
 	 * Invert the matrix.
 	 */
 	vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
 	    invrows, used);
 
 	/*
 	 * Reconstruct the missing data using the generated matrix.
 	 */
 	vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
 	    invrows, used);
 
 	kmem_free(p, psize);
 
 	/*
 	 * copy back from temporary linear abds and free them
 	 */
 	if (bufs) {
 		for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 			raidz_col_t *col = &rr->rr_col[c];
 
 			if (bufs[c] != NULL) {
 				abd_copy(bufs[c], col->rc_abd, col->rc_size);
 				abd_free(col->rc_abd);
 			}
 			/* Put the caller's original ABD back in the column. */
 			col->rc_abd = bufs[c];
 		}
 		kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
 	}
 }
 
 /*
  * Reconstruct the bad columns of one row.
  *
  *   rm - map the row belongs to (passed on to the math implementation)
  *   rr - row to reconstruct
  *   t  - column indices the caller wants treated as bad; the loop below
  *        consumes them in step with an ascending column counter
  *   nt - number of entries in t
  *
  * Columns listed in t, plus any column with a non-zero rc_error, become
  * reconstruction targets.  vdev_raidz_math_reconstruct() is tried first;
  * only if it returns RAIDZ_ORIGINAL_IMPL do we fall back to the P/Q/PQ
  * routines or, failing those, to vdev_raidz_reconstruct_general().
  */
 static void
 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
     const int *t, int nt)
 {
 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
 	int ntgts;
 	int i, c, ret;
 	int nbadparity, nbaddata;
 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
 
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
 		zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
 		    rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
 		    (int)rr->rr_missingparity);
 	}
 
 	/*
 	 * Start by assuming every column is bad and count down as good
 	 * columns are found.
 	 */
 	nbadparity = rr->rr_firstdatacol;
 	nbaddata = rr->rr_cols - nbadparity;
 	ntgts = 0;
 	for (i = 0, c = 0; c < rr->rr_cols; c++) {
 		if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
 			zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
 			    "offset=%llx error=%u)",
 			    rr, c, (int)rr->rr_col[c].rc_devidx,
 			    (long long)rr->rr_col[c].rc_offset,
 			    (int)rr->rr_col[c].rc_error);
 		}
 		if (c < rr->rr_firstdatacol)
 			parity_valid[c] = B_FALSE;
 
 		if (i < nt && c == t[i]) {
 			/* Explicitly targeted by the caller. */
 			tgts[ntgts++] = c;
 			i++;
 		} else if (rr->rr_col[c].rc_error != 0) {
 			/* Not requested, but the column already has an error. */
 			tgts[ntgts++] = c;
 		} else if (c >= rr->rr_firstdatacol) {
 			nbaddata--;
 		} else {
 			parity_valid[c] = B_TRUE;
 			nbadparity--;
 		}
 	}
 
 	ASSERT(ntgts >= nt);
 	ASSERT(nbaddata >= 0);
 	ASSERT(nbaddata + nbadparity == ntgts);
 
 	/*
 	 * Columns were visited in ascending order, so the bad parity columns
 	 * occupy the front of tgts[] and the bad data columns start here.
 	 */
 	dt = &tgts[nbadparity];
 
 	/* Reconstruct using the new math implementation */
 	ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
 	if (ret != RAIDZ_ORIGINAL_IMPL)
 		return;
 
 	/*
 	 * See if we can use any of our optimized reconstruction routines.
 	 */
 	switch (nbaddata) {
 	case 1:
 		if (parity_valid[VDEV_RAIDZ_P]) {
 			vdev_raidz_reconstruct_p(rr, dt, 1);
 			return;
 		}
 
 		ASSERT(rr->rr_firstdatacol > 1);
 
 		if (parity_valid[VDEV_RAIDZ_Q]) {
 			vdev_raidz_reconstruct_q(rr, dt, 1);
 			return;
 		}
 
 		ASSERT(rr->rr_firstdatacol > 2);
 		break;
 
 	case 2:
 		ASSERT(rr->rr_firstdatacol > 1);
 
 		if (parity_valid[VDEV_RAIDZ_P] &&
 		    parity_valid[VDEV_RAIDZ_Q]) {
 			vdev_raidz_reconstruct_pq(rr, dt, 2);
 			return;
 		}
 
 		ASSERT(rr->rr_firstdatacol > 2);
 
 		break;
 	}
 
 	/* No optimized routine applied; use the general matrix method. */
 	vdev_raidz_reconstruct_general(rr, tgts, ntgts);
 }
 
 /*
  * Open a RAID-Z vdev: open all children and fold the sizes and ashift
  * values of those that opened successfully into the caller's outputs.
  *
  * Returns EINVAL (with vs_aux = VDEV_AUX_BAD_LABEL) if the parity count is
  * inconsistent with the number of children, the last child open error (with
  * vs_aux = VDEV_AUX_NO_REPLICAS) if more children failed to open than there
  * is parity, and 0 otherwise.
  */
 static int
 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	uint64_t nparity = vdrz->vd_nparity;
 	int failed = 0;
 	int last_error = 0;
 
 	ASSERT(nparity > 0);
 
 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
 	    vd->vdev_children < nparity + 1) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	vdev_open_children(vd);
 
 	/*
 	 * First pass: tally open failures, and take the smallest size and
 	 * the largest ashift over the children that did open.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *child = vd->vdev_child[c];
 
 		if (child->vdev_open_error == 0) {
 			*asize = MIN(*asize - 1, child->vdev_asize - 1) + 1;
 			*max_asize = MIN(*max_asize - 1,
 			    child->vdev_max_asize - 1) + 1;
 			*logical_ashift = MAX(*logical_ashift,
 			    child->vdev_ashift);
 		} else {
 			last_error = child->vdev_open_error;
 			failed++;
 		}
 	}
 
 	/*
 	 * Second pass: the physical ashift is chosen against the final
 	 * logical ashift, so it cannot be merged into the loop above.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *child = vd->vdev_child[c];
 
 		if (child->vdev_open_error == 0) {
 			*physical_ashift = vdev_best_ashift(*logical_ashift,
 			    *physical_ashift, child->vdev_physical_ashift);
 		}
 	}
 
 	/*
 	 * Scale the per-child size by the number of children; while an
 	 * expansion is in progress one child fewer is counted, and the
 	 * result is also recorded as the vdev's minimum asize.
 	 */
 	if (!vd->vdev_rz_expanding) {
 		*asize *= vd->vdev_children;
 		*max_asize *= vd->vdev_children;
 	} else {
 		*asize *= vd->vdev_children - 1;
 		*max_asize *= vd->vdev_children - 1;
 
 		vd->vdev_min_asize = *asize;
 	}
 
 	if (failed > nparity) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
 		return (last_error);
 	}
 
 	return (0);
 }
 
 /*
  * Close every child vdev that is present.
  */
 static void
 vdev_raidz_close(vdev_t *vd)
 {
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *child = vd->vdev_child[c];
 
 		/* Empty child slots are skipped. */
 		if (child == NULL)
 			continue;
 		vdev_close(child);
 	}
 }
 
 /*
  * Look up the logical width that applies to an allocation made in the
  * given txg.  BP_GET_BIRTH() is normally the txg in which the BP was
  * allocated.  Remapped BP's (relocated by device removal, see
  * remap_blkptr_cb()) carry a more recent physical birth reflecting the
  * relocation, but those can be ignored here because they can't be on RAIDZ
  * (device removal doesn't support RAIDZ).
  *
  * The width comes from the vd_expand_txgs entry for exactly this txg, or
  * else from the nearest earlier entry; if there is neither, the original
  * width is returned.
  */
 static uint64_t
 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
 {
 	reflow_node_t key = {
 		.re_txg = txg,
 	};
 	avl_index_t idx;
 	uint64_t width;
 
 	mutex_enter(&vdrz->vd_expand_lock);
 	reflow_node_t *node = avl_find(&vdrz->vd_expand_txgs, &key, &idx);
 	/* No exact match: fall back to the closest entry before txg. */
 	if (node == NULL)
 		node = avl_nearest(&vdrz->vd_expand_txgs, idx, AVL_BEFORE);
 
 	if (node == NULL)
 		width = vdrz->vd_original_width;
 	else
 		width = node->re_logical_width;
 	mutex_exit(&vdrz->vd_expand_lock);
 
 	return (width);
 }
 
 /*
  * Convert a physical size into the space allocated on this RAIDZ vdev for a
  * block born in the given txg.
  *
  * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
  * more space due to the lower data-to-parity ratio, so it's important to
  * pass in the correct txg.  vdev_gang_header_asize() relies on a constant
  * asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE regardless of txg;
  * that holds because a single data sector always allocates P+1 sectors
  * regardless of width ("cols", which is at least P+1).
  */
 static uint64_t
 vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	uint64_t ashift = vd->vdev_top->vdev_ashift;
 	uint64_t nparity = vdrz->vd_nparity;
 	uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg);
 
 	/* Data sectors needed, rounding psize up to whole sectors. */
 	uint64_t data_sectors = ((psize - 1) >> ashift) + 1;
 	/* Rows needed to hold them, rounding up; each row adds nparity. */
 	uint64_t data_cols = cols - nparity;
 	uint64_t nrows = (data_sectors + data_cols - 1) / data_cols;
 
 	uint64_t asize = data_sectors + nparity * nrows;
 	asize = roundup(asize, nparity + 1) << ashift;
 
 #ifdef ZFS_DEBUG
 	/*
 	 * The same computation at the current physical width must never
 	 * come out larger than the result at the logical width.
 	 */
 	uint64_t ncols_new = vdrz->vd_physical_width;
 	uint64_t data_cols_new = ncols_new - nparity;
 	uint64_t asize_new = data_sectors +
 	    nparity * ((data_sectors + data_cols_new - 1) / data_cols_new);
 	asize_new = roundup(asize_new, nparity + 1) << ashift;
 	VERIFY3U(asize_new, <=, asize);
 #endif
 
 	return (asize);
 }
 
 /*
  * The allocatable space for a raidz vdev is N * sizeof(smallest child),
  * so each child must provide at least 1/Nth of the vdev's minimum asize
  * (rounded up).
  */
 static uint64_t
 vdev_raidz_min_asize(vdev_t *vd)
 {
 	uint64_t nchildren = vd->vdev_children;
 
 	return ((vd->vdev_min_asize + nchildren - 1) / nchildren);
 }
 
 /*
  * Completion callback for a child I/O on a column: store the child's
  * error in the column and mark the column as tried and not skipped.
  */
 void
 vdev_raidz_child_done(zio_t *zio)
 {
 	raidz_col_t *col = zio->io_private;
 
 	ASSERT3P(col->rc_abd, !=, NULL);
 
 	col->rc_error = zio->io_error;
 	col->rc_tried = 1;
 	col->rc_skipped = 0;
 }
 
 /*
  * Completion callback for a shadow write: store the child's error in the
  * column's rc_shadow_error.
  */
 static void
 vdev_raidz_shadow_child_done(zio_t *zio)
 {
 	raidz_col_t *col = zio->io_private;
 
 	col->rc_shadow_error = zio->io_error;
 }
 
 /*
  * Debug-only sanity check (compiled in under ZFS_DEBUG) that column `col`
  * of row `rr` sits where vdev_xlate() says the row's logical range maps to
  * on that column's child vdev.  Without ZFS_DEBUG this is a no-op.
  */
 static void
 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
 {
 	(void) rm;
 #ifdef ZFS_DEBUG
-	range_seg64_t logical_rs, physical_rs, remain_rs;
+	zfs_range_seg64_t logical_rs, physical_rs, remain_rs;
 	/*
 	 * The logical range starts at the row offset and spans the
 	 * allocated size of the row for this BP's birth txg.
 	 */
 	logical_rs.rs_start = rr->rr_offset;
 	logical_rs.rs_end = logical_rs.rs_start +
 	    vdev_raidz_asize(zio->io_vd, rr->rr_size,
 	    BP_GET_BIRTH(zio->io_bp));
 
 	raidz_col_t *rc = &rr->rr_col[col];
 	vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
 
 	vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
 	ASSERT(vdev_xlate_is_empty(&remain_rs));
 	if (vdev_xlate_is_empty(&physical_rs)) {
 		/*
 		 * If we are in the middle of expansion, the
 		 * physical->logical mapping is changing so vdev_xlate()
 		 * can't give us a reliable answer.
 		 */
 		return;
 	}
 	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
 	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
 	/*
 	 * It would be nice to assert that rs_end is equal
 	 * to rc_offset + rc_size but there might be an
 	 * optional I/O at the end that is not accounted in
 	 * rc_size.
 	 */
 	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
 		/* At most one extra sector (1 << ashift) is tolerated. */
 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
 		    rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
 	} else {
 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
 	}
 #endif
 }
 
 /*
  * Start the child writes for one row: generate the row's parity, then
  * issue a child I/O for every non-empty column, plus a second one to the
  * column's shadow location when it has one.
  */
 static void
 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
 {
 	raidz_map_t *rm = zio->io_vsd;
 	vdev_t *vd = zio->io_vd;
 
 	vdev_raidz_generate_parity_row(rm, rr);
 
 	for (int c = 0; c < rr->rr_scols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 		zio_t *cio;
 
 		/* Verify physical to logical translation */
 		vdev_raidz_io_verify(zio, rm, rr, c);
 
 		/* Empty columns have nothing to write. */
 		if (rc->rc_size == 0)
 			continue;
 
 		ASSERT3U(rc->rc_offset + rc->rc_size, <,
 		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
 		ASSERT3P(rc->rc_abd, !=, NULL);
 
 		cio = zio_vdev_child_io(zio, NULL, cvd,
 		    rc->rc_offset, rc->rc_abd,
 		    abd_get_size(rc->rc_abd), zio->io_type,
 		    zio->io_priority, 0, vdev_raidz_child_done, rc);
 		zio_nowait(cio);
 
 		/* A shadow index of INT_MAX means no shadow write is issued. */
 		if (rc->rc_shadow_devidx == INT_MAX)
 			continue;
 
 		vdev_t *shadow_vd = vd->vdev_child[rc->rc_shadow_devidx];
 
 		ASSERT3U(
 		    rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
 		    shadow_vd->vdev_psize - VDEV_LABEL_END_SIZE);
 
 		cio = zio_vdev_child_io(zio, NULL, shadow_vd,
 		    rc->rc_shadow_offset, rc->rc_abd,
 		    abd_get_size(rc->rc_abd),
 		    zio->io_type, zio->io_priority, 0,
 		    vdev_raidz_shadow_child_done, rc);
 		zio_nowait(cio);
 	}
 }
 
 /*
  * Issue optional, data-less one-sector I/Os for the skip sectors (the
  * zero-sized columns) so that aggregation sees contiguous ranges.
  * This only works for maps built by vdev_raidz_map_alloc() (not
  * _expanded()); such a map is asserted to have exactly one row.
  */
 static void
 raidz_start_skip_writes(zio_t *zio)
 {
 	raidz_map_t *rm = zio->io_vsd;
 	vdev_t *vd = zio->io_vd;
 	uint64_t sector_size = 1ULL << vd->vdev_top->vdev_ashift;
 
 	ASSERT3U(rm->rm_nrows, ==, 1);
 	raidz_row_t *rr = rm->rm_row[0];
 
 	for (int c = 0; c < rr->rr_scols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 
 		/* Only the empty (skip) columns get an optional I/O. */
 		if (rc->rc_size == 0) {
 			ASSERT3P(rc->rc_abd, ==, NULL);
 
 			ASSERT3U(rc->rc_offset, <,
 			    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
 
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, NULL, sector_size, zio->io_type,
 			    zio->io_priority,
 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
 		}
 	}
 }
 
 /*
  * Start the child reads for one row.  Columns on unreadable children, or
  * on children whose DTL_MISSING covers this txg, are marked with an error
  * and skipped.  Data columns are always read; parity columns are read when
  * forceparity is set, when a data column has already been counted as
  * missing, or for scrub/resilver I/O.
  */
 static void
 vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
 {
 	vdev_t *vd = zio->io_vd;
 
 	/*
 	 * Iterate over the columns in reverse order so that we hit the parity
 	 * last -- any errors along the way will force us to read the parity.
 	 */
 	for (int c = rr->rr_cols - 1; c >= 0; c--) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		int skip_error = 0;
 
 		if (rc->rc_size == 0)
 			continue;
 
 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 		boolean_t is_data = (c >= rr->rr_firstdatacol);
 
 		if (!vdev_readable(cvd)) {
 			skip_error = SET_ERROR(ENXIO);
 			rc->rc_tried = 1;	/* don't even try */
 		} else if (vdev_dtl_contains(cvd, DTL_MISSING,
 		    zio->io_txg, 1)) {
 			skip_error = SET_ERROR(ESTALE);
 		}
 
 		if (skip_error != 0) {
 			/* Account for the column we could not read. */
 			if (is_data)
 				rr->rr_missingdata++;
 			else
 				rr->rr_missingparity++;
 			rc->rc_error = skip_error;
 			rc->rc_skipped = 1;
 			continue;
 		}
 
 		if (forceparity || is_data || rr->rr_missingdata > 0 ||
 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
 			zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, rc);
 			zio_nowait(cio);
 		}
 	}
 }
 
 /*
  * Start the child reads for a map that has physical columns: one read per
  * non-empty entry of rm_phys_col.  Entries on unreadable children, or on
  * children whose DTL_MISSING covers this txg, are marked with an error
  * and skipped instead.
  */
 static void
 vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
 {
 	vdev_t *vd = zio->io_vd;
 
 	for (int i = 0; i < rm->rm_nphys_cols; i++) {
 		raidz_col_t *prc = &rm->rm_phys_col[i];
 		int skip_error = 0;
 
 		if (prc->rc_size == 0)
 			continue;
 
 		/* Physical column i is asserted to live on child i. */
 		ASSERT3U(prc->rc_devidx, ==, i);
 		vdev_t *cvd = vd->vdev_child[i];
 
 		if (!vdev_readable(cvd)) {
 			skip_error = SET_ERROR(ENXIO);
 			prc->rc_tried = 1;	/* don't even try */
 		} else if (vdev_dtl_contains(cvd, DTL_MISSING,
 		    zio->io_txg, 1)) {
 			skip_error = SET_ERROR(ESTALE);
 		}
 
 		if (skip_error != 0) {
 			prc->rc_error = skip_error;
 			prc->rc_skipped = 1;
 			continue;
 		}
 
 		zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
 		    prc->rc_offset, prc->rc_abd, prc->rc_size,
 		    zio->io_type, zio->io_priority, 0,
 		    vdev_raidz_child_done, prc);
 		zio_nowait(cio);
 	}
 }
 
 /*
  * Start the child reads for a map, either through its physical columns
  * (when the map has them) or row by row.
  */
 static void
 vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
 {
 	if (rm->rm_phys_col) {
 		vdev_raidz_io_start_read_phys_cols(zio, rm);
 		return;
 	}
 
 	/*
 	 * If there are multiple rows, we will be hitting
 	 * all disks, so go ahead and read the parity so
 	 * that we are reading in decent size chunks.
 	 */
 	boolean_t forceparity = rm->rm_nrows > 1;
 
 	for (int r = 0; r < rm->rm_nrows; r++)
 		vdev_raidz_io_start_read_row(zio, rm->rm_row[r], forceparity);
 }
 
 /*
  * Start an IO operation on a RAIDZ VDev
  *
  * Outline:
  * - For write operations:
  *   1. Generate the parity data
  *   2. Create child zio write operations to each column's vdev, for both
  *      data and parity.
  *   3. If the column skips any sectors for padding, create optional dummy
  *      write zio children for those areas to improve aggregation continuity.
  * - For read operations:
  *   1. Create child zio read operations to each data column's vdev to read
  *      the range of data required for zio.
  *   2. If this is a scrub or resilver operation, or if any of the data
  *      vdevs have had errors, then create zio read operations to the parity
  *      columns' VDevs as well.
  */
 static void
 vdev_raidz_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_t *tvd = vd->vdev_top;
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	raidz_map_t *rm;
 
 	/*
 	 * The logical width in effect at the BP's birth txg decides which
 	 * map layout is used: if it differs from the current physical width
 	 * the expanded layout is needed, otherwise the regular one.
 	 */
 	uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
 	    BP_GET_BIRTH(zio->io_bp));
 	if (logical_width != vdrz->vd_physical_width) {
 		/* Values used when no expansion is currently scanning. */
 		zfs_locked_range_t *lr = NULL;
 		uint64_t synced_offset = UINT64_MAX;
 		uint64_t next_offset = UINT64_MAX;
 		boolean_t use_scratch = B_FALSE;
 		/*
 		 * Note: when the expansion is completing, we set
 		 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
 		 * in a later txg than when we last update spa_ubsync's state
 		 * (see the end of spa_raidz_expand_thread()).  Therefore we
 		 * may see vre_state!=SCANNING before
 		 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
 		 * on disk, but the copying progress has been synced to disk
 		 * (and reflected in spa_ubsync).  In this case it's fine to
 		 * treat the expansion as completed, since if we crash there's
 		 * no additional copying to do.
 		 */
 		if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
 			ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
 			    &vdrz->vn_vre);
 			/*
 			 * Take the expansion range lock as a reader over
 			 * this I/O's range.  The handle is stored in
 			 * rm->rm_lr below; presumably it is released when
 			 * the map is torn down -- confirm in the vsd free
 			 * path.
 			 */
 			lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
 			    zio->io_offset, zio->io_size, RL_READER);
 			use_scratch =
 			    (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
 			    RRSS_SCRATCH_VALID);
 			synced_offset =
 			    RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
 			next_offset = vdrz->vn_vre.vre_offset;
 			/*
 			 * If we haven't resumed expanding since importing the
 			 * pool, vre_offset won't have been set yet.  In
 			 * this case the next offset to be copied is the same
 			 * as what was synced.
 			 */
 			if (next_offset == UINT64_MAX) {
 				next_offset = synced_offset;
 			}
 		}
 		if (use_scratch) {
 			zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
 			    "%lld next_offset=%lld use_scratch=%u",
 			    zio,
 			    zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
 			    (long long)zio->io_offset,
 			    (long long)synced_offset,
 			    (long long)next_offset,
 			    use_scratch);
 		}
 
 		rm = vdev_raidz_map_alloc_expanded(zio,
 		    tvd->vdev_ashift, vdrz->vd_physical_width,
 		    logical_width, vdrz->vd_nparity,
 		    synced_offset, next_offset, use_scratch);
 		rm->rm_lr = lr;
 	} else {
 		rm = vdev_raidz_map_alloc(zio,
 		    tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
 	}
 	rm->rm_original_width = vdrz->vd_original_width;
 
 	/* Attach the map to the zio as its vdev-specific data. */
 	zio->io_vsd = rm;
 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			vdev_raidz_io_start_write(zio, rm->rm_row[i]);
 		}
 
 		/*
 		 * Skip-sector writes are only issued for maps built by
 		 * vdev_raidz_map_alloc(), i.e. the non-expanded case.
 		 */
 		if (logical_width == vdrz->vd_physical_width) {
 			raidz_start_skip_writes(zio);
 		}
 	} else {
 		ASSERT(zio->io_type == ZIO_TYPE_READ);
 		vdev_raidz_io_start_read(zio, rm);
 	}
 
 	zio_execute(zio);
 }
 
 /*
  * Report a checksum error for a child of a RAID-Z device: bump the child's
  * checksum error counter and post a checksum ereport.  Nothing is reported
  * for speculative I/O or for I/O at rebuild priority.
  */
 void
 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
 {
 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
 
 	if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) ||
 	    zio->io_priority == ZIO_PRIORITY_REBUILD)
 		return;
 
 	raidz_map_t *rm = zio->io_vsd;
 	zio_bad_cksum_t zbc;
 
 	/* Only these two fields of the report are filled in here. */
 	zbc.zbc_has_cksum = 0;
 	zbc.zbc_injected = rm->rm_ecksuminjected;
 
 	mutex_enter(&vd->vdev_stat_lock);
 	vd->vdev_stat.vs_checksum_errors++;
 	mutex_exit(&vd->vdev_stat_lock);
 
 	(void) zfs_ereport_post_checksum(zio->io_spa, vd,
 	    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
 	    rc->rc_abd, bad_data, &zbc);
 }
 
 /*
  * Verify the zio's checksum and return the result of zio_checksum_error(),
  * except for the Direct I/O case described below, which returns 0.
  *
  * We keep track of whether or not there were any injected errors, so that
  * any ereports we generate can note it.
  */
 static int
 raidz_checksum_verify(zio_t *zio)
 {
 	raidz_map_t *rm = zio->io_vsd;
 	zio_bad_cksum_t zbc = {0};
 	int error = zio_checksum_error(zio, &zbc);
 
 	/*
 	 * Any Direct I/O read that has a checksum error must be treated as
 	 * suspicious as the contents of the buffer could be getting
 	 * manipulated while the I/O is taking place. The checksum verify error
 	 * will be reported to the top-level RAIDZ VDEV.
 	 */
 	if ((zio->io_flags & ZIO_FLAG_DIO_READ) && error == ECKSUM) {
 		zio->io_error = error;
 		zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
 		zio_dio_chksum_verify_error_report(zio);
 		zio_checksum_verified(zio);
 		return (0);
 	}
 
 	/* Remember that a failure came from error injection. */
 	if (error != 0 && zbc.zbc_injected != 0)
 		rm->rm_ecksuminjected = 1;
 
 	return (error);
 }
 
 /*
  * Generate the parity from the data columns. If we tried and were able to
  * read the parity without error, verify that the generated parity matches the
  * data we read. If it doesn't, we fire off a checksum error. Return the
  * number of such failures.
  */
 static int
 raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
 {
 	/* Parity buffers as read from disk, indexed by parity column. */
 	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
 	int c, ret = 0;
 	raidz_map_t *rm = zio->io_vsd;
 	raidz_col_t *rc;
 
 	/*
 	 * Pick the checksum type from the zio properties when there is no
 	 * BP, otherwise from the BP (gang blocks use the gang header type).
 	 */
 	blkptr_t *bp = zio->io_bp;
 	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
 
 	/* Nothing to verify for this checksum type. */
 	if (checksum == ZIO_CHECKSUM_NOPARITY)
 		return (ret);
 
 	/*
 	 * For every parity column that was read without error, stash the
 	 * buffer that was read and give the column a fresh linear buffer of
 	 * the same size, so the original survives the regeneration below.
 	 */
 	for (c = 0; c < rr->rr_firstdatacol; c++) {
 		rc = &rr->rr_col[c];
 		if (!rc->rc_tried || rc->rc_error != 0)
 			continue;
 
 		orig[c] = rc->rc_abd;
 		ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
 		rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
 	}
 
 	/*
 	 * Verify any empty sectors are zero filled to ensure the parity
 	 * is calculated correctly even if these non-data sectors are damaged.
 	 */
 	if (rr->rr_nempty && rr->rr_abd_empty != NULL)
 		ret += vdev_draid_map_verify_empty(zio, rr);
 
 	/*
 	 * Regenerates parity even for !tried||rc_error!=0 columns.  This
 	 * isn't harmful but it does have the side effect of fixing stuff
 	 * we didn't realize was necessary (i.e. even if we return 0).
 	 */
 	vdev_raidz_generate_parity_row(rm, rr);
 
 	/*
 	 * Compare each stashed buffer with the column's current one; the
 	 * skip condition matches the stash loop above, so orig[c] is only
 	 * read where it was set.
 	 */
 	for (c = 0; c < rr->rr_firstdatacol; c++) {
 		rc = &rr->rr_col[c];
 
 		if (!rc->rc_tried || rc->rc_error != 0)
 			continue;
 
 		if (abd_cmp(orig[c], rc->rc_abd) != 0) {
 			zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
 			    c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
 			vdev_raidz_checksum_error(zio, rc, orig[c]);
 			rc->rc_error = SET_ERROR(ECKSUM);
 			ret++;
 		}
 		/*
 		 * The buffer read from disk is released here; the column
 		 * keeps the buffer allocated above.
 		 */
 		abd_free(orig[c]);
 	}
 
 	return (ret);
 }
 
 /*
  * Fold the error and shadow error of every column in the row through
  * zio_worst_error() and return the result (0 if the row has no columns).
  */
 static int
 vdev_raidz_worst_error(raidz_row_t *rr)
 {
 	int worst = 0;
 
 	for (int c = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		worst = zio_worst_error(worst, rc->rc_error);
 		worst = zio_worst_error(worst, rc->rc_shadow_error);
 	}
 
 	return (worst);
 }
 
 /*
  * Post-processing for one row of a read (the zio type is asserted to be
  * READ): tally the column errors, verify parity when enough of it was read
  * or when resilvering, issue repair writes to columns that need them, and,
  * for scrub/resilver I/O, rewrite any shadow locations.
  */
 static void
 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
 {
 	int unexpected_errors = 0;
 	int parity_errors = 0;
 	int parity_untried = 0;
 	int data_errors = 0;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 
 	for (int c = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (rc->rc_error) {
 			if (c < rr->rr_firstdatacol)
 				parity_errors++;
 			else
 				data_errors++;
 
 			/*
 			 * An error on a column we did not deliberately skip
 			 * counts as unexpected.
 			 */
 			if (!rc->rc_skipped)
 				unexpected_errors++;
 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
 			parity_untried++;
 		}
 
 		/* A forced repair also triggers the repair pass below. */
 		if (rc->rc_force_repair)
 			unexpected_errors++;
 	}
 
 	/*
 	 * If we read more parity disks than were used for
 	 * reconstruction, confirm that the other parity disks produced
 	 * correct data.
 	 *
 	 * Note that we also regenerate parity when resilvering so we
 	 * can write it out to failed devices later.
 	 */
 	if (parity_errors + parity_untried <
 	    rr->rr_firstdatacol - data_errors ||
 	    (zio->io_flags & ZIO_FLAG_RESILVER)) {
 		int n = raidz_parity_verify(zio, rr);
 		unexpected_errors += n;
 	}
 
 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
 	    (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
 		/*
 		 * Use the good data we have in hand to repair damaged children.
 		 */
 		for (int c = 0; c < rr->rr_cols; c++) {
 			raidz_col_t *rc = &rr->rr_col[c];
 			vdev_t *vd = zio->io_vd;
 			vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 
 			/*
 			 * Skip columns that may not be repaired, and, unless
 			 * a repair is forced, columns that have no error or
 			 * no data.
 			 */
 			if (!rc->rc_allow_repair) {
 				continue;
 			} else if (!rc->rc_force_repair &&
 			    (rc->rc_error == 0 || rc->rc_size == 0)) {
 				continue;
 			}
 			/*
 			 * We do not allow self healing for Direct I/O reads.
 			 * See comment in vdev_raid_row_alloc().
 			 */
 			ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ);
 
 			zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
 			    "offset=%llx",
 			    zio, c, rc->rc_devidx, (long long)rc->rc_offset);
 
 			/*
 			 * The repair write keeps rebuild priority for rebuild
 			 * I/O and otherwise uses async-write priority;
 			 * SELF_HEAL is set only when there were unexpected
 			 * errors.
 			 */
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
 			    ZIO_TYPE_WRITE,
 			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
 			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
 		}
 	}
 
 	/*
 	 * Scrub or resilver i/o's: overwrite any shadow locations with the
 	 * good data.  This ensures that if we've already copied this sector,
 	 * it will be corrected if it was damaged.  This writes more than is
 	 * necessary, but since expansion is paused during scrub/resilver, at
 	 * most a single row will have a shadow location.
 	 */
 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
 	    (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
 		for (int c = 0; c < rr->rr_cols; c++) {
 			raidz_col_t *rc = &rr->rr_col[c];
 			vdev_t *vd = zio->io_vd;
 
 			/* Columns with no shadow location or no data. */
 			if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
 				continue;
 			vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
 
 			/*
 			 * Note: We don't want to update the repair stats
 			 * because that would incorrectly indicate that there
 			 * was bad data to repair, which we aren't sure about.
 			 * By clearing the SCAN_THREAD flag, we prevent this
 			 * from happening, despite having the REPAIR flag set.
 			 * We need to set SELF_HEAL so that this i/o can't be
 			 * bypassed by zio_vdev_io_start().
 			 */
 			zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
 			    NULL, NULL);
 			cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
 			zio_nowait(cio);
 		}
 	}
 }
 
 /*
  * For every column in the map flagged rc_need_orig_restore, copy the saved
  * rc_orig_data back over the column's buffer and clear the flag.
  */
 static void
 raidz_restore_orig_data(raidz_map_t *rm)
 {
 	for (int r = 0; r < rm->rm_nrows; r++) {
 		raidz_row_t *rr = rm->rm_row[r];
 
 		for (int c = 0; c < rr->rr_cols; c++) {
 			raidz_col_t *rc = &rr->rr_col[c];
 
 			if (!rc->rc_need_orig_restore)
 				continue;
 
 			abd_copy(rc->rc_abd, rc->rc_orig_data, rc->rc_size);
 			rc->rc_need_orig_restore = B_FALSE;
 		}
 	}
 }
 
 /*
  * During raidz_reconstruct() for an expanded VDEV, failure simulation needs
  * special consideration.  See the note in raidz_reconstruct() on simulating
  * failure of a pre-expansion device.
  *
  * Treating logical child i as failed, return TRUE if the given column should
  * be treated as failed.  The idea of logical children allows us to imagine
  * that a disk silently failed before a RAIDZ expansion (reads from this disk
  * succeed but return the wrong data).  Since the expansion doesn't verify
  * checksums, the incorrect data will be moved to new locations spread among
  * the children (going diagonally across them).
  *
  * Higher "logical child failures" (values of `i`) indicate these
  * "pre-expansion failures".  The first physical_width values imagine that a
  * current child failed; the next physical_width-1 values imagine that a
  * child failed before the most recent expansion; the next physical_width-2
  * values imagine a child failed in the expansion before that, etc.
  */
 static boolean_t
 raidz_simulate_failure(int physical_width, int original_width, int ashift,
     int i, raidz_col_t *rc)
 {
 	/* Sector number of the column at the current physical width. */
 	uint64_t sector_id =
 	    physical_width * (rc->rc_offset >> ashift) +
 	    rc->rc_devidx;
 	int w = physical_width;
 
 	/*
 	 * Step down through the widths from physical to original; i selects
 	 * a child within the first width that is larger than what is left
 	 * of it.
 	 */
 	while (w >= original_width) {
 		if (i < w)
 			return (sector_id % w == i);
 		i -= w;
 		w--;
 	}
 
 	ASSERT(!"invalid logical child id");
 	return (B_FALSE);
 }
 
 /*
  * Attempt a single reconstruction of the block in which the logical
  * children listed in ltgts[0 .. ntgts-1] are treated as failed (see
  * raidz_simulate_failure()), in addition to any columns that already
  * have rc_error set.
  *
  * returns EINVAL if reconstruction of the block will not be possible
  * returns ECKSUM if this specific reconstruction failed
  * returns 0 on successful reconstruction
  */
 static int
 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
 {
 	raidz_map_t *rm = zio->io_vsd;
 	int physical_width = zio->io_vd->vdev_children;
 	/* rm_original_width == 0 is treated as "same as physical width". */
 	int original_width = (rm->rm_original_width != 0) ?
 	    rm->rm_original_width : physical_width;
 	int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
 
 	if (dbgmsg) {
 		/* Note: three targets are always printed, whatever ntgts is. */
 		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
 		    "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
 	}
 
 	/* Reconstruct each row */
 	for (int r = 0; r < rm->rm_nrows; r++) {
 		raidz_row_t *rr = rm->rm_row[r];
 		int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
 		int t = 0;		/* number of entries in my_tgts */
 		int dead = 0;		/* known + simulated failures */
 		int dead_data = 0;	/* subset of "dead" with c >= nparity */
 
 		if (dbgmsg)
 			zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
 
 		for (int c = 0; c < rr->rr_cols; c++) {
 			raidz_col_t *rc = &rr->rr_col[c];
 			ASSERT0(rc->rc_need_orig_restore);
 			/*
 			 * A column with a known error counts as dead but is
 			 * not added to my_tgts.
 			 */
 			if (rc->rc_error != 0) {
 				dead++;
 				if (c >= nparity)
 					dead_data++;
 				continue;
 			}
 			/* Nothing to simulate for an empty column. */
 			if (rc->rc_size == 0)
 				continue;
 			for (int lt = 0; lt < ntgts; lt++) {
 				if (raidz_simulate_failure(physical_width,
 				    original_width,
 				    zio->io_vd->vdev_top->vdev_ashift,
 				    ltgts[lt], rc)) {
 					/*
 					 * Keep a copy of the column's current
 					 * contents the first time it is
 					 * targeted; an existing copy is
 					 * reused.
 					 */
 					if (rc->rc_orig_data == NULL) {
 						rc->rc_orig_data =
 						    abd_alloc_linear(
 						    rc->rc_size, B_TRUE);
 						abd_copy(rc->rc_orig_data,
 						    rc->rc_abd, rc->rc_size);
 					}
 					rc->rc_need_orig_restore = B_TRUE;
 
 					dead++;
 					if (c >= nparity)
 						dead_data++;
 					/*
 					 * Note: simulating failure of a
 					 * pre-expansion device can hit more
 					 * than one column, in which case we
 					 * might try to simulate more failures
 					 * than can be reconstructed, which is
 					 * also more than the size of my_tgts.
 					 * This check prevents accessing past
 					 * the end of my_tgts.  The "dead >
 					 * nparity" check below will fail this
 					 * reconstruction attempt.
 					 */
 					if (t < VDEV_RAIDZ_MAXPARITY) {
 						my_tgts[t++] = c;
 						if (dbgmsg) {
 							zfs_dbgmsg("simulating "
 							    "failure of col %u "
 							    "devidx %u", c,
 							    (int)rc->rc_devidx);
 						}
 					}
 					/* Count each column at most once. */
 					break;
 				}
 			}
 		}
 		if (dead > nparity) {
 			/* reconstruction not possible */
 			if (dbgmsg) {
 				zfs_dbgmsg("reconstruction not possible; "
 				    "too many failures");
 			}
 			raidz_restore_orig_data(rm);
 			return (EINVAL);
 		}
 		/* Rows whose dead columns are all parity are left alone. */
 		if (dead_data > 0)
 			vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
 	}
 
 	/* Check for success */
 	if (raidz_checksum_verify(zio) == 0) {
 		/*
 		 * With ZIO_FLAG_DIO_CHKSUM_ERR set, return success without
 		 * the per-column error reporting below.
 		 */
 		if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
 			return (0);
 
 		/* Reconstruction succeeded - report errors */
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 
 			for (int c = 0; c < rr->rr_cols; c++) {
 				raidz_col_t *rc = &rr->rr_col[c];
 				if (rc->rc_need_orig_restore) {
 					/*
 					 * Note: if this is a parity column,
 					 * we don't really know if it's wrong.
 					 * We need to let
 					 * vdev_raidz_io_done_verified() check
 					 * it, and if we set rc_error, it will
 					 * think that it is a "known" error
 					 * that doesn't need to be checked
 					 * or corrected.
 					 */
 					if (rc->rc_error == 0 &&
 					    c >= rr->rr_firstdatacol) {
 						vdev_raidz_checksum_error(zio,
 						    rc, rc->rc_orig_data);
 						rc->rc_error =
 						    SET_ERROR(ECKSUM);
 					}
 					/*
 					 * The reconstructed contents are
 					 * kept, so no restore is pending.
 					 */
 					rc->rc_need_orig_restore = B_FALSE;
 				}
 			}
 
 			vdev_raidz_io_done_verified(zio, rr);
 		}
 
 		zio_checksum_verified(zio);
 
 		if (dbgmsg) {
 			zfs_dbgmsg("reconstruction successful "
 			    "(checksum verified)");
 		}
 		return (0);
 	}
 
 	/* Reconstruction failed - restore original data */
 	raidz_restore_orig_data(rm);
 	if (dbgmsg) {
 		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
 		    "failed", zio);
 	}
 	return (ECKSUM);
 }
 
 /*
  * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
  * Note that the algorithm below is non-optimal because it doesn't take into
  * account how reconstruction is actually performed. For example, with
  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
  * is targeted as invalid as if columns 1 and 4 are targeted since in both
  * cases we'd only use parity information in column 0.
  *
  * The order that we find the various possible combinations of failed
  * disks is dictated by these rules:
  * - Examine each "slot" (the "i" in tgts[i])
  *   - Try to increment this slot (tgts[i] += 1)
  *   - if we can't increment because it runs into the next slot,
  *     reset our slot to the minimum, and examine the next slot
  *
  *  For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
  *  3 columns to reconstruct), we will generate the following sequence:
  *
  *  STATE        ACTION
  *  0 1 2        special case: skip since these are all parity
  *  0 1   3      first slot: reset to 0; middle slot: increment to 2
  *  0   2 3      first slot: increment to 1
  *    1 2 3      first: reset to 0; middle: reset to 1; last: increment to 4
  *  0 1     4    first: reset to 0; middle: increment to 2
  *  0   2   4    first: increment to 1
  *    1 2   4    first: reset to 0; middle: increment to 3
  *  0     3 4    first: increment to 1
  *    1   3 4    first: increment to 2
  *      2 3 4    first: reset to 0; middle: reset to 1; last: increment to 5
  *  0 1       5  first: reset to 0; middle: increment to 2
  *  0   2     5  first: increment to 1
  *    1 2     5  first: reset to 0; middle: increment to 3
  *  0     3   5  first: increment to 1
  *    1   3   5  first: increment to 2
  *      2 3   5  first: reset to 0; middle: increment to 4
  *  0       4 5  first: increment to 1
  *    1     4 5  first: increment to 2
  *      2   4 5  first: increment to 3
  *        3 4 5  done
  *
  * This strategy works for dRAID but is less efficient when there are a large
  * number of child vdevs and therefore permutations to check. Furthermore,
  * since the raidz_map_t rows likely do not overlap, reconstruction would be
  * possible as long as there are no more than nparity data errors per row.
  * These additional permutations are not currently checked but could be as
  * a future improvement.
  *
  * Returns 0 on success, ECKSUM on failure.
  */
 static int
 vdev_raidz_combrec(zio_t *zio)
 {
 	int nparity = vdev_get_nparity(zio->io_vd);
 	raidz_map_t *rm = zio->io_vsd;
 	int physical_width = zio->io_vd->vdev_children;
 	/* rm_original_width == 0 is treated as "same as physical width". */
 	int original_width = (rm->rm_original_width != 0) ?
 	    rm->rm_original_width : physical_width;
 
 	/*
 	 * If any row already has more known errors than parity columns,
 	 * give up immediately and report that row's worst error.
 	 */
 	for (int i = 0; i < rm->rm_nrows; i++) {
 		raidz_row_t *rr = rm->rm_row[i];
 		int total_errors = 0;
 
 		for (int c = 0; c < rr->rr_cols; c++) {
 			if (rr->rr_col[c].rc_error)
 				total_errors++;
 		}
 
 		if (total_errors > nparity)
 			return (vdev_raidz_worst_error(rr));
 	}
 
 	for (int num_failures = 1; num_failures <= nparity; num_failures++) {
 		/*
 		 * ltgts[0 .. num_failures-1] are the targets.  The two extra
 		 * slots hold sentinels at ltgts[-1] and ltgts[num_failures],
 		 * which is why ltgts points one element into tstore.
 		 */
 		int tstore[VDEV_RAIDZ_MAXPARITY + 2];
 		int *ltgts = &tstore[1]; /* value is logical child ID */
 
 
 		/*
 		 * Determine number of logical children, n.  See comment
 		 * above raidz_simulate_failure().
 		 */
 		int n = 0;
 		for (int w = physical_width;
 		    w >= original_width; w--) {
 			n += w;
 		}
 
 		ASSERT3U(num_failures, <=, nparity);
 		ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
 
 		/* Handle corner cases in combrec logic */
 		ltgts[-1] = -1;
 		/* Start from the lowest combination: 0, 1, ..., k-1. */
 		for (int i = 0; i < num_failures; i++) {
 			ltgts[i] = i;
 		}
 		ltgts[num_failures] = n;
 
 		for (;;) {
 			int err = raidz_reconstruct(zio, ltgts, num_failures,
 			    nparity);
 			if (err == EINVAL) {
 				/*
 				 * Reconstruction not possible with this #
 				 * failures; try more failures.
 				 */
 				break;
 			} else if (err == 0)
 				return (0);
 
 			/* Compute next targets to try */
 			for (int t = 0; ; t++) {
 				ASSERT3U(t, <, num_failures);
 				ltgts[t]++;
 				if (ltgts[t] == n) {
 					/* try more failures */
 					ASSERT3U(t, ==, num_failures - 1);
 					if (zfs_flags &
 					    ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
 						zfs_dbgmsg("reconstruction "
 						    "failed for num_failures="
 						    "%u; tried all "
 						    "combinations",
 						    num_failures);
 					}
 					break;
 				}
 
 				ASSERT3U(ltgts[t], <, n);
 				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
 
 				/*
 				 * If that spot is available, we're done here.
 				 * Try the next combination.
 				 */
 				if (ltgts[t] != ltgts[t + 1])
 					break; // found next combination
 
 				/*
 				 * Otherwise, reset this tgt to the minimum,
 				 * and move on to the next tgt.
 				 */
 				ltgts[t] = ltgts[t - 1] + 1;
 				ASSERT3U(ltgts[t], ==, t);
 			}
 
 			/* Increase the number of failures and keep trying. */
 			if (ltgts[num_failures - 1] == n)
 				break;
 		}
 	}
 	/* Every combination for every num_failures was tried or rejected. */
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
 		zfs_dbgmsg("reconstruction failed for all num_failures");
 	return (ECKSUM);
 }
 
 /*
  * Reconstruct the columns listed in t[0 .. nt-1] in every row of the map
  * by handing each row, in order, to vdev_raidz_reconstruct_row().
  */
 void
 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
 {
 	uint64_t row = 0;
 
 	/* The same target list is applied to each row. */
 	while (row < rm->rm_nrows) {
 		vdev_raidz_reconstruct_row(rm, rm->rm_row[row], t, nt);
 		row++;
 	}
 }
 
 /*
  * Finish a write zio for one row of a RAIDZ map.
  *
  * The columns whose child write failed are counted, separately for the
  * normal location (rc_error) and the shadow location (rc_shadow_error).
  * If either count is larger than rr_firstdatacol, the row's worst error
  * is folded into zio->io_error.  Otherwise the write is left as a
  * success: partial writes count as successful as long as the data can
  * still be reconstructed.
  */
 static void
 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
 {
 	int failed = 0;
 	int failed_shadow = 0;
 
 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 
 	int c = 0;
 	while (c < rr->rr_cols) {
 		raidz_col_t *col = &rr->rr_col[c];
 
 		if (col->rc_error != 0) {
 			/* child has no bp, so it cannot report ECKSUM */
 			ASSERT(col->rc_error != ECKSUM);
 			failed++;
 		}
 		if (col->rc_shadow_error != 0) {
 			ASSERT(col->rc_shadow_error != ECKSUM);
 			failed_shadow++;
 		}
 		c++;
 	}
 
 	/*
 	 * A shadow write happens during raidz expansion.  Depending on
 	 * whether we crash, either the normal (old) or the shadow (new)
 	 * location may end up as the "real" version of the block, so each
 	 * location must have sufficient redundancy on its own.
 	 *
 	 * Now that write reallocation is supported, it would be better to
 	 * treat partial failure as real failure unless there are no
 	 * non-degraded top-level vdevs left, and not update DTLs if we
 	 * intend to reallocate.
 	 */
 	if (failed <= rr->rr_firstdatacol &&
 	    failed_shadow <= rr->rr_firstdatacol)
 		return;
 
 	/* Too many columns failed to reconstruct later: the I/O failed. */
 	zio->io_error = zio_worst_error(zio->io_error,
 	    vdev_raidz_worst_error(rr));
 }
 
 /*
  * Reconstruct the data columns of one row that are already known to be
  * bad (rc_error set), provided the total number of errors in the row does
  * not exceed the number of parity columns that were read.  Rows with no
  * data errors, or with too many errors, are left untouched.
  */
 static void
 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
     raidz_row_t *rr)
 {
 	int parity_errors = 0;
 	int parity_untried = 0;
 	int data_errors = 0;
 	int total_errors = 0;
 
 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
 
 	for (int c = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		/*
 		 * If scrubbing and a replacing/sparing child vdev determined
 		 * that not all of its children have an identical copy of the
 		 * data, then clear the error so the column is treated like
 		 * any other read and force a repair to correct the damage.
 		 */
 		if (rc->rc_error == ECKSUM) {
 			ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
 			vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
 			rc->rc_force_repair = 1;
 			rc->rc_error = 0;
 		}
 
 		/*
 		 * Classify the column: errored parity, errored data, or a
 		 * parity column that has no error but was never read.
 		 */
 		if (rc->rc_error) {
 			if (c < rr->rr_firstdatacol)
 				parity_errors++;
 			else
 				data_errors++;
 
 			total_errors++;
 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
 			parity_untried++;
 		}
 	}
 
 	/*
 	 * If there were data errors and the number of errors we saw was
 	 * correctable -- less than or equal to the number of parity disks read
 	 * -- reconstruct based on the missing data.
 	 */
 	if (data_errors != 0 &&
 	    total_errors <= rr->rr_firstdatacol - parity_untried) {
 		/*
 		 * We either attempt to read all the parity columns or
 		 * none of them. If we didn't try to read parity, we
 		 * wouldn't be here in the correctable case. There must
 		 * also have been fewer parity errors than parity
 		 * columns or, again, we wouldn't be in this code path.
 		 */
 		ASSERT(parity_untried == 0);
 		ASSERT(parity_errors < rr->rr_firstdatacol);
 
 		/*
 		 * Identify the data columns that reported an error.
 		 */
 		int n = 0;
 		int tgts[VDEV_RAIDZ_MAXPARITY];
 		for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 			raidz_col_t *rc = &rr->rr_col[c];
 			if (rc->rc_error != 0) {
 				ASSERT(n < VDEV_RAIDZ_MAXPARITY);
 				tgts[n++] = c;
 			}
 		}
 
 		/* There cannot be more targets than parity columns. */
 		ASSERT(rr->rr_firstdatacol >= n);
 
 		vdev_raidz_reconstruct_row(rm, rr, tgts, n);
 	}
 }
 
 /*
  * Issue a child read for every column of the row that has not been tried
  * yet and has a non-zero size.  The row's missing data/parity counters
  * are reset first.
  *
  * Return the number of reads issued.
  */
 static int
 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
 {
 	vdev_t *vd = zio->io_vd;
 	int nread = 0;
 
 	rr->rr_missingdata = 0;
 	rr->rr_missingparity = 0;
 
 	/*
 	 * If this row contains empty sectors which are not required
 	 * for a normal read then allocate an ABD for them now so they
 	 * may be read, verified, and any needed repairs performed.
 	 */
 	if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
 		vdev_draid_map_alloc_empty(zio, rr);
 
 	for (int c = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		/* Skip columns already read and columns with no data. */
 		if (rc->rc_tried || rc->rc_size == 0)
 			continue;
 
 		/*
 		 * The child I/O inherits the parent's type and priority and
 		 * is issued without waiting for it.
 		 */
 		zio_nowait(zio_vdev_child_io(zio, NULL,
 		    vd->vdev_child[rc->rc_devidx],
 		    rc->rc_offset, rc->rc_abd, rc->rc_size,
 		    zio->io_type, zio->io_priority, 0,
 		    vdev_raidz_child_done, rc));
 		nread++;
 	}
 	return (nread);
 }
 
 /*
  * We're here because either there were too many errors to even attempt
  * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
  * failed. In either case, there is enough bad data to prevent reconstruction.
  * Start checksum ereports for all children which haven't failed.
  *
  * For each such column, the child's vs_checksum_errors counter is also
  * incremented under its vdev_stat_lock.
  */
 static void
 vdev_raidz_io_done_unrecoverable(zio_t *zio)
 {
 	raidz_map_t *rm = zio->io_vsd;
 
 	for (int i = 0; i < rm->rm_nrows; i++) {
 		raidz_row_t *rr = rm->rm_row[i];
 
 		for (int c = 0; c < rr->rr_cols; c++) {
 			raidz_col_t *rc = &rr->rr_col[c];
 			vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
 
 			/* Children that already reported an error are skipped. */
 			if (rc->rc_error != 0)
 				continue;
 
 			/*
 			 * Only these two fields of zbc are set here; the
 			 * report carries no checksum values.
 			 */
 			zio_bad_cksum_t zbc;
 			zbc.zbc_has_cksum = 0;
 			zbc.zbc_injected = rm->rm_ecksuminjected;
 			mutex_enter(&cvd->vdev_stat_lock);
 			cvd->vdev_stat.vs_checksum_errors++;
 			mutex_exit(&cvd->vdev_stat_lock);
 			/* The return value is deliberately ignored. */
 			(void) zfs_ereport_start_checksum(zio->io_spa,
 			    cvd, &zio->io_bookmark, zio, rc->rc_offset,
 			    rc->rc_size, &zbc);
 		}
 	}
 }
 
 /*
  * Completion handling for a RAIDZ zio.
  *
  * Writes: each row is checked by vdev_raidz_io_done_write_impl().
  *
  * Reads: known-bad columns are reconstructed and the checksum is verified.
  * If verification fails, any columns not yet read are read and the zio is
  * redone; once everything has been read, combinatorial reconstruction is
  * attempted and, if that fails with ECKSUM on a non-speculative zio,
  * checksum errors are reported for the children.
  *
  * In all cases that reach "done", the range lock held in rm_lr (if any)
  * is released.  The early return after re-issuing reads keeps it.
  */
 void
 vdev_raidz_io_done(zio_t *zio)
 {
 	raidz_map_t *rm = zio->io_vsd;
 
 	ASSERT(zio->io_bp != NULL);
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
 		}
 	} else {
 		if (rm->rm_phys_col) {
 			/*
 			 * This is an aggregated read.  Copy the data and status
 			 * from the aggregate abd's to the individual rows.
 			 */
 			for (int i = 0; i < rm->rm_nrows; i++) {
 				raidz_row_t *rr = rm->rm_row[i];
 
 				for (int c = 0; c < rr->rr_cols; c++) {
 					raidz_col_t *rc = &rr->rr_col[c];
 					if (rc->rc_tried || rc->rc_size == 0)
 						continue;
 
 					/*
 					 * The aggregate column is looked up by
 					 * the child device index.
 					 */
 					raidz_col_t *prc =
 					    &rm->rm_phys_col[rc->rc_devidx];
 					rc->rc_error = prc->rc_error;
 					rc->rc_tried = prc->rc_tried;
 					rc->rc_skipped = prc->rc_skipped;
 					/* Only data columns have data copied. */
 					if (c >= rr->rr_firstdatacol) {
 						/*
 						 * Note: this is slightly faster
 						 * than using abd_copy_off().
 						 */
 						char *physbuf = abd_to_buf(
 						    prc->rc_abd);
 						void *physloc = physbuf +
 						    rc->rc_offset -
 						    prc->rc_offset;
 
 						abd_copy_from_buf(rc->rc_abd,
 						    physloc, rc->rc_size);
 					}
 				}
 			}
 		}
 
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 			vdev_raidz_io_done_reconstruct_known_missing(zio,
 			    rm, rr);
 		}
 
 		if (raidz_checksum_verify(zio) == 0) {
 			/*
 			 * With ZIO_FLAG_DIO_CHKSUM_ERR set, skip the
 			 * per-row verified handling.
 			 */
 			if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
 				goto done;
 
 			for (int i = 0; i < rm->rm_nrows; i++) {
 				raidz_row_t *rr = rm->rm_row[i];
 				vdev_raidz_io_done_verified(zio, rr);
 			}
 			zio_checksum_verified(zio);
 		} else {
 			/*
 			 * A sequential resilver has no checksum which makes
 			 * combinatorial reconstruction impossible. This code
 			 * path is unreachable since raidz_checksum_verify()
 			 * has no checksum to verify and must succeed.
 			 */
 			ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
 
 			/*
 			 * This isn't a typical situation -- either we got a
 			 * read error or a child silently returned bad data.
 			 * Read every block so we can try again with as much
 			 * data and parity as we can track down. If we've
 			 * already been through once before, all children will
 			 * be marked as tried so we'll proceed to combinatorial
 			 * reconstruction.
 			 */
 			int nread = 0;
 			for (int i = 0; i < rm->rm_nrows; i++) {
 				nread += vdev_raidz_read_all(zio,
 				    rm->rm_row[i]);
 			}
 			if (nread != 0) {
 				/*
 				 * Normally our stage is VDEV_IO_DONE, but if
 				 * we've already called redone(), it will have
 				 * changed to VDEV_IO_START, in which case we
 				 * don't want to call redone() again.
 				 */
 				if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
 					zio_vdev_io_redone(zio);
 				return;
 			}
 			/*
 			 * It would be too expensive to try every possible
 			 * combination of failed sectors in every row, so
 			 * instead we try every combination of failed current or
 			 * past physical disk. This means that if the incorrect
 			 * sectors were all on Nparity disks at any point in the
 			 * past, we will find the correct data.  The only known
 			 * case where this is less durable than a non-expanded
 			 * RAIDZ, is if we have a silent failure during
 			 * expansion.  In that case, one block could be
 			 * partially in the old format and partially in the
 			 * new format, so we'd lose some sectors from the old
 			 * format and some from the new format.
 			 *
 			 * e.g. logical_width=4 physical_width=6
 			 * the 15 (6+5+4) possible failed disks are:
 			 * width=6 child=0
 			 * width=6 child=1
 			 * width=6 child=2
 			 * width=6 child=3
 			 * width=6 child=4
 			 * width=6 child=5
 			 * width=5 child=0
 			 * width=5 child=1
 			 * width=5 child=2
 			 * width=5 child=3
 			 * width=5 child=4
 			 * width=4 child=0
 			 * width=4 child=1
 			 * width=4 child=2
 			 * width=4 child=3
 			 * And we will try every combination of Nparity of these
 			 * failing.
 			 *
 			 * As a first pass, we can generate every combo,
 			 * and try reconstructing, ignoring any known
 			 * failures.  If any row has too many known + simulated
 			 * failures, then we bail on reconstructing with this
 			 * number of simulated failures.  As an improvement,
 			 * we could detect the number of whole known failures
 			 * (i.e. we have known failures on these disks for
 			 * every row; the disks never succeeded), and
 			 * subtract that from the max # failures to simulate.
 			 * We could go even further like the current
 			 * combrec code, but that doesn't seem like it
 			 * gains us very much.  If we simulate a failure
 			 * that is also a known failure, that's fine.
 			 */
 			zio->io_error = vdev_raidz_combrec(zio);
 			/* Speculative reads do not generate ereports. */
 			if (zio->io_error == ECKSUM &&
 			    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
 				vdev_raidz_io_done_unrecoverable(zio);
 			}
 		}
 	}
 done:
 	/* Drop the range lock held for this I/O, if there is one. */
 	if (rm->rm_lr != NULL) {
 		zfs_rangelock_exit(rm->rm_lr);
 		rm->rm_lr = NULL;
 	}
 }
 
 /*
  * Set the state of a RAIDZ vdev from the number of faulted and degraded
  * children: more faulted children than parity means the vdev cannot be
  * opened, any faulted or degraded child means degraded, otherwise healthy.
  */
 static void
 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 
 	/* More failures than parity can cover: no replicas remain. */
 	if (faulted > vdrz->vd_nparity) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_NO_REPLICAS);
 		return;
 	}
 
 	/* Some child is impaired, but within what parity tolerates. */
 	if (degraded + faulted != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
 		return;
 	}
 
 	vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
 }
 
 /*
  * Report whether any part of the given block lives on a child vdev whose
  * DTL is dirty, meaning the block has to be resilvered.  The caller is
  * assumed to have established that at least one DTL is dirty, which
  * implies that blocks spanning the full stripe width must be resilvered.
  */
 static boolean_t
 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
     uint64_t phys_birth)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 
 	/*
 	 * While a RAIDZ expansion is in progress the block may be in the
 	 * old and/or the new location.  For simplicity, always resilver it.
 	 */
 	if (vdrz->vn_vre.vre_state == DSS_SCANNING)
 		return (B_TRUE);
 
 	uint64_t width = vd->vdev_children;
 	uint64_t parity = vdrz->vd_nparity;
 	uint64_t ashift = vd->vdev_top->vdev_ashift;
 	/* First sector of the block on the RAIDZ (parent) vdev. */
 	uint64_t first_sector = DVA_GET_OFFSET(dva) >> ashift;
 	/* Block size in units of the vdev's minimum sector size. */
 	uint64_t nsectors = ((psize - 1) >> ashift) + 1;
 	/* Column on which this stripe starts. */
 	uint64_t first_col = first_sector % width;
 	/* Number of columns touched: data sectors plus parity. */
 	uint64_t span = nsectors + parity;
 
 	/* Unreachable by sequential resilver. */
 	ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
 
 	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
 		return (B_FALSE);
 
 	/* The block touches every child, so one of them must be dirty. */
 	if (span >= width)
 		return (B_TRUE);
 
 	uint64_t c = 0;
 	while (c < span) {
 		vdev_t *cvd = vd->vdev_child[(first_col + c) % width];
 
 		/*
 		 * dsl_scan_need_resilver() has already checked vd with
 		 * vdev_dtl_contains(), so checking cvd with vdev_dtl_empty()
 		 * is enough here: it is cheaper and a good approximation.
 		 */
 		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
 			return (B_TRUE);
 		c++;
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Translate a logical range on the RAIDZ vdev into the physical range it
  * occupies on the child cvd, written to physical_rs.  While an expansion
  * is in progress, both physical_rs and remain_rs are set to empty ranges.
  */
 static void
-vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
-    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
+vdev_raidz_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs,
+    zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs)
 {
 	/* remain_rs is only written in the expansion case below. */
 	(void) remain_rs;
 
 	vdev_t *raidvd = cvd->vdev_parent;
 	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
 
 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
 
 	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
 		/*
 		 * We're in the middle of expansion, in which case the
 		 * translation is in flux.  Any answer we give may be wrong
 		 * by the time we return, so it isn't safe for the caller to
 		 * act on it.  Therefore we say that this range isn't present
 		 * on any children.  The only consumers of this are "zpool
 		 * initialize" and trimming, both of which are "best effort"
 		 * anyway.
 		 */
 		physical_rs->rs_start = physical_rs->rs_end = 0;
 		remain_rs->rs_start = remain_rs->rs_end = 0;
 		return;
 	}
 
 	uint64_t width = vdrz->vd_physical_width;
 	uint64_t tgt_col = cvd->vdev_id;
 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
 
 	/* make sure the offsets are block-aligned */
 	ASSERT0(logical_rs->rs_start % (1 << ashift));
 	ASSERT0(logical_rs->rs_end % (1 << ashift));
 	uint64_t b_start = logical_rs->rs_start >> ashift;
 	uint64_t b_end = logical_rs->rs_end >> ashift;
 
 	/*
 	 * start_row is the number of sector indices k < b_start with
 	 * k % width == tgt_col, i.e. ceil((b_start - tgt_col) / width),
 	 * or 0 when b_start <= tgt_col.
 	 */
 	uint64_t start_row = 0;
 	if (b_start > tgt_col) /* avoid underflow */
 		start_row = ((b_start - tgt_col - 1) / width) + 1;
 
 	/* Same computation for the end of the range. */
 	uint64_t end_row = 0;
 	if (b_end > tgt_col)
 		end_row = ((b_end - tgt_col - 1) / width) + 1;
 
 	/* Convert rows back to byte offsets on the child. */
 	physical_rs->rs_start = start_row << ashift;
 	physical_rs->rs_end = end_row << ashift;
 
 	/* The physical range starts no later and is no longer. */
 	ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
 	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
 	    logical_rs->rs_end - logical_rs->rs_start);
 }
 
 /*
  * Sync task (scheduled by raidz_reflow_record_progress()) that commits the
  * reflow progress recorded for this txg: the reflow offset stored in
  * spa_uberblock is advanced to the smaller of this txg's recorded offset
  * and vre_failed_offset, this txg's copied-byte count is folded into
  * vre_bytes_copied, and that total is written to the top-level vdev's ZAP.
  */
 static void
 raidz_reflow_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = arg;
 	/* Index of this txg's slot in the per-txg arrays. */
 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 
 	/*
 	 * Ensure there are no i/os to the range that is being committed.
 	 */
 	uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
 	ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);
 
 	mutex_enter(&vre->vre_lock);
 	/* Never commit progress past the first failed offset. */
 	uint64_t new_offset =
 	    MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
 	/*
 	 * We should not have committed anything that failed.
 	 */
 	VERIFY3U(vre->vre_failed_offset, >=, old_offset);
 	mutex_exit(&vre->vre_lock);
 
 	/* Hold [old_offset, new_offset) as writer while it is committed. */
 	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
 	    old_offset, new_offset - old_offset,
 	    RL_WRITER);
 
 	/*
 	 * Update the uberblock that will be written when this txg completes.
 	 */
 	RAIDZ_REFLOW_SET(&spa->spa_uberblock,
 	    RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
 	/* Clear the slot so it can be reused by a later txg. */
 	vre->vre_offset_pertxg[txgoff] = 0;
 	zfs_rangelock_exit(lr);
 
 	mutex_enter(&vre->vre_lock);
 	vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
 	vre->vre_bytes_copied_pertxg[txgoff] = 0;
 	mutex_exit(&vre->vre_lock);
 
 	/* Persist the running total of bytes copied. */
 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	VERIFY0(zap_update(spa->spa_meta_objset,
 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
 	    sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
 }
 
 /*
  * Finish a RAIDZ expansion.  Runs in syncing context (see the scrub note
  * at the end).  Records the new logical width, marks the expansion
  * finished in memory and in the top-level vdev's ZAP, clears the reflow
  * info in the uberblock, logs the completion, restarts
  * initialize/trim/autotrim, wakes waiters and optionally starts a scrub.
  */
 static void
 raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = arg;
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
 
 	/* No per-txg progress may still be waiting to be committed. */
 	for (int i = 0; i < TXG_SIZE; i++)
 		VERIFY0(vre->vre_offset_pertxg[i]);
 
 	/*
 	 * Record the txg from which the current physical width is the
 	 * logical width.
 	 */
 	reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
 	re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
 	re->re_logical_width = vdrz->vd_physical_width;
 	mutex_enter(&vdrz->vd_expand_lock);
 	avl_add(&vdrz->vd_expand_txgs, re);
 	mutex_exit(&vdrz->vd_expand_lock);
 
 	/* Same lookup as raidvd above. */
 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
 
 	/*
 	 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
 	 * will get written (based on vd_expand_txgs).
 	 */
 	vdev_config_dirty(vd);
 
 	/*
 	 * Before we change vre_state, the on-disk state must reflect that we
 	 * have completed all copying, so that vdev_raidz_io_start() can use
 	 * vre_state to determine if the reflow is in progress.  See also the
 	 * end of spa_raidz_expand_thread().
 	 */
 	VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
 	    raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
 
 	vre->vre_end_time = gethrestime_sec();
 	vre->vre_state = DSS_FINISHED;
 
 	/* Persist the state and end time as 64-bit values. */
 	uint64_t state = vre->vre_state;
 	VERIFY0(zap_update(spa->spa_meta_objset,
 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
 	    sizeof (state), 1, &state, tx));
 
 	uint64_t end_time = vre->vre_end_time;
 	VERIFY0(zap_update(spa->spa_meta_objset,
 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
 	    sizeof (end_time), 1, &end_time, tx));
 
 	spa->spa_uberblock.ub_raidz_reflow_info = 0;
 
 	spa_history_log_internal(spa, "raidz vdev expansion completed",  tx,
 	    "%s vdev %llu new width %llu", spa_name(spa),
 	    (unsigned long long)vd->vdev_id,
 	    (unsigned long long)vd->vdev_children);
 
 	/* The pool no longer has an expansion in progress. */
 	spa->spa_raidz_expand = NULL;
 	raidvd->vdev_rz_expanding = B_FALSE;
 
 	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
 	spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
 	spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
 
 	spa_notify_waiters(spa);
 
 	/*
 	 * While we're in syncing context take the opportunity to
 	 * setup a scrub. All the data has been successfully copied
 	 * but we have not validated any checksums.
 	 */
 	setup_sync_arg_t setup_sync_arg = {
 		.func = POOL_SCAN_SCRUB,
 		.txgstart = 0,
 		.txgend = 0,
 	};
 	/* The scrub starts only if enabled and the setup check passes. */
 	if (zfs_scrub_after_expand &&
 	    dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) {
 		dsl_scan_setup_sync(&setup_sync_arg, tx);
 	}
 }
 
 /*
  * State of one copy batch.  It is shared by all read and write ZIOs of the
  * batch (as io_private) and freed by the last write to complete.
  */
 typedef struct raidz_reflow_arg {
 	vdev_raidz_expand_t *rra_vre;	/* Global expansion state. */
 	zfs_locked_range_t *rra_lr;	/* Range lock of this batch. */
 	uint64_t rra_txg;	/* TXG of this batch. */
 	uint_t rra_ashift;	/* Ashift of the vdev. */
 	uint32_t rra_tbd;	/* Number of in-flight ZIOs. */
 	uint32_t rra_writes;	/* Number of write ZIOs. */
 	zio_t *rra_zio[];	/* Write ZIO pointers (rra_writes entries). */
 } raidz_reflow_arg_t;
 
 /*
  * Write of the new location on one child is done.  Once all of them are done
  * we can unlock and free everything.
  *
  * On a write error the batch's start offset is recorded in
  * vre_failed_offset.  The bytes written are credited to this txg only if
  * the whole batch lies below vre_failed_offset.
  */
 static void
 raidz_reflow_write_done(zio_t *zio)
 {
 	raidz_reflow_arg_t *rra = zio->io_private;
 	vdev_raidz_expand_t *vre = rra->rra_vre;
 
 	abd_free(zio->io_abd);
 
 	mutex_enter(&vre->vre_lock);
 	if (zio->io_error != 0) {
 		/* Force a reflow pause on errors */
 		vre->vre_failed_offset =
 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
 	}
 	ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
 	vre->vre_outstanding_bytes -= zio->io_size;
 	/* Count the bytes only when the batch ends before any failure. */
 	if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
 	    vre->vre_failed_offset) {
 		vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
 		    zio->io_size;
 	}
 	cv_signal(&vre->vre_cv);
 	/* The in-flight count is decremented under vre_lock here. */
 	boolean_t done = (--rra->rra_tbd == 0);
 	mutex_exit(&vre->vre_lock);
 
 	if (!done)
 		return;
 	/* Last write of the batch: release what the batch acquired. */
 	spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
 	zfs_rangelock_exit(rra->rra_lr);
 	kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes);
 }
 
 /*
  * Read of the old location on one child is done.  Once all of them are done
  * writes should have all the data and we can issue them.
  */
 static void
 raidz_reflow_read_done(zio_t *zio)
 {
 	raidz_reflow_arg_t *rra = zio->io_private;
 	vdev_raidz_expand_t *vre = rra->rra_vre;
 
 	/*
 	 * Reads of only one block use the write ABDs, so they are not freed
 	 * here.  For bigger reads, free the gang ABD.
 	 */
 	if (zio->io_size > (1 << rra->rra_ashift))
 		abd_free(zio->io_abd);
 
 	/*
 	 * If the read failed, or if it was done on a vdev that is not fully
 	 * healthy (e.g. a child that has a resilver in progress), we may not
 	 * have the correct data.  Note that it's OK if the write proceeds.
 	 * It may write garbage but the location is otherwise unused and we
 	 * will retry later due to vre_failed_offset.
 	 */
 	if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
 		zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
 		    "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
 		    (long long)rra->rra_lr->lr_offset,
 		    (long long)rra->rra_lr->lr_length,
 		    (long long)rra->rra_txg,
 		    zio->io_error,
 		    vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
 		    vdev_dtl_empty(zio->io_vd, DTL_MISSING));
 		mutex_enter(&vre->vre_lock);
 		/* Force a reflow pause on errors */
 		vre->vre_failed_offset =
 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
 		mutex_exit(&vre->vre_lock);
 	}
 
 	/* Other reads of this batch are still outstanding. */
 	if (atomic_dec_32_nv(&rra->rra_tbd) > 0)
 		return;
 	/*
 	 * This was the last read: reset the in-flight count to the number
 	 * of writes and issue them all.
 	 */
 	uint32_t writes = rra->rra_tbd = rra->rra_writes;
 	for (uint64_t i = 0; i < writes; i++)
 		zio_nowait(rra->rra_zio[i]);
 }
 
 /*
  * Record that the reflow has progressed to "offset" in the txg of tx.
  * The in-memory vre_offset is updated, and the offset is stored in this
  * txg's slot of vre_offset_pertxg.  If the slot was empty, a
  * raidz_reflow_sync() sync task is scheduled for the txg.  An offset of 0
  * is ignored.
  */
 static void
 raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
     dmu_tx_t *tx)
 {
 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 
 	if (offset == 0)
 		return;
 
 	mutex_enter(&vre->vre_lock);
 	/* Progress never moves backwards. */
 	ASSERT3U(vre->vre_offset, <=, offset);
 	vre->vre_offset = offset;
 	mutex_exit(&vre->vre_lock);
 
 	/* A zero slot means no sync task is scheduled for this txg yet. */
 	if (vre->vre_offset_pertxg[txgoff] == 0) {
 		dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
 		    spa, tx);
 	}
 	vre->vre_offset_pertxg[txgoff] = offset;
 }
 
 /*
  * Return B_TRUE if any direct child of the RAIDZ vdev is not a leaf vdev,
  * which is used as a quick indication that a child is being replaced.
  */
 static boolean_t
 vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
 {
 	int i = 0;
 
 	while (i < raidz_vd->vdev_children) {
 		vdev_t *child = raidz_vd->vdev_child[i];
 
 		/* A non-leaf child is taken to be a replacing vdev. */
 		if (!child->vdev_ops->vdev_op_leaf)
 			return (B_TRUE);
 		i++;
 	}
 	return (B_FALSE);
 }
 
 /*
  * Issue the reads and writes that reflow the next chunk of the first
  * segment of `rt`, removing that chunk from `rt` and recording the new
  * in-memory progress for this tx.
  *
  * vd	raidz top-level vdev being expanded (already at its new width)
  * vre	expansion state for that vdev
  * rt	range tree of the space that still has to be copied
  * tx	assigned tx whose txg the progress is attributed to
  *
  * Returns B_TRUE when the caller has to wait for the txg to sync before
  * calling again: either no further copying is possible until the on-disk
  * progress advances, or a replacing child vdev was found and the reflow
  * is being paused.  Returns B_FALSE when the i/o was issued, or when `rt`
  * has no segment to process.
  */
 static boolean_t
 raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, zfs_range_tree_t *rt,
     dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	uint_t ashift = vd->vdev_top->vdev_ashift;
 
 	/*
 	 * Nothing to do for an empty tree.  The segment pointer is what
 	 * may be NULL here; `rt` itself has already been used above.
 	 */
 	zfs_range_seg_t *rs = zfs_range_tree_first(rt);
 	if (rs == NULL)
 		return (B_FALSE);
 	uint64_t offset = zfs_rs_get_start(rs, rt);
 	ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
 	uint64_t size = zfs_rs_get_end(rs, rt) - offset;
 	ASSERT3U(size, >=, 1 << ashift);
 	ASSERT(IS_P2ALIGNED(size, 1 << ashift));
 
 	uint64_t blkid = offset >> ashift;
 	uint_t old_children = vd->vdev_children - 1;
 
 	/*
 	 * We can only progress to the point that writes will not overlap
 	 * with blocks whose progress has not yet been recorded on disk.
 	 * Since partially-copied rows are still read from the old location,
 	 * we need to stop one row before the sector-wise overlap, to prevent
 	 * row-wise overlap.
 	 *
 	 * Note that even if we are skipping over a large unallocated region,
 	 * we can't move the on-disk progress to `offset`, because concurrent
 	 * writes/allocations could still use the currently-unallocated
 	 * region.
 	 */
 	uint64_t ubsync_blkid =
 	    RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
 	uint64_t next_overwrite_blkid = ubsync_blkid +
 	    ubsync_blkid / old_children - old_children;
 	VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
 	if (blkid >= next_overwrite_blkid) {
 		raidz_reflow_record_progress(vre,
 		    next_overwrite_blkid << ashift, tx);
 		return (B_TRUE);
 	}
 
 	/*
 	 * Clamp the chunk: at most raidz_expand_max_copy_bytes, at most one
 	 * maximum-size record per old child, at least one sector, and never
 	 * past next_overwrite_blkid.
 	 */
 	size = MIN(size, raidz_expand_max_copy_bytes);
 	size = MIN(size, (uint64_t)old_children *
 	    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE));
 	size = MAX(size, 1 << ashift);
 	uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid);
 	size = (uint64_t)blocks << ashift;
 
 	zfs_range_tree_remove(rt, offset, size);
 
 	uint_t reads = MIN(blocks, old_children);
 	uint_t writes = MIN(blocks, vd->vdev_children);
 	/* rra is followed by `writes` zio pointers (rra_zio[]). */
 	raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) +
 	    sizeof (zio_t *) * writes, KM_SLEEP);
 	rra->rra_vre = vre;
 	rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
 	    offset, size, RL_WRITER);
 	rra->rra_txg = dmu_tx_get_txg(tx);
 	rra->rra_ashift = ashift;
 	rra->rra_tbd = reads;
 	rra->rra_writes = writes;
 
 	raidz_reflow_record_progress(vre, offset + size, tx);
 
 	/*
 	 * SCL_STATE will be released when the read and write are done,
 	 * by raidz_reflow_write_done().
 	 */
 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
 
 	/* check if a replacing vdev was added, if so treat it as an error */
 	if (vdev_raidz_expand_child_replacing(vd)) {
 		zfs_dbgmsg("replacing vdev encountered, reflow paused at "
 		    "offset=%llu txg=%llu",
 		    (long long)rra->rra_lr->lr_offset,
 		    (long long)rra->rra_txg);
 
 		mutex_enter(&vre->vre_lock);
 		vre->vre_failed_offset =
 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
 		cv_signal(&vre->vre_cv);
 		mutex_exit(&vre->vre_lock);
 
 		/* drop everything we acquired */
 		spa_config_exit(spa, SCL_STATE, spa);
 		zfs_rangelock_exit(rra->rra_lr);
 		kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes);
 		return (B_TRUE);
 	}
 
 	mutex_enter(&vre->vre_lock);
 	vre->vre_outstanding_bytes += size;
 	mutex_exit(&vre->vre_lock);
 
 	/* Allocate ABD and ZIO for each child we write. */
 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 	zio_t *pio = spa->spa_txg_zio[txgoff];
 	uint_t b = blocks / vd->vdev_children;
 	uint_t bb = blocks % vd->vdev_children;
 	for (uint_t i = 0; i < writes; i++) {
 		/* The first `bb` children get one extra block. */
 		uint_t n = b + (i < bb);
 		abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE);
 		rra->rra_zio[i] = zio_vdev_child_io(pio, NULL,
 		    vd->vdev_child[(blkid + i) % vd->vdev_children],
 		    ((blkid + i) / vd->vdev_children) << ashift,
 		    abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
 		    ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra);
 	}
 
 	/*
 	 * Allocate and issue ZIO for each child we read.  For reads of only
 	 * one block we can use respective writer ABDs, since they will also
 	 * have only one block.  For bigger reads create gang ABDs and fill
 	 * them with respective blocks from writer ABDs.
 	 */
 	b = blocks / old_children;
 	bb = blocks % old_children;
 	for (uint_t i = 0; i < reads; i++) {
 		uint_t n = b + (i < bb);
 		abd_t *abd;
 		if (n > 1) {
 			abd = abd_alloc_gang();
 			for (uint_t j = 0; j < n; j++) {
 				/* index of this block within the chunk */
 				uint_t b = j * old_children + i;
 				abd_t *cabd = abd_get_offset_size(
 				    rra->rra_zio[b % vd->vdev_children]->io_abd,
 				    (b / vd->vdev_children) << ashift,
 				    1 << ashift);
 				abd_gang_add(abd, cabd, B_TRUE);
 			}
 		} else {
 			abd = rra->rra_zio[i]->io_abd;
 		}
 		zio_nowait(zio_vdev_child_io(pio, NULL,
 		    vd->vdev_child[(blkid + i) % old_children],
 		    ((blkid + i) / old_children) << ashift, abd,
 		    n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
 		    ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra));
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Test hook (ztest specific): keep sleeping, delay(hz) at a time, for as
  * long as raidz_expand_pause_point is non-zero and is at or before the
  * given pause_point.
  */
 static void
 raidz_expand_pause(uint_t pause_point)
 {
 	for (;;) {
 		/* A zero pause point means pausing is not requested. */
 		if (raidz_expand_pause_point == 0)
 			break;
 		/* The requested pause point lies beyond this one. */
 		if (raidz_expand_pause_point > pause_point)
 			break;
 		delay(hz);
 	}
 }
 
 /*
  * Done callback for the scratch-area child zios.  The zio stored in
  * io_private accumulates the worst error seen across the children; the
  * update is made under that zio's io_lock.
  */
 static void
 raidz_scratch_child_done(zio_t *zio)
 {
 	zio_t *parent = zio->io_private;
 
 	mutex_enter(&parent->io_lock);
 	int worst = zio_worst_error(parent->io_error, zio->io_error);
 	parent->io_error = worst;
 	mutex_exit(&parent->io_lock);
 }
 
 /*
  * Reflow the beginning portion of the vdev into an intermediate scratch area
  * in memory and on disk. This operation must be persisted on disk before we
  * proceed to overwrite the beginning portion with the reflowed data.
  *
  * This multi-step task can fail to complete if disk errors are encountered
  * and we can return here after a pause (waiting for disk to become healthy).
  *
  * arg is the vdev_raidz_expand_t of the expanding vdev.  On success
  * vre_offset is advanced to the logical size that was reflowed; on any i/o
  * error the function returns early and leaves vre_offset untouched.
  */
 static void
 raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
 {
 	vdev_raidz_expand_t *vre = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	zio_t *pio;
 	int error;
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	int ashift = raidvd->vdev_ashift;
 	/* per-child bytes written: the boot area, rounded down to a sector */
 	uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift,
 	    uint64_t);
 	uint64_t logical_size = write_size * raidvd->vdev_children;
 	/* per-child bytes read from the old (one child narrower) layout */
 	uint64_t read_size =
 	    P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
 	    1 << ashift);
 
 	/*
 	 * The scratch space must be large enough to get us to the point
 	 * that one row does not overlap itself when moved.  This is checked
 	 * by vdev_raidz_attach_check().
 	 */
 	VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
 	VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
 	VERIFY3U(write_size, <=, read_size);
 
 	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
 	    0, logical_size, RL_WRITER);
 
 	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
 	    KM_SLEEP);
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		abds[i] = abd_alloc_linear(read_size, B_FALSE);
 	}
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);
 
 	/*
 	 * If we have already written the scratch area then we must read from
 	 * there, since new writes were redirected there while we were paused
 	 * or the original location may have been partially overwritten with
 	 * reflowed data.
 	 */
 	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
 		VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
 		/*
 		 * Read from scratch space.
 		 */
 		pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 		for (int i = 0; i < raidvd->vdev_children; i++) {
 			/*
 			 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
 			 * to the offset to calculate the physical offset to
 			 * write to.  Passing in a negative offset makes us
 			 * access the scratch area.
 			 */
 			zio_nowait(zio_vdev_child_io(pio, NULL,
 			    raidvd->vdev_child[i],
 			    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
 			    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
 			    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
 		}
 		error = zio_wait(pio);
 		if (error != 0) {
 			zfs_dbgmsg("reflow: error %d reading scratch location",
 			    error);
 			goto io_error_exit;
 		}
 		goto overwrite;
 	}
 
 	/*
 	 * Read from original location.
 	 */
 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	for (int i = 0; i < raidvd->vdev_children - 1; i++) {
 		ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    0, abds[i], read_size, ZIO_TYPE_READ,
 		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
 		    raidz_scratch_child_done, pio));
 	}
 	error = zio_wait(pio);
 	if (error != 0) {
 		zfs_dbgmsg("reflow: error %d reading original location", error);
 		/* Shared cleanup for every i/o failure in this function. */
 io_error_exit:
 		for (int i = 0; i < raidvd->vdev_children; i++)
 			abd_free(abds[i]);
 		kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
 		zfs_rangelock_exit(lr);
 		spa_config_exit(spa, SCL_STATE, FTAG);
 		return;
 	}
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
 
 	/*
 	 * Reflow in memory.
 	 */
 	uint64_t logical_sectors = logical_size >> ashift;
 	for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
 		int oldchild = i % (raidvd->vdev_children - 1);
 		uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;
 
 		int newchild = i % raidvd->vdev_children;
 		uint64_t newoff = (i / raidvd->vdev_children) << ashift;
 
 		/* a single sector should not be copying over itself */
 		ASSERT(!(newchild == oldchild && newoff == oldoff));
 
 		abd_copy_off(abds[newchild], abds[oldchild],
 		    newoff, oldoff, 1 << ashift);
 	}
 
 	/*
 	 * Verify that we filled in everything we intended to (write_size on
 	 * each child).
 	 */
 	VERIFY0(logical_sectors % raidvd->vdev_children);
 	VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
 	    write_size);
 
 	/*
 	 * Write to scratch location (boot area).
 	 */
 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		/*
 		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
 		 * the offset to calculate the physical offset to write to.
 		 * Passing in a negative offset lets us access the boot area.
 		 */
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
 		    write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
 		    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
 	}
 	error = zio_wait(pio);
 	if (error != 0) {
 		zfs_dbgmsg("reflow: error %d writing scratch location", error);
 		goto io_error_exit;
 	}
 	pio = zio_root(spa, NULL, NULL, 0);
 	zio_flush(pio, raidvd);
 	zio_wait(pio);
 
 	zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
 	    (long long)logical_size);
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
 
 	/*
 	 * Update uberblock to indicate that scratch space is valid.  This is
 	 * needed because after this point, the real location may be
 	 * overwritten.  If we crash, we need to get the data from the
 	 * scratch space, rather than the real location.
 	 *
 	 * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
 	 * will prefer this uberblock.
 	 *
 	 * The sync must use VERIFY0 rather than ASSERT0: the call has to be
 	 * made in every build, and ASSERT-family macros do not evaluate
 	 * their argument in non-debug builds.
 	 */
 	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
 	spa->spa_ubsync.ub_timestamp++;
 	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
 	if (spa_multihost(spa))
 		mmp_update_uberblock(spa, &spa->spa_ubsync);
 
 	zfs_dbgmsg("reflow: uberblock updated "
 	    "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
 	    (long long)spa->spa_ubsync.ub_txg,
 	    (long long)logical_size,
 	    (long long)spa->spa_ubsync.ub_timestamp);
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
 
 	/*
 	 * Overwrite with reflow'ed data.
 	 */
 overwrite:
 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    0, abds[i], write_size, ZIO_TYPE_WRITE,
 		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
 		    raidz_scratch_child_done, pio));
 	}
 	error = zio_wait(pio);
 	if (error != 0) {
 		/*
 		 * When we exit early here and drop the range lock, new
 		 * writes will go into the scratch area so we'll need to
 		 * read from there when we return after pausing.
 		 */
 		zfs_dbgmsg("reflow: error %d writing real location", error);
 		/*
 		 * Update the uberblock that is written when this txg completes.
 		 */
 		RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
 		    logical_size);
 		goto io_error_exit;
 	}
 	pio = zio_root(spa, NULL, NULL, 0);
 	zio_flush(pio, raidvd);
 	zio_wait(pio);
 
 	zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
 	    (long long)logical_size);
 	for (int i = 0; i < raidvd->vdev_children; i++)
 		abd_free(abds[i]);
 	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);
 
 	/*
 	 * Update uberblock to indicate that the initial part has been
 	 * reflow'ed.  This is needed because after this point (when we exit
 	 * the rangelock), we allow regular writes to this region, which will
 	 * be written to the new location only (because reflow_offset_next ==
 	 * reflow_offset_synced).  If we crashed and re-copied from the
 	 * scratch space, we would lose the regular writes.
 	 *
 	 * As above, VERIFY0 guarantees the sync runs in non-debug builds.
 	 */
 	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
 	    logical_size);
 	spa->spa_ubsync.ub_timestamp++;
 	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
 	if (spa_multihost(spa))
 		mmp_update_uberblock(spa, &spa->spa_ubsync);
 
 	zfs_dbgmsg("reflow: uberblock updated "
 	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
 	    (long long)spa->spa_ubsync.ub_txg,
 	    (long long)logical_size,
 	    (long long)spa->spa_ubsync.ub_timestamp);
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);
 
 	/*
 	 * Update progress.
 	 */
 	vre->vre_offset = logical_size;
 	zfs_rangelock_exit(lr);
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
 	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
 	/*
 	 * Note - raidz_reflow_sync() will update the uberblock state to
 	 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
 	 */
 	raidz_reflow_sync(spa, tx);
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
 }
 
 /*
  * Finish the work of a raidz_reflow_scratch_sync() that was interrupted by
  * a crash: copy the contents of the scratch area over the real location
  * and record that in the uberblock.  No other i/o can be in progress, so
  * the vre_rangelock is not needed.
  */
 void
 vdev_raidz_reflow_copy_scratch(spa_t *spa)
 {
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 	uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
 	ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	ASSERT0(logical_size % raidvd->vdev_children);
 	/* Each child holds an equal share of the logical size. */
 	uint64_t per_child = logical_size / raidvd->vdev_children;
 
 	/* One linear buffer per child. */
 	abd_t **bufs = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
 	    KM_SLEEP);
 	for (int c = 0; c < raidvd->vdev_children; c++)
 		bufs[c] = abd_alloc_linear(per_child, B_FALSE);
 
 	/*
 	 * Step 1: read the scratch space.  zio_vdev_child_io() adds
 	 * VDEV_LABEL_START_SIZE to the offset to calculate the physical
 	 * offset, so passing in a negative offset lets us access the boot
 	 * area.
 	 */
 	zio_t *rio = zio_root(spa, NULL, NULL, 0);
 	for (int c = 0; c < raidvd->vdev_children; c++) {
 		zio_t *cio = zio_vdev_child_io(rio, NULL,
 		    raidvd->vdev_child[c],
 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, bufs[c],
 		    per_child, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0,
 		    raidz_scratch_child_done, rio);
 		zio_nowait(cio);
 	}
 	zio_wait(rio);
 
 	/*
 	 * Step 2: overwrite the real location with the reflow'ed data, then
 	 * flush the vdev.
 	 */
 	rio = zio_root(spa, NULL, NULL, 0);
 	for (int c = 0; c < raidvd->vdev_children; c++) {
 		zio_t *cio = zio_vdev_child_io(rio, NULL,
 		    raidvd->vdev_child[c],
 		    0, bufs[c], per_child, ZIO_TYPE_WRITE,
 		    ZIO_PRIORITY_REMOVAL, 0,
 		    raidz_scratch_child_done, rio);
 		zio_nowait(cio);
 	}
 	zio_wait(rio);
 	rio = zio_root(spa, NULL, NULL, 0);
 	zio_flush(rio, raidvd);
 	zio_wait(rio);
 
 	zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
 	    "to real location", (long long)logical_size);
 
 	/* The buffers are no longer needed. */
 	for (int c = 0; c < raidvd->vdev_children; c++)
 		abd_free(bufs[c]);
 	kmem_free(bufs, raidvd->vdev_children * sizeof (abd_t *));
 
 	/*
 	 * Step 3: update the uberblock, bumping its timestamp, and push it
 	 * out.
 	 */
 	RAIDZ_REFLOW_SET(&spa->spa_ubsync,
 	    RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
 	spa->spa_ubsync.ub_timestamp++;
 	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
 	if (spa_multihost(spa))
 		mmp_update_uberblock(spa, &spa->spa_ubsync);
 
 	zfs_dbgmsg("reflow recovery: uberblock updated "
 	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
 	    (long long)spa->spa_ubsync.ub_txg,
 	    (long long)logical_size,
 	    (long long)spa->spa_ubsync.ub_timestamp);
 
 	/*
 	 * Step 4: record the progress against the pool's first txg.  Note
 	 * that raidz_reflow_sync() will update the uberblock once more.
 	 */
 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
 	    spa_first_txg(spa));
 	int slot = dmu_tx_get_txg(tx) & TXG_MASK;
 	vre->vre_offset = logical_size;
 	vre->vre_offset_pertxg[slot] = vre->vre_offset;
 	vre->vre_bytes_copied_pertxg[slot] = vre->vre_bytes_copied;
 	raidz_reflow_sync(spa, tx);
 
 	dmu_tx_commit(tx);
 
 	spa_config_exit(spa, SCL_STATE, FTAG);
 }
 
 /*
  * zthr check function: there is work for the expansion thread when the pool
  * has an expansion in progress that is not waiting for a resilver.
  */
 static boolean_t
 spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
 {
 	(void) zthr;
 	spa_t *spa = arg;
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 
 	if (vre == NULL)
 		return (B_FALSE);
 
 	return (!vre->vre_waiting_for_resilver);
 }
 
 /*
  * RAIDZ expansion background thread
  *
  * Can be called multiple times if the reflow is paused
  *
  * Starts from the progress recorded in spa_ubsync (or from offset 0 when
  * the scratch area is still marked valid), reflows the beginning portion
  * through the scratch area if needed, then walks the remaining metaslabs
  * and copies everything that is not allocatable space.  The loops stop
  * early when the zthr is cancelled or when vre_failed_offset has been set.
  */
 static void
 spa_raidz_expand_thread(void *arg, zthr_t *zthr)
 {
 	spa_t *spa = arg;
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 
 	/* Pick the starting offset from the last synced uberblock. */
 	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
 		vre->vre_offset = 0;
 	else
 		vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);
 
 	/* Reflow the beginning portion using the scratch area */
 	if (vre->vre_offset == 0) {
 		VERIFY0(dsl_sync_task(spa_name(spa),
 		    NULL, raidz_reflow_scratch_sync,
 		    vre, 0, ZFS_SPACE_CHECK_NONE));
 
 		/* if we encountered errors then pause */
 		if (vre->vre_offset == 0) {
 			mutex_enter(&vre->vre_lock);
 			vre->vre_waiting_for_resilver = B_TRUE;
 			mutex_exit(&vre->vre_lock);
 			return;
 		}
 	}
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 
 	/* Saved now; used for the vdev_online() call once the reflow is done */
 	uint64_t guid = raidvd->vdev_guid;
 
 	/* Iterate over all the remaining metaslabs */
 	for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
 	    i < raidvd->vdev_ms_count &&
 	    !zthr_iscancelled(zthr) &&
 	    vre->vre_failed_offset == UINT64_MAX; i++) {
 		metaslab_t *msp = raidvd->vdev_ms[i];
 
 		metaslab_disable(msp);
 		mutex_enter(&msp->ms_lock);
 
 		/*
 		 * The metaslab may be newly created (for the expanded
 		 * space), in which case its trees won't exist yet,
 		 * so we need to bail out early.
 		 */
 		if (msp->ms_new) {
 			mutex_exit(&msp->ms_lock);
 			metaslab_enable(msp, B_FALSE, B_FALSE);
 			continue;
 		}
 
 		VERIFY0(metaslab_load(msp));
 
 		/*
 		 * We want to copy everything except the free (allocatable)
 		 * space.  Note that there may be a little bit more free
 		 * space (e.g. in ms_defer), and it's fine to copy that too.
 		 */
 		uint64_t shift, start;
 		zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(
 		    raidvd, msp, &start, &shift);
 		zfs_range_tree_t *rt = zfs_range_tree_create(NULL, type, NULL,
 		    start, shift);
 		zfs_range_tree_add(rt, msp->ms_start, msp->ms_size);
 		zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove,
 		    rt);
 		mutex_exit(&msp->ms_lock);
 
 		/*
 		 * Force the last sector of each metaslab to be copied.  This
 		 * ensures that we advance the on-disk progress to the end of
 		 * this metaslab while the metaslab is disabled.  Otherwise, we
 		 * could move past this metaslab without advancing the on-disk
 		 * progress, and then an allocation to this metaslab would not
 		 * be copied.
 		 */
 		int sectorsz = 1 << raidvd->vdev_ashift;
 		uint64_t ms_last_offset = msp->ms_start +
 		    msp->ms_size - sectorsz;
 		if (!zfs_range_tree_contains(rt, ms_last_offset, sectorsz)) {
 			zfs_range_tree_add(rt, ms_last_offset, sectorsz);
 		}
 
 		/*
 		 * When we are resuming from a paused expansion (i.e.
 		 * when importing a pool with an expansion in progress),
 		 * discard any state that we have already processed.
 		 */
 		if (vre->vre_offset > msp->ms_start) {
 			zfs_range_tree_clear(rt, msp->ms_start,
 			    vre->vre_offset - msp->ms_start);
 		}
 
 		/* Copy this metaslab one raidz_reflow_impl() chunk at a time. */
 		while (!zthr_iscancelled(zthr) &&
 		    !zfs_range_tree_is_empty(rt) &&
 		    vre->vre_failed_offset == UINT64_MAX) {
 
 			/*
 			 * We need to periodically drop the config lock so that
 			 * writers can get in.  Additionally, we can't wait
 			 * for a txg to sync while holding a config lock
 			 * (since a waiting writer could cause a 3-way deadlock
 			 * with the sync thread, which also gets a config
 			 * lock for reader).  So we can't hold the config lock
 			 * while calling dmu_tx_assign().
 			 */
 			spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 			/*
 			 * If requested, pause the reflow when the amount
 			 * specified by raidz_expand_max_reflow_bytes is reached
 			 *
 			 * This pause is only used during testing or debugging.
 			 */
 			while (raidz_expand_max_reflow_bytes != 0 &&
 			    raidz_expand_max_reflow_bytes <=
 			    vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
 				delay(hz);
 			}
 
 			/*
 			 * Throttle: wait on vre_cv until the outstanding
 			 * bytes are no more than raidz_expand_max_copy_bytes.
 			 */
 			mutex_enter(&vre->vre_lock);
 			while (vre->vre_outstanding_bytes >
 			    raidz_expand_max_copy_bytes) {
 				cv_wait(&vre->vre_cv, &vre->vre_lock);
 			}
 			mutex_exit(&vre->vre_lock);
 
 			dmu_tx_t *tx =
 			    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 
 			VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 			uint64_t txg = dmu_tx_get_txg(tx);
 
 			/*
 			 * Reacquire the vdev_config lock.  Theoretically, the
 			 * vdev_t that we're expanding may have changed.
 			 */
 			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 			raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 
 			boolean_t needsync =
 			    raidz_reflow_impl(raidvd, vre, rt, tx);
 
 			dmu_tx_commit(tx);
 
 			/*
 			 * raidz_reflow_impl() asked us to wait for this txg
 			 * to sync; do so without holding the config lock.
 			 */
 			if (needsync) {
 				spa_config_exit(spa, SCL_CONFIG, FTAG);
 				txg_wait_synced(spa->spa_dsl_pool, txg);
 				spa_config_enter(spa, SCL_CONFIG, FTAG,
 				    RW_READER);
 			}
 		}
 
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 		metaslab_enable(msp, B_FALSE, B_FALSE);
 		zfs_range_tree_vacate(rt, NULL, NULL);
 		zfs_range_tree_destroy(rt);
 
 		/* Retake the lock and re-lookup the vdev for the next pass. */
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	}
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	/*
 	 * The txg_wait_synced() here ensures that all reflow zio's have
 	 * completed, and vre_failed_offset has been set if necessary.  It
 	 * also ensures that the progress of the last raidz_reflow_sync() is
 	 * written to disk before raidz_reflow_complete_sync() changes the
 	 * in-memory vre_state.  vdev_raidz_io_start() uses vre_state to
 	 * determine if a reflow is in progress, in which case we may need to
 	 * write to both old and new locations.  Therefore we can only change
 	 * vre_state once this is not necessary, which is once the on-disk
 	 * progress (in spa_ubsync) has been set past any possible writes (to
 	 * the end of the last metaslab).
 	 */
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 
 	/* NOTE(review): raidvd is read here after SCL_CONFIG was dropped. */
 	if (!zthr_iscancelled(zthr) &&
 	    vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
 		/*
 		 * We are not being canceled or paused, so the reflow must be
 		 * complete. In that case also mark it as completed on disk.
 		 */
 		ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
 		VERIFY0(dsl_sync_task(spa_name(spa), NULL,
 		    raidz_reflow_complete_sync, spa,
 		    0, ZFS_SPACE_CHECK_NONE));
 		(void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
 	} else {
 		/*
 		 * The reflow was cancelled or paused.  The txg_wait_synced()
 		 * above has already waited for all copy zio's to complete
 		 * and for all the raidz_reflow_sync() synctasks to be run;
 		 * log the pause and, if something failed, rewind.
 		 */
 		spa_history_log_internal(spa, "reflow pause",
 		    NULL, "offset=%llu failed_offset=%lld",
 		    (long long)vre->vre_offset,
 		    (long long)vre->vre_failed_offset);
 		mutex_enter(&vre->vre_lock);
 		if (vre->vre_failed_offset != UINT64_MAX) {
 			/*
 			 * Reset progress so that we will retry everything
 			 * after the point that something failed.
 			 */
 			vre->vre_offset = vre->vre_failed_offset;
 			vre->vre_failed_offset = UINT64_MAX;
 			vre->vre_waiting_for_resilver = B_TRUE;
 		}
 		mutex_exit(&vre->vre_lock);
 	}
 }
 
 /*
  * Create the pool's "raidz_expand" zthr.  The pool must not already have
  * one.
  */
 void
 spa_start_raidz_expansion_thread(spa_t *spa)
 {
 	ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL);
 
 	zthr_t *expand_zthr = zthr_create("raidz_expand",
 	    spa_raidz_expand_thread_check, spa_raidz_expand_thread,
 	    spa, defclsyspri);
 
 	spa->spa_raidz_expand_zthr = expand_zthr;
 }
 
 /*
  * Resume a raidz expansion that is waiting for a resilver, once the DTL of
  * the expanding vdev has been reassessed and no child is being replaced.
  */
 void
 raidz_dtl_reassessed(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 
 	/* Nothing to do when the pool has no expansion. */
 	if (vre == NULL)
 		return;
 
 	/*
 	 * we get called often from vdev_dtl_reassess() so make
 	 * sure it's our vdev and any replacing is complete
 	 */
 	if (vd->vdev_top->vdev_id != vre->vre_vdev_id)
 		return;
 	if (vdev_raidz_expand_child_replacing(vd->vdev_top))
 		return;
 
 	mutex_enter(&vre->vre_lock);
 	if (vre->vre_waiting_for_resilver) {
 		vdev_dbgmsg(vd, "DTL reassessed, "
 		    "continuing raidz expansion");
 		vre->vre_waiting_for_resilver = B_FALSE;
 		zthr_wakeup(spa->spa_raidz_expand_zthr);
 	}
 	mutex_exit(&vre->vre_lock);
 }
 
 /*
  * Check whether new_child can be attached to its raidz parent.  Returns 0
  * when the attach is allowed, or EINVAL when the scratch space is too small.
  */
 int
 vdev_raidz_attach_check(vdev_t *new_child)
 {
 	vdev_t *raidvd = new_child->vdev_parent;
 	uint64_t new_children = raidvd->vdev_children;
 
 	/*
 	 * We use the "boot" space as scratch space to handle overwriting the
 	 * initial part of the vdev.  If it is too small, then this expansion
 	 * is not allowed.  This would be very unusual (e.g. ashift > 13 and
 	 * >200 children).
 	 */
 	uint64_t needed = new_children << raidvd->vdev_ashift;
 
 	return (needed > VDEV_BOOT_SIZE ? EINVAL : 0);
 }
 
 /*
  * Sync-context half of attaching a new child to a raidz vdev: account for
  * the wider vdev, initialize the expansion state, wake the expansion thread
  * and persist the expansion state and start time in the top-level vdev ZAP.
  * arg is the newly attached child vdev.
  */
 void
 vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
 {
 	vdev_t *new_child = arg;
 	spa_t *spa = new_child->vdev_spa;
 	vdev_t *raidvd = new_child->vdev_parent;
 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
 	vdev_raidz_expand_t *vre = &vdrz->vn_vre;
 
 	/* The new child must be the last child of a top-level raidz vdev. */
 	ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
 	ASSERT3P(raidvd->vdev_top, ==, raidvd);
 	ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
 	ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
 	ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
 	    new_child);
 
 	spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);
 
 	vdrz->vd_physical_width++;
 
 	/* No reflow info may be present in the uberblock at this point. */
 	VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
 	vre->vre_vdev_id = raidvd->vdev_id;
 	vre->vre_offset = 0;
 	vre->vre_failed_offset = UINT64_MAX;
 	spa->spa_raidz_expand = vre;
 	zthr_wakeup(spa->spa_raidz_expand_zthr);
 
 	/*
 	 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
 	 * written to the config.
 	 */
 	vdev_config_dirty(raidvd);
 
 	vre->vre_start_time = gethrestime_sec();
 	vre->vre_end_time = 0;
 	vre->vre_state = DSS_SCANNING;
 	vre->vre_bytes_copied = 0;
 
 	/* Persist state and start time; drop stale end time / byte count. */
 	uint64_t state = vre->vre_state;
 	VERIFY0(zap_update(spa->spa_meta_objset,
 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
 	    sizeof (state), 1, &state, tx));
 
 	uint64_t start_time = vre->vre_start_time;
 	VERIFY0(zap_update(spa->spa_meta_objset,
 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
 	    sizeof (start_time), 1, &start_time, tx));
 
 	(void) zap_remove(spa->spa_meta_objset,
 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
 	(void) zap_remove(spa->spa_meta_objset,
 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);
 
 	spa_history_log_internal(spa, "raidz vdev expansion started",  tx,
 	    "%s vdev %llu new width %llu", spa_name(spa),
 	    (unsigned long long)raidvd->vdev_id,
 	    (unsigned long long)raidvd->vdev_children);
 }
 
 /*
  * Load the persisted expansion state, start/end time and bytes copied from
  * the top-level vdev ZAP into the in-core expansion state.  Entries that are
  * missing (ENOENT) keep their defaults; any other lookup error is returned.
  */
 int
 vdev_raidz_load(vdev_t *vd)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 
 	uint64_t state = DSS_NONE;
 	uint64_t start_time = 0;
 	uint64_t end_time = 0;
 	uint64_t bytes_copied = 0;
 
 	if (vd->vdev_top_zap != 0) {
 		/* ZAP entry name paired with the variable that receives it. */
 		const struct {
 			const char *name;
 			uint64_t *valp;
 		} props[] = {
 			{ VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, &state },
 			{ VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, &start_time },
 			{ VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, &end_time },
 			{ VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
 			    &bytes_copied },
 		};
 
 		for (int p = 0; p < sizeof (props) / sizeof (props[0]); p++) {
 			int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 			    vd->vdev_top_zap, props[p].name,
 			    sizeof (*props[p].valp), 1, props[p].valp);
 			if (err != 0 && err != ENOENT)
 				return (err);
 		}
 	}
 
 	/*
 	 * If we are in the middle of expansion, vre_state should have
 	 * already been set by vdev_raidz_init().
 	 */
 	EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
 	vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
 	vdrz->vn_vre.vre_start_time = start_time;
 	vdrz->vn_vre.vre_end_time = end_time;
 	vdrz->vn_vre.vre_bytes_copied = bytes_copied;
 
 	return (0);
 }
 
 /*
  * Fill in *pres with the statistics of the pool's current raidz expansion
  * or, when none is in progress, of the most recently completed one.
  * Returns ENOENT if neither exists, otherwise 0.
  */
 int
 spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
 {
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 
 	if (vre == NULL) {
 		/*
 		 * No expansion in progress; pick the raidz top-level vdev
 		 * with the latest non-zero end time.
 		 */
 		vdev_t *rvd = spa->spa_root_vdev;
 
 		for (int c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *cvd = rvd->vdev_child[c];
 
 			if (cvd->vdev_ops != &vdev_raidz_ops)
 				continue;
 
 			vdev_raidz_t *vdrz = cvd->vdev_tsd;
 			if (vdrz->vn_vre.vre_end_time == 0)
 				continue;
 
 			if (vre == NULL ||
 			    vdrz->vn_vre.vre_end_time > vre->vre_end_time)
 				vre = &vdrz->vn_vre;
 		}
 
 		if (vre == NULL)
 			return (SET_ERROR(ENOENT));
 	}
 
 	pres->pres_state = vre->vre_state;
 	pres->pres_expanding_vdev = vre->vre_vdev_id;
 
 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
 
 	/* Bytes copied so far plus the per-txg counts, read under vre_lock. */
 	mutex_enter(&vre->vre_lock);
 	pres->pres_reflowed = vre->vre_bytes_copied;
 	int t = 0;
 	while (t < TXG_SIZE) {
 		pres->pres_reflowed += vre->vre_bytes_copied_pertxg[t];
 		t++;
 	}
 	mutex_exit(&vre->vre_lock);
 
 	pres->pres_start_time = vre->vre_start_time;
 	pres->pres_end_time = vre->vre_end_time;
 	pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;
 
 	return (0);
 }
 
 /*
  * Build the RAIDZ private data (vdev_raidz_t) from the vdev's config nvlist
  * and hand it back through *tsd.  Returns EINVAL when the children array is
  * missing or the parity setting is not acceptable for the pool version.
  */
 static int
 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
 {
 	uint_t children;
 	nvlist_t **child;
 	int error = nvlist_lookup_nvlist_array(nv,
 	    ZPOOL_CONFIG_CHILDREN, &child, &children);
 	if (error != 0)
 		return (SET_ERROR(EINVAL));
 
 	uint64_t nparity;
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) != 0) {
 		/*
 		 * We require the parity to be specified for SPAs that
 		 * support multiple parity levels.
 		 */
 		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
 			return (SET_ERROR(EINVAL));
 
 		/*
 		 * Otherwise, we default to 1 parity device for RAID-Z.
 		 */
 		nparity = 1;
 	} else {
 		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
 			return (SET_ERROR(EINVAL));
 
 		/*
 		 * Previous versions could only support 1 or 2 parity
 		 * device.
 		 */
 		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
 			return (SET_ERROR(EINVAL));
 		if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
 			return (SET_ERROR(EINVAL));
 	}
 
 	vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
 	vdev_raidz_expand_t *vre = &vdrz->vn_vre;
 
 	/* Expansion state starts out as "no vdev, no progress, no failure". */
 	vre->vre_vdev_id = -1;
 	vre->vre_offset = UINT64_MAX;
 	vre->vre_failed_offset = UINT64_MAX;
 	mutex_init(&vre->vre_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vre->vre_cv, NULL, CV_DEFAULT, NULL);
 	zfs_rangelock_init(&vre->vre_rangelock, NULL, NULL);
 	mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
 	    sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));
 
 	vdrz->vd_physical_width = children;
 	vdrz->vd_nparity = nparity;
 
 	/* note, the ID does not exist when creating a pool */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
 	    &vre->vre_vdev_id);
 
 	boolean_t reflow_in_progress =
 	    nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
 	if (reflow_in_progress) {
 		spa->spa_raidz_expand = vre;
 		vre->vre_state = DSS_SCANNING;
 	}
 
 	/*
 	 * Rebuild the tree of prior expansions.  The txg array is consumed
 	 * from its last entry to its first, each entry being one child
 	 * narrower than the previous; everything is one narrower still
 	 * while a reflow is in progress.
 	 */
 	vdrz->vd_original_width = children;
 	uint64_t *txgs;
 	unsigned int txgs_size = 0;
 	error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
 	    &txgs, &txgs_size);
 	if (error == 0) {
 		for (int i = 0; i < txgs_size; i++) {
 			reflow_node_t *node =
 			    kmem_zalloc(sizeof (*node), KM_SLEEP);
 
 			node->re_txg = txgs[txgs_size - i - 1];
 			node->re_logical_width = vdrz->vd_physical_width - i;
 			if (reflow_in_progress)
 				node->re_logical_width--;
 
 			avl_add(&vdrz->vd_expand_txgs, node);
 		}
 
 		vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
 	}
 	if (reflow_in_progress) {
 		vdrz->vd_original_width--;
 		zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
 		    children, txgs_size);
 	}
 
 	*tsd = vdrz;
 
 	return (0);
 }
 
 /*
  * Tear down the RAIDZ private data hanging off vd->vdev_tsd: the tree of
  * expansion records, the locks, the condition variable, the range lock,
  * and finally the vdev_raidz_t itself.
  */
 static void
 vdev_raidz_fini(vdev_t *vd)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	void *cookie = NULL;
 	reflow_node_t *node;
 
 	/* Clear the pool-wide expansion pointer if it refers to this vdev. */
 	if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
 		vd->vdev_spa->spa_raidz_expand = NULL;
 
 	/* Free every expansion record, then the (now empty) tree. */
 	for (;;) {
 		node = avl_destroy_nodes(&vdrz->vd_expand_txgs, &cookie);
 		if (node == NULL)
 			break;
 		kmem_free(node, sizeof (*node));
 	}
 	avl_destroy(&vdrz->vd_expand_txgs);
 
 	mutex_destroy(&vdrz->vd_expand_lock);
 	mutex_destroy(&vdrz->vn_vre.vre_lock);
 	cv_destroy(&vdrz->vn_vre.vre_cv);
 	zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
 	kmem_free(vdrz, sizeof (*vdrz));
 }
 
 /*
  * Add RAIDZ specific fields to the config nvlist: the parity count, a flag
  * while an expansion is scanning, and the list of expansion txgs (if any).
  */
 static void
 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
 {
 	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 
 	/*
 	 * Make sure someone hasn't managed to sneak a fancy new vdev
 	 * into a crufty old storage pool.
 	 */
 	ASSERT(vdrz->vd_nparity == 1 ||
 	    (vdrz->vd_nparity <= 2 &&
 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
 	    (vdrz->vd_nparity <= 3 &&
 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
 
 	/*
 	 * The parity is recorded even on pools that do not strictly need
 	 * it; older software simply ignores the entry.
 	 */
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
 
 	if (vdrz->vn_vre.vre_state == DSS_SCANNING)
 		fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
 
 	/* Snapshot the expansion txgs into a flat array under the lock. */
 	mutex_enter(&vdrz->vd_expand_lock);
 	if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
 		const uint64_t ntxgs = avl_numnodes(&vdrz->vd_expand_txgs);
 		const uint64_t nbytes = sizeof (uint64_t) * ntxgs;
 		uint64_t *txg_list = kmem_alloc(nbytes, KM_SLEEP);
 		uint64_t filled = 0;
 
 		reflow_node_t *node = avl_first(&vdrz->vd_expand_txgs);
 		while (node != NULL) {
 			txg_list[filled++] = node->re_txg;
 			node = AVL_NEXT(&vdrz->vd_expand_txgs, node);
 		}
 
 		fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
 		    txg_list, ntxgs);
 
 		kmem_free(txg_list, nbytes);
 	}
 	mutex_exit(&vdrz->vd_expand_lock);
 }
 
 /*
  * Return the parity count stored in this vdev's RAIDZ private data.
  */
 static uint64_t
 vdev_raidz_nparity(vdev_t *vd)
 {
 	return (((vdev_raidz_t *)vd->vdev_tsd)->vd_nparity);
 }
 
 /*
  * Return the number of disks in this RAIDZ vdev, i.e. its child count.
  */
 static uint64_t
 vdev_raidz_ndisks(vdev_t *vd)
 {
 	uint64_t ndisks = vd->vdev_children;
 
 	return (ndisks);
 }
 
 /*
  * Operations vector for the RAID-Z vdev type.  Entries set to NULL are
  * operations for which this table supplies no handler.
  */
 vdev_ops_t vdev_raidz_ops = {
 	.vdev_op_init = vdev_raidz_init,
 	.vdev_op_fini = vdev_raidz_fini,
 	.vdev_op_open = vdev_raidz_open,
 	.vdev_op_close = vdev_raidz_close,
 	.vdev_op_asize = vdev_raidz_asize,
 	.vdev_op_min_asize = vdev_raidz_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_raidz_io_start,
 	.vdev_op_io_done = vdev_raidz_io_done,
 	.vdev_op_state_change = vdev_raidz_state_change,
 	.vdev_op_need_resilver = vdev_raidz_need_resilver,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_raidz_xlate,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = vdev_raidz_config_generate,
 	.vdev_op_nparity = vdev_raidz_nparity,
 	.vdev_op_ndisks = vdev_raidz_ndisks,
 	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
 };
 
 /*
  * Module parameters for RAID-Z expansion.  The last argument of each
  * registration is its description string.
  * NOTE(review): ZMOD_RW presumably marks a parameter as writable at
  * runtime -- confirm against the ZFS_MODULE_PARAM definition.
  */
 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
 	"For testing, pause RAIDZ expansion after reflowing this many bytes");
 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
 	"Max amount of concurrent i/o for RAIDZ expansion");
 ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
 	"For expanded RAIDZ, aggregate reads that have more rows than this");
 ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
 	"For expanded RAIDZ, automatically start a pool scrub when expansion "
 	"completes");
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index e1819448a98a..1970c5425854 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -1,2571 +1,2571 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/zap.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/uberblock_impl.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
 #include <sys/bpobj.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_dir.h>
 #include <sys/arc.h>
 #include <sys/zfeature.h>
 #include <sys/vdev_indirect_births.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/abd.h>
 #include <sys/vdev_initialize.h>
 #include <sys/vdev_trim.h>
 #include <sys/trace_zfs.h>
 
 /*
  * This file contains the necessary logic to remove vdevs from a
  * storage pool.  Currently, the only devices that can be removed
  * are log, cache, and spare devices; and top level vdevs from a pool
  * w/o raidz or mirrors.  (Note that members of a mirror can be removed
  * by the detach operation.)
  *
  * Log vdevs are removed by evacuating them and then turning the vdev
  * into a hole vdev while holding spa config locks.
  *
  * Top level vdevs are removed and converted into an indirect vdev via
  * a multi-step process:
  *
  *  - Disable allocations from this device (spa_vdev_remove_top).
  *
  *  - From a new thread (spa_vdev_remove_thread), copy data from
  *    the removing vdev to a different vdev.  The copy happens in open
  *    context (spa_vdev_copy_impl) and issues a sync task
  *    (vdev_mapping_sync) so the sync thread can update the partial
  *    indirect mappings in core and on disk.
  *
  *  - If a free happens during a removal, it is freed from the
  *    removing vdev, and if it has already been copied, from the new
  *    location as well (free_from_removing_vdev).
  *
  *  - After the removal is completed, the copy thread converts the vdev
  *    into an indirect vdev (vdev_remove_complete) before instructing
  *    the sync thread to destroy the space maps and finish the removal
  *    (spa_finish_removal).
  */
 
 /*
  * Bookkeeping for the copy phase of a device removal.
  * NOTE(review): the code that uses this struct is outside this part of
  * the file; the per-field notes below are inferred from the field names
  * and should be confirmed against those users.
  */
 typedef struct vdev_copy_arg {
 	metaslab_t	*vca_msp;	/* presumably the metaslab being copied */
 	uint64_t	vca_outstanding_bytes;	/* presumably bytes in flight */
 	uint64_t	vca_read_error_bytes;	/* presumably failed-read bytes */
 	uint64_t	vca_write_error_bytes;	/* presumably failed-write bytes */
 	kcondvar_t	vca_cv;
 	kmutex_t	vca_lock;
 } vdev_copy_arg_t;
 
 /*
  * The maximum amount of memory we can use for outstanding i/o while
  * doing a device removal.  This determines how much i/o we can have
  * in flight concurrently.
  */
 static const uint_t zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
 
 /*
  * The largest contiguous segment that we will attempt to allocate when
  * removing a device.  This can be no larger than SPA_MAXBLOCKSIZE.  If
  * there is a performance problem with attempting to allocate large blocks,
  * consider decreasing this.
  *
  * See also the accessor function spa_remove_max_segment().
  */
 uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
 
 /*
  * Ignore hard IO errors during device removal.  When this is set and a
  * device encounters a hard IO error during the removal process, the
  * removal will not be cancelled.  This can result in a normally
  * recoverable block becoming permanently damaged and is not recommended.
  */
 static int zfs_removal_ignore_errors = 0;
 
 /*
  * Allow a remap segment to span free chunks of at most this size. The main
  * impact of a larger span is that we will read and write larger, more
  * contiguous chunks, with more "unnecessary" data -- trading off bandwidth
  * for iops.  The value here was chosen to align with
  * zfs_vdev_read_gap_limit, which is a similar concept when doing regular
  * reads (but there's no reason it has to be the same).
  *
  * Additionally, a higher span will have the following relatively minor
  * effects:
  *  - the mapping will be smaller, since one entry can cover more allocated
  *    segments
  *  - more of the fragmentation in the removing device will be preserved
  *  - we'll do larger allocations, which may fail and fall back on smaller
  *    allocations
  */
 uint_t vdev_removal_max_span = 32 * 1024;
 
 /*
  * This is used by the test suite so that it can ensure that certain
  * actions happen while in the middle of a removal.
  */
 int zfs_removal_suspend_progress = 0;
 
 /*
  * NOTE(review): name string whose users are outside this part of the
  * file -- confirm what it keys.
  */
 #define	VDEV_REMOVAL_ZAP_OBJS	"lzap"
 
 /*
  * Forward declarations.  spa_vdev_remove_thread() is handed to
  * thread_create() below, before its definition appears.
  */
 static __attribute__((noreturn)) void spa_vdev_remove_thread(void *arg);
 static int spa_vdev_remove_cancel_impl(spa_t *spa);
 
 /*
  * Write the in-core spa_removing_phys to the DMU_POOL_REMOVING entry of the
  * pool directory object in the MOS, as part of tx.  The struct is stored as
  * an array of uint64_t values, so the entry count is its size divided by
  * sizeof (uint64_t).  A zap_update() failure is fatal (VERIFY0).
  */
 static void
 spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx)
 {
 	VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_REMOVING, sizeof (uint64_t),
 	    sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
 	    &spa->spa_removing_phys, tx));
 }
 
 /*
  * Scan the first 'count' nvlists in nvpp and return the first one whose
  * ZPOOL_CONFIG_GUID equals target_guid, or NULL if none does.
  */
 static nvlist_t *
 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
 {
 	nvlist_t *found = NULL;
 	int idx = 0;
 
 	while (found == NULL && idx < count) {
 		nvlist_t *candidate = nvpp[idx++];
 
 		if (fnvlist_lookup_uint64(candidate, ZPOOL_CONFIG_GUID) ==
 		    target_guid)
 			found = candidate;
 	}
 
 	return (found);
 }
 
 /*
  * Re-enable allocations on a non-allocating, non-log vdev: activate its
  * metaslab groups, remove its space from the pool's non-allocating total,
  * and clear vdev_noalloc.
  */
 static void
 vdev_activate(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	uint64_t vdev_space;
 
 	/* Use the deflated size when the pool accounts space that way. */
 	if (spa_deflate(spa))
 		vdev_space = vd->vdev_stat.vs_dspace;
 	else
 		vdev_space = vd->vdev_stat.vs_space;
 
 	ASSERT(!vd->vdev_islog);
 	ASSERT(vd->vdev_noalloc);
 
 	metaslab_group_activate(vd->vdev_mg);
 	metaslab_group_activate(vd->vdev_log_mg);
 
 	ASSERT3U(spa->spa_nonallocating_dspace, >=, vdev_space);
 
 	spa->spa_nonallocating_dspace -= vdev_space;
 	vd->vdev_noalloc = B_FALSE;
 }
 
 /*
  * Stop new allocations on top-level vdev 'vd'.
  *
  * The vdev's metaslab groups are passivated, the vdev config lock is
  * dropped and re-taken (spa_vdev_config_exit/enter) while waiting for a
  * later txg, the logs are reset, and finally the vdev is marked
  * non-allocating and its space is added to spa_nonallocating_dspace.
  *
  * '*txg' is read to compute the txg passed to spa_vdev_config_exit() and is
  * overwritten with the value returned by spa_vdev_config_enter().
  *
  * Returns 0 on success; EINVAL if vd is in the normal class and no other
  * non-indirect normal-class top-level vdev is still allocating; or the
  * error from spa_reset_logs(), in which case the groups are re-activated.
  */
 static int
 vdev_passivate(vdev_t *vd, uint64_t *txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	int error;
 
 	ASSERT(!vd->vdev_noalloc);
 
 	vdev_t *rvd = spa->spa_root_vdev;
 	metaslab_group_t *mg = vd->vdev_mg;
 	metaslab_class_t *normal = spa_normal_class(spa);
 	if (mg->mg_class == normal) {
 		/*
 		 * We must check that this is not the only allocating device in
 		 * the pool before passivating, otherwise we will not be able
 		 * to make progress because we can't allocate from any vdevs.
 		 */
 		boolean_t last = B_TRUE;
 		for (uint64_t id = 0; id < rvd->vdev_children; id++) {
 			vdev_t *cvd = rvd->vdev_child[id];
 
 			/* Skip ourselves and already-removed (indirect) vdevs. */
 			if (cvd == vd ||
 			    cvd->vdev_ops == &vdev_indirect_ops)
 				continue;
 
 			/* Only vdevs in the normal class count. */
 			metaslab_class_t *mc = cvd->vdev_mg->mg_class;
 			if (mc != normal)
 				continue;
 
 			if (!cvd->vdev_noalloc) {
 				last = B_FALSE;
 				break;
 			}
 		}
 		if (last)
 			return (SET_ERROR(EINVAL));
 	}
 
 	metaslab_group_passivate(mg);
 	ASSERT(!vd->vdev_islog);
 	metaslab_group_passivate(vd->vdev_log_mg);
 
 	/*
 	 * Wait for the youngest allocations and frees to sync,
 	 * and then wait for the deferral of those frees to finish.
 	 */
 	spa_vdev_config_exit(spa, NULL,
 	    *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
 
 	/*
 	 * We must ensure that no "stubby" log blocks are allocated
 	 * on the device to be removed.  These blocks could be
 	 * written at any time, including while we are in the middle
 	 * of copying them.
 	 */
 	error = spa_reset_logs(spa);
 
 	*txg = spa_vdev_config_enter(spa);
 
 	if (error != 0) {
 		/* Undo the passivation above before reporting the failure. */
 		metaslab_group_activate(mg);
 		ASSERT(!vd->vdev_islog);
 		/*
 		 * NOTE(review): vdev_log_mg is NULL-checked here but not at
 		 * the metaslab_group_passivate() call above -- confirm
 		 * whether it can be NULL at this point.
 		 */
 		if (vd->vdev_log_mg != NULL)
 			metaslab_group_activate(vd->vdev_log_mg);
 		return (error);
 	}
 
 	spa->spa_nonallocating_dspace += spa_deflate(spa) ?
 	    vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
 	vd->vdev_noalloc = B_TRUE;
 
 	return (0);
 }
 
 /*
  * Turn off allocations for a top-level device from the pool.
  *
  * This can take a significant amount of time, so the
  * spa_vdev_config_[enter/exit] functions are used (inside
  * vdev_passivate()) to drop and re-take the spa_config_lock while the
  * namespace lock stays held.  During each step the configuration is
  * synced out.
  *
  * Returns ENOENT if no vdev has the given guid, ZFS_ERR_VDEV_NOTSUP if the
  * vdev has no metaslab group, or otherwise whatever spa_vdev_exit()
  * returns for the accumulated error.
  */
 int
 spa_vdev_noalloc(spa_t *spa, uint64_t guid)
 {
 	int error = 0;
 
 	ASSERT(!MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_writeable(spa));
 
 	uint64_t txg = spa_vdev_enter(spa);
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	if (vd == NULL) {
 		error = SET_ERROR(ENOENT);
 	} else if (vd->vdev_mg == NULL) {
 		error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP);
 	} else if (!vd->vdev_noalloc) {
 		/* Already non-allocating vdevs need no further work. */
 		error = vdev_passivate(vd, &txg);
 	}
 
 	if (error == 0) {
 		vdev_dirty_leaves(vd, VDD_DTL, txg);
 		vdev_config_dirty(vd);
 	}
 
 	return (spa_vdev_exit(spa, NULL, txg, error));
 }
 
 /*
  * Turn allocations back on for a top-level device.
  *
  * Returns ENOENT if no vdev has the given guid and ZFS_ERR_VDEV_NOTSUP if
  * the vdev has no metaslab group.  A vdev that is being removed, or that
  * is already allocating, is left untouched and 0 is returned.
  */
 int
 spa_vdev_alloc(spa_t *spa, uint64_t guid)
 {
 	vdev_t *vd;
 	uint64_t txg;
 	int error = 0;
 
 	ASSERT(!MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	/*
 	 * vdev_activate() requires vdev_noalloc to be set and subtracts the
 	 * vdev's space from spa_nonallocating_dspace unconditionally, so
 	 * only call it for a vdev that really is non-allocating.  This
 	 * mirrors the !vdev_noalloc guard in spa_vdev_noalloc().
 	 */
 	if (vd == NULL)
 		error = SET_ERROR(ENOENT);
 	else if (vd->vdev_mg == NULL)
 		error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP);
 	else if (!vd->vdev_removing && vd->vdev_noalloc)
 		vdev_activate(vd);
 
 	if (error == 0) {
 		vdev_dirty_leaves(vd, VDD_DTL, txg);
 		vdev_config_dirty(vd);
 	}
 
 	(void) spa_vdev_exit(spa, NULL, txg, error);
 
 	return (error);
 }
 
 /*
  * Replace the nvlist array 'name' in 'config' with a copy of dev[0..count)
  * that omits dev_to_remove.  The temporary duplicates are freed once the
  * new array has been added to config.
  */
 static void
 spa_vdev_remove_aux(nvlist_t *config, const char *name, nvlist_t **dev,
     int count, nvlist_t *dev_to_remove)
 {
 	const int remaining = count - 1;
 	nvlist_t **newdev = NULL;
 	int j = 0;
 
 	if (remaining > 0)
 		newdev = kmem_alloc(remaining * sizeof (void *), KM_SLEEP);
 
 	/* Duplicate every entry except the one being removed. */
 	for (int i = 0; i < count; i++) {
 		if (dev[i] != dev_to_remove) {
 			VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
 		}
 	}
 
 	VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
 	fnvlist_add_nvlist_array(config, name, (const nvlist_t * const *)newdev,
 	    remaining);
 
 	for (int i = 0; i < remaining; i++)
 		nvlist_free(newdev[i]);
 
 	if (remaining > 0)
 		kmem_free(newdev, remaining * sizeof (void *));
 }
 
 /*
  * Allocate and initialize the in-core removal state for vd: its lock and
  * condition variable, the tree of allocated segments, and one frees tree
  * plus one new-segments list per TXG_SIZE slot.  The caller owns the
  * result and releases it with spa_vdev_removal_destroy().
  */
 static spa_vdev_removal_t *
 spa_vdev_removal_create(vdev_t *vd)
 {
 	spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP);
 
 	svr->svr_vdev_id = vd->vdev_id;
 	mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
 	svr->svr_allocd_segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
 	    NULL, 0, 0);
 
 	int t = 0;
 	while (t < TXG_SIZE) {
 		svr->svr_frees[t] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
 		    NULL, 0, 0);
 		list_create(&svr->svr_new_segments[t],
 		    sizeof (vdev_indirect_mapping_entry_t),
 		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
 		t++;
 	}
 
 	return (svr);
 }
 
 /*
  * Free a spa_vdev_removal_t.  Every per-txg slot is expected to have no
  * bytes-done count and no pending max offset (asserted) before its frees
  * tree and new-segments list are destroyed.
  */
 void
 spa_vdev_removal_destroy(spa_vdev_removal_t *svr)
 {
 	int i = 0;
 
 	while (i < TXG_SIZE) {
 		ASSERT0(svr->svr_bytes_done[i]);
 		ASSERT0(svr->svr_max_offset_to_sync[i]);
 		zfs_range_tree_destroy(svr->svr_frees[i]);
 		list_destroy(&svr->svr_new_segments[i]);
 		i++;
 	}
 
 	zfs_range_tree_destroy(svr->svr_allocd_segs);
 
 	mutex_destroy(&svr->svr_lock);
 	cv_destroy(&svr->svr_cv);
 
 	kmem_free(svr, sizeof (*svr));
 }
 
 /*
  * This is called as a synctask in the txg in which we will mark this vdev
  * as removing (in the config stored in the MOS).
  *
  * It begins the evacuation of a toplevel vdev by:
  * - initializing the spa_removing_phys which tracks this removal
  * - computing the amount of space to remove for accounting purposes
  * - dirtying all dbufs in the spa_config_object
  * - creating the spa_vdev_removal
  * - starting the spa_vdev_remove_thread
  *
  * 'arg' carries the id of the top-level vdev to remove, cast to a pointer;
  * 'tx' is the transaction of the syncing txg.
  */
 static void
 vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
 {
 	int vdev_id = (uintptr_t)arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
 	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 	objset_t *mos = spa->spa_dsl_pool->dp_meta_objset;
 	spa_vdev_removal_t *svr = NULL;
 	uint64_t txg __maybe_unused = dmu_tx_get_txg(tx);
 
 	ASSERT0(vdev_get_nparity(vd));
 	svr = spa_vdev_removal_create(vd);
 
 	ASSERT(vd->vdev_removing);
 	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
 
 	spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
 		/*
 		 * By activating the OBSOLETE_COUNTS feature, we prevent
 		 * the pool from being downgraded and ensure that the
 		 * refcounts are precise.
 		 */
 		spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
 		uint64_t one = 1;
 		VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1,
 		    &one, tx));
 		boolean_t are_precise __maybe_unused;
 		ASSERT0(vdev_obsolete_counts_are_precise(vd, &are_precise));
 		ASSERT3B(are_precise, ==, B_TRUE);
 	}
 
 	/* Create the on-disk mapping and births objects and open them. */
 	vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx);
 	vd->vdev_indirect_mapping =
 	    vdev_indirect_mapping_open(mos, vic->vic_mapping_object);
 	vic->vic_births_object = vdev_indirect_births_alloc(mos, tx);
 	vd->vdev_indirect_births =
 	    vdev_indirect_births_open(mos, vic->vic_births_object);
 	/* Reset the persistent removal state for this new removal. */
 	spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id;
 	spa->spa_removing_phys.sr_start_time = gethrestime_sec();
 	spa->spa_removing_phys.sr_end_time = 0;
 	spa->spa_removing_phys.sr_state = DSS_SCANNING;
 	spa->spa_removing_phys.sr_to_copy = 0;
 	spa->spa_removing_phys.sr_copied = 0;
 
 	/*
 	 * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because
 	 * there may be space in the defer tree, which is free, but still
 	 * counted in vs_alloc.
 	 */
 	for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
 		metaslab_t *ms = vd->vdev_ms[i];
 		if (ms->ms_sm == NULL)
 			continue;
 
 		spa->spa_removing_phys.sr_to_copy +=
 		    metaslab_allocated_space(ms);
 
 		/*
 		 * Space which we are freeing this txg does not need to
 		 * be copied.
 		 */
 		spa->spa_removing_phys.sr_to_copy -=
 		    zfs_range_tree_space(ms->ms_freeing);
 
 		ASSERT0(zfs_range_tree_space(ms->ms_freed));
 		for (int t = 0; t < TXG_SIZE; t++)
 			ASSERT0(zfs_range_tree_space(ms->ms_allocating[t]));
 	}
 
 	/*
 	 * Sync tasks are called before metaslab_sync(), so there should
 	 * be no already-synced metaslabs in the TXG_CLEAN list.
 	 */
 	ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL);
 
 	spa_sync_removing_state(spa, tx);
 
 	/*
 	 * All blocks that we need to read the most recent mapping must be
 	 * stored on concrete vdevs.  Therefore, we must dirty anything that
 	 * is read before spa_remove_init().  Specifically, the
 	 * spa_config_object.  (Note that although we already modified the
 	 * spa_config_object in spa_sync_removing_state, that may not have
 	 * modified all blocks of the object.)
 	 *
 	 * NOTE(review): the object dirtied below (and updated by
 	 * spa_sync_removing_state) is DMU_POOL_DIRECTORY_OBJECT; confirm
 	 * that "spa_config_object" above is meant to refer to it.
 	 */
 	dmu_object_info_t doi;
 	VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi));
 	/* Walk the object one dbuf at a time, advancing by each dbuf's size. */
 	for (uint64_t offset = 0; offset < doi.doi_max_offset; ) {
 		dmu_buf_t *dbuf;
 		VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT,
 		    offset, FTAG, &dbuf, 0));
 		dmu_buf_will_dirty(dbuf, tx);
 		offset += dbuf->db_size;
 		dmu_buf_rele(dbuf, FTAG);
 	}
 
 	/*
 	 * Now that we've allocated the im_object, dirty the vdev to ensure
 	 * that the object gets written to the config on disk.
 	 */
 	vdev_config_dirty(vd);
 
 	zfs_dbgmsg("starting removal thread for vdev %llu (%px) in txg %llu "
 	    "im_obj=%llu", (u_longlong_t)vd->vdev_id, vd,
 	    (u_longlong_t)dmu_tx_get_txg(tx),
 	    (u_longlong_t)vic->vic_mapping_object);
 
 	spa_history_log_internal(spa, "vdev remove started", tx,
 	    "%s vdev %llu %s", spa_name(spa), (u_longlong_t)vd->vdev_id,
 	    (vd->vdev_path != NULL) ? vd->vdev_path : "-");
 	/*
 	 * Setting spa_vdev_removal causes subsequent frees to call
 	 * free_from_removing_vdev().  Note that we don't need any locking
 	 * because we are the sync thread, and metaslab_free_impl() is only
 	 * called from syncing context (potentially from a zio taskq thread,
 	 * but in any case only when there are outstanding free i/os, which
 	 * there are not).
 	 */
 	ASSERT3P(spa->spa_vdev_removal, ==, NULL);
 	spa->spa_vdev_removal = svr;
 	svr->svr_thread = thread_create(NULL, 0,
 	    spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri);
 }
 
 /*
  * When we are opening a pool, we must read the mapping for each
  * indirect vdev in order from most recently removed to least
  * recently removed.  We do this because the blocks for the mapping
  * of older indirect vdevs may be stored on more recently removed vdevs.
  * In order to read each indirect mapping object, we must have
  * initialized all more recently removed vdevs.
  *
  * Returns 0 on success (including when the pool has no removal state at
  * all), the zap_lookup() error if the removal state cannot be read, or
  * EINVAL if the stored state names a top-level vdev that does not exist.
  */
 int
 spa_remove_init(spa_t *spa)
 {
 	int error;
 
 	error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_REMOVING, sizeof (uint64_t),
 	    sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
 	    &spa->spa_removing_phys);
 
 	if (error == ENOENT) {
 		/* No removal has ever been recorded for this pool. */
 		spa->spa_removing_phys.sr_state = DSS_NONE;
 		spa->spa_removing_phys.sr_removing_vdev = -1;
 		spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
 		spa->spa_indirect_vdevs_loaded = B_TRUE;
 		return (0);
 	} else if (error != 0) {
 		return (error);
 	}
 
 	if (spa->spa_removing_phys.sr_state == DSS_SCANNING) {
 		/*
 		 * We are currently removing a vdev.  Create and
 		 * initialize a spa_vdev_removal_t from the bonus
 		 * buffer of the removing vdevs vdev_im_object, and
 		 * initialize its partial mapping.
 		 */
 		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 		vdev_t *vd = vdev_lookup_top(spa,
 		    spa->spa_removing_phys.sr_removing_vdev);
 
 		if (vd == NULL) {
 			spa_config_exit(spa, SCL_STATE, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 
 		ASSERT(vdev_is_concrete(vd));
 		spa_vdev_removal_t *svr = spa_vdev_removal_create(vd);
 		ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id);
 		ASSERT(vd->vdev_removing);
 
 		vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
 		    spa->spa_meta_objset, vic->vic_mapping_object);
 		vd->vdev_indirect_births = vdev_indirect_births_open(
 		    spa->spa_meta_objset, vic->vic_births_object);
 		spa_config_exit(spa, SCL_STATE, FTAG);
 
 		spa->spa_vdev_removal = svr;
 	}
 
 	/*
 	 * Follow the chain of previously removed vdevs, newest first,
 	 * opening each one's mapping and births objects.
 	 */
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	uint64_t indirect_vdev_id =
 	    spa->spa_removing_phys.sr_prev_indirect_vdev;
 	while (indirect_vdev_id != UINT64_MAX) {
 		vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id);
 
 		/*
 		 * As with the removing vdev above, a chain entry that
 		 * names a nonexistent top-level vdev is reported as an
 		 * error rather than dereferenced.
 		 */
 		if (vd == NULL) {
 			spa_config_exit(spa, SCL_STATE, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 
 		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
 		vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
 		    spa->spa_meta_objset, vic->vic_mapping_object);
 		vd->vdev_indirect_births = vdev_indirect_births_open(
 		    spa->spa_meta_objset, vic->vic_births_object);
 
 		indirect_vdev_id = vic->vic_prev_indirect_vdev;
 	}
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	/*
 	 * Now that we've loaded all the indirect mappings, we can allow
 	 * reads from other blocks (e.g. via predictive prefetch).
 	 */
 	spa->spa_indirect_vdevs_loaded = B_TRUE;
 	return (0);
 }
 
 /*
  * Start the removal thread for a removal that is recorded in
  * spa->spa_vdev_removal but has no thread running.  Does nothing if there
  * is no removal, a thread already exists, or the pool is not writeable.
  */
 void
 spa_restart_removal(spa_t *spa)
 {
 	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
 
 	/*
 	 * In general there is no removal thread running when this
 	 * function is called.  The exception is spa_import(), where it is
 	 * called twice [once from spa_import_impl() and once from
 	 * spa_async_resume()]; the svr_thread check keeps an import of a
 	 * pool with an ongoing removal from spawning a second thread.
 	 */
 	if (svr == NULL || svr->svr_thread != NULL || !spa_writeable(spa))
 		return;
 
 	zfs_dbgmsg("restarting removal of %llu",
 	    (u_longlong_t)svr->svr_vdev_id);
 	svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa,
 	    0, &p0, TS_RUN, minclsyspri);
 }
 
 /*
  * Process freeing from a device which is in the middle of being removed.
  * We must handle this carefully so that we do not attempt to copy freed
  * data, and we correctly free already-copied data.
  *
  * The freed range [offset, offset + size) on 'vd' is split into up to three
  * parts, handled in order of increasing offset:
  *  - the part whose mapping is already synced, which is marked obsolete
  *    and freed from its new location after svr_lock is dropped;
  *  - the part being copied in an in-flight txg, which is recorded in that
  *    txg's svr_frees tree;
  *  - the part not yet visited, which is cleared from svr_allocd_segs.
  */
 void
 free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size)
 {
 	spa_t *spa = vd->vdev_spa;
 	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 	uint64_t txg = spa_syncing_txg(spa);
 	uint64_t max_offset_yet = 0;
 
 	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
 	ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
 	    vdev_indirect_mapping_object(vim));
 	ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);
 
 	mutex_enter(&svr->svr_lock);
 
 	/*
 	 * Remove the segment from the removing vdev's spacemap.  This
 	 * ensures that we will not attempt to copy this space (if the
 	 * removal thread has not yet visited it), and also ensures
 	 * that we know what is actually allocated on the new vdevs
 	 * (needed if we cancel the removal).
 	 *
 	 * Note: we must do the metaslab_free_concrete() with the svr_lock
 	 * held, so that the remove_thread can not load this metaslab and then
 	 * visit this offset between the time that we metaslab_free_concrete()
 	 * and when we check to see if it has been visited.
 	 *
 	 * Note: The checkpoint flag is set to false as having/taking
 	 * a checkpoint and removing a device can't happen at the same
 	 * time.
 	 */
 	ASSERT(!spa_has_checkpoint(spa));
 	metaslab_free_concrete(vd, offset, size, B_FALSE);
 
 	uint64_t synced_size = 0;
 	uint64_t synced_offset = 0;
 	uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim);
 	if (offset < max_offset_synced) {
 		/*
 		 * The mapping for this offset is already on disk.
 		 * Free from the new location.
 		 *
 		 * Note that we use svr_max_synced_offset because it is
 		 * updated atomically with respect to the in-core mapping.
 		 * By contrast, vim_max_offset is not.
 		 *
 		 * NOTE(review): the code above reads
 		 * vdev_indirect_mapping_max_offset(vim), not a field named
 		 * svr_max_synced_offset; the preceding note looks stale --
 		 * confirm.
 		 *
 		 * This block may be split between a synced entry and an
 		 * in-flight or unvisited entry.  Only process the synced
 		 * portion of it here.
 		 */
 		synced_size = MIN(size, max_offset_synced - offset);
 		synced_offset = offset;
 
 		ASSERT3U(max_offset_yet, <=, max_offset_synced);
 		max_offset_yet = max_offset_synced;
 
 		DTRACE_PROBE3(remove__free__synced,
 		    spa_t *, spa,
 		    uint64_t, offset,
 		    uint64_t, synced_size);
 
 		/* Advance past the synced portion; it is handled at the end. */
 		size -= synced_size;
 		offset += synced_size;
 	}
 
 	/*
 	 * Look at all in-flight txgs starting from the currently syncing one
 	 * and see if a section of this free is being copied. By starting from
 	 * this txg and iterating forward, we might find that this region
 	 * was copied in two different txgs and handle it appropriately.
 	 */
 	for (int i = 0; i < TXG_CONCURRENT_STATES; i++) {
 		int txgoff = (txg + i) & TXG_MASK;
 		if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) {
 			/*
 			 * The mapping for this offset is in flight, and
 			 * will be synced in txg+i.
 			 */
 			uint64_t inflight_size = MIN(size,
 			    svr->svr_max_offset_to_sync[txgoff] - offset);
 
 			DTRACE_PROBE4(remove__free__inflight,
 			    spa_t *, spa,
 			    uint64_t, offset,
 			    uint64_t, inflight_size,
 			    uint64_t, txg + i);
 
 			/*
 			 * We copy data in order of increasing offset.
 			 * Therefore the max_offset_to_sync[] must increase
 			 * (or be zero, indicating that nothing is being
 			 * copied in that txg).
 			 */
 			if (svr->svr_max_offset_to_sync[txgoff] != 0) {
 				ASSERT3U(svr->svr_max_offset_to_sync[txgoff],
 				    >=, max_offset_yet);
 				max_offset_yet =
 				    svr->svr_max_offset_to_sync[txgoff];
 			}
 
 			/*
 			 * We've already committed to copying this segment:
 			 * we have allocated space elsewhere in the pool for
 			 * it and have an IO outstanding to copy the data. We
 			 * cannot free the space before the copy has
 			 * completed, or else the copy IO might overwrite any
 			 * new data. To free that space, we record the
 			 * segment in the appropriate svr_frees tree and free
 			 * the mapped space later, in the txg where we have
 			 * completed the copy and synced the mapping (see
 			 * vdev_mapping_sync).
 			 */
 			zfs_range_tree_add(svr->svr_frees[txgoff],
 			    offset, inflight_size);
 			size -= inflight_size;
 			offset += inflight_size;
 
 			/*
 			 * This space is already accounted for as being
 			 * done, because it is being copied in txg+i.
 			 * However, if i!=0, then it is being copied in
 			 * a future txg.  If we crash after this txg
 			 * syncs but before txg+i syncs, then the space
 			 * will be free.  Therefore we must account
 			 * for the space being done in *this* txg
 			 * (when it is freed) rather than the future txg
 			 * (when it will be copied).
 			 */
 			ASSERT3U(svr->svr_bytes_done[txgoff], >=,
 			    inflight_size);
 			svr->svr_bytes_done[txgoff] -= inflight_size;
 			svr->svr_bytes_done[txg & TXG_MASK] += inflight_size;
 		}
 	}
 	ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]);
 
 	if (size > 0) {
 		/*
 		 * The copy thread has not yet visited this offset.  Ensure
 		 * that it doesn't.
 		 */
 
 		DTRACE_PROBE3(remove__free__unvisited,
 		    spa_t *, spa,
 		    uint64_t, offset,
 		    uint64_t, size);
 
 		if (svr->svr_allocd_segs != NULL)
 			zfs_range_tree_clear(svr->svr_allocd_segs, offset,
 			    size);
 
 		/*
 		 * Since we now do not need to copy this data, for
 		 * accounting purposes we have done our job and can count
 		 * it as completed.
 		 */
 		svr->svr_bytes_done[txg & TXG_MASK] += size;
 	}
 	mutex_exit(&svr->svr_lock);
 
 	/*
 	 * Now that we have dropped svr_lock, process the synced portion
 	 * of this free.
 	 */
 	if (synced_size > 0) {
 		vdev_indirect_mark_obsolete(vd, synced_offset, synced_size);
 
 		/*
 		 * Note: this can only be called from syncing context,
 		 * and the vdev_indirect_mapping is only changed from the
 		 * sync thread, so we don't need svr_lock while doing
 		 * metaslab_free_impl_cb.
 		 */
 		boolean_t checkpoint = B_FALSE;
 		vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size,
 		    metaslab_free_impl_cb, &checkpoint);
 	}
 }
 
 /*
  * Stop an active removal and update the spa_removing_phys.
  *
  * 'state' must be DSS_FINISHED or DSS_CANCELED.  On DSS_FINISHED the
  * removed vdev is linked at the head of the chain of indirect vdevs.  In
  * both cases the in-core removal state is destroyed, the persistent state
  * is written out in 'tx', and waiters are notified.
  */
 static void
 spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx)
 {
 	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
 	spa_removing_phys_t *srp = &spa->spa_removing_phys;
 
 	ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa));
 
 	/* Ensure the removal thread has completed before we free the svr. */
 	spa_vdev_remove_suspend(spa);
 
 	ASSERT(state == DSS_FINISHED || state == DSS_CANCELED);
 
 	if (state == DSS_FINISHED) {
 		vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
 		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 
 		if (srp->sr_prev_indirect_vdev != -1) {
 			vdev_t *pvd;
 			pvd = vdev_lookup_top(spa,
 			    srp->sr_prev_indirect_vdev);
 			ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops);
 		}
 
 		/* Push this vdev onto the front of the indirect chain. */
 		vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev;
 		srp->sr_prev_indirect_vdev = vd->vdev_id;
 	}
 
 	srp->sr_state = state;
 	srp->sr_end_time = gethrestime_sec();
 
 	spa->spa_vdev_removal = NULL;
 	spa_vdev_removal_destroy(svr);
 
 	spa_sync_removing_state(spa, tx);
 	spa_notify_waiters(spa);
 
 	vdev_config_dirty(spa->spa_root_vdev);
 }
 
 /*
  * Range-tree callback: marks [offset, offset + size) of the vdev passed in
  * "arg" as obsolete, then runs the indirect vdev remap op over that range
  * with metaslab_free_impl_cb as the per-piece callback (checkpoint flag
  * B_FALSE).
  */
 static void
 free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size)
 {
 	boolean_t is_checkpoint = B_FALSE;
 	vdev_t *removing_vd = arg;
 
 	vdev_indirect_mark_obsolete(removing_vd, offset, size);
 	vdev_indirect_ops.vdev_op_remap(removing_vd, offset, size,
 	    metaslab_free_impl_cb, &is_checkpoint);
 }
 
 /*
  * Sync task run on behalf of the removal thread.  It appends this txg's
  * new mapping entries to the indirect mapping on disk (updating the
  * in-memory mapping as well), records a births entry, and then processes
  * any frees that arrived while those entries were still in flight.
  * Scheduled in every txg in which the removal thread makes progress.
  */
 static void
 vdev_mapping_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_vdev_removal_t *svr = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	uint64_t txg = dmu_tx_get_txg(tx);
 	uint64_t slot = txg & TXG_MASK;
 	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 	vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config;
 
 	ASSERT(vic->vic_mapping_object != 0);
 	ASSERT3U(txg, ==, spa_syncing_txg(spa));
 
 	vdev_indirect_mapping_add_entries(vim,
 	    &svr->svr_new_segments[slot], tx);
 	vdev_indirect_births_add_entry(vd->vdev_indirect_births,
 	    vdev_indirect_mapping_max_offset(vim), txg, tx);
 
 	/*
 	 * Anything freed while the mapping entries were in flight has
 	 * been queued in svr_frees; free the copied data for it now and
 	 * reset this txg's max-offset slot.
 	 */
 	mutex_enter(&svr->svr_lock);
 	zfs_range_tree_vacate(svr->svr_frees[slot],
 	    free_mapped_segment_cb, vd);
 	ASSERT3U(svr->svr_max_offset_to_sync[slot], >=,
 	    vdev_indirect_mapping_max_offset(vim));
 	svr->svr_max_offset_to_sync[slot] = 0;
 	mutex_exit(&svr->svr_lock);
 
 	spa_sync_removing_state(spa, tx);
 }
 
 /*
  * Private argument of the "null" zio created by spa_vdev_copy_segment();
  * consumed (and freed) by spa_vdev_copy_segment_done().
  */
 typedef struct vdev_copy_segment_arg {
 	/* pool the copy belongs to; passed to zio_free() */
 	spa_t *vcsa_spa;
 	/* destination DVA, points into the new mapping entry */
 	dva_t *vcsa_dest_dva;
 	/* txg passed to zio_free() for the obsolete ranges */
 	uint64_t vcsa_txg;
 	/* ranges (relative to the copy's start) that need not be kept */
 	zfs_range_tree_t *vcsa_obsolete_segs;
 } vdev_copy_segment_arg_t;
 
 /*
  * Range-tree callback invoked for each obsolete range of a finished copy.
  * "arg" is the vdev_copy_segment_arg_t of the copy; "start" is relative
  * to the destination DVA.  Builds a throwaway block pointer describing
  * [start, start + size) of the destination and frees it with zio_free().
  */
 static void
 unalloc_seg(void *arg, uint64_t start, uint64_t size)
 {
 	vdev_copy_segment_arg_t *vcsa = arg;
 	dva_t *dest = vcsa->vcsa_dest_dva;
 	blkptr_t bp = { { { {0} } } };
 	dva_t *dva = &bp.blk_dva[0];
 
 	/* Plain, uncompressed, unchecksummed level-0 block. */
 	BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL);
 	BP_SET_LSIZE(&bp, size);
 	BP_SET_PSIZE(&bp, size);
 	BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
 	BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF);
 	BP_SET_TYPE(&bp, DMU_OT_NONE);
 	BP_SET_LEVEL(&bp, 0);
 	BP_SET_DEDUP(&bp, 0);
 	BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER);
 
 	/* Same vdev as the destination, offset shifted by "start". */
 	DVA_SET_VDEV(dva, DVA_GET_VDEV(dest));
 	DVA_SET_OFFSET(dva, DVA_GET_OFFSET(dest) + start);
 	DVA_SET_ASIZE(dva, size);
 
 	zio_free(vcsa->vcsa_spa, vcsa->vcsa_txg, &bp);
 }
 
 /*
  * Done callback of the "null" zio: every read and write issued by one
  * call to spa_vdev_copy_segment() has completed.  Frees the obsolete
  * ranges of the destination, releases the callback argument, and drops
  * the SCL_STATE hold that spa_vdev_copy_segment() took with the spa as
  * its tag.
  */
 static void
 spa_vdev_copy_segment_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	vdev_copy_segment_arg_t *vcsa = zio->io_private;
 	zfs_range_tree_t *obsolete = vcsa->vcsa_obsolete_segs;
 
 	zfs_range_tree_vacate(obsolete, unalloc_seg, vcsa);
 	zfs_range_tree_destroy(obsolete);
 	kmem_free(vcsa, sizeof (vdev_copy_segment_arg_t));
 
 	spa_config_exit(spa, SCL_STATE, spa);
 }
 
 /*
  * Done callback of a write to the new location.  Releases the data
  * buffer, subtracts the I/O from the outstanding byte count (adding it to
  * the write error count on failure), and signals vca_cv.
  */
 static void
 spa_vdev_copy_segment_write_done(zio_t *zio)
 {
 	vdev_copy_arg_t *vca = zio->io_private;
 	const uint64_t nbytes = zio->io_size;
 	const boolean_t failed = (zio->io_error != 0);
 
 	abd_free(zio->io_abd);
 
 	mutex_enter(&vca->vca_lock);
 	if (failed)
 		vca->vca_write_error_bytes += nbytes;
 	vca->vca_outstanding_bytes -= nbytes;
 	cv_signal(&vca->vca_cv);
 	mutex_exit(&vca->vca_lock);
 }
 
 /*
  * Done callback of a read from the old location.  A failed read is added
  * to the read error byte count.  In either case the unique parent zio,
  * which is the write to the new location, is then allowed to start.
  */
 static void
 spa_vdev_copy_segment_read_done(zio_t *zio)
 {
 	if (zio->io_error != 0) {
 		vdev_copy_arg_t *vca = zio->io_private;
 
 		mutex_enter(&vca->vca_lock);
 		vca->vca_read_error_bytes += zio->io_size;
 		mutex_exit(&vca->vca_lock);
 	}
 
 	zio_t *write_zio = zio_unique_parent(zio);
 	zio_nowait(write_zio);
 }
 
 /*
  * If the old and new vdevs are mirrors, we will read both sides of the old
  * mirror, and write each copy to the corresponding side of the new mirror.
  * If the old and new vdevs have a different number of children, we will do
  * this as best as possible.  Since we aren't verifying checksums, this
  * ensures that as long as there's a good copy of the data, we'll have a
  * good copy after the removal, even if there's silent damage to one side
  * of the mirror. If we're removing a mirror that has some silent damage,
  * we'll have exactly the same damage in the new location (assuming that
  * the new location is also a mirror).
  *
  * We accomplish this by creating a tree of zio_t's, with as many writes as
  * there are "children" of the new vdev (a non-redundant vdev counts as one
  * child, a 2-way mirror has 2 children, etc). Each write has an associated
  * read from a child of the old vdev. Typically there will be the same
  * number of children of the old and new vdevs.  However, if there are more
  * children of the new vdev, some child(ren) of the old vdev will be issued
  * multiple reads.  If there are more children of the old vdev, some copies
  * will be dropped.
  *
  * For example, the tree of zio_t's for a 2-way mirror is:
  *
  *                            null
  *                           /    \
  *    write(new vdev, child 0)      write(new vdev, child 1)
  *      |                             |
  *    read(old vdev, child 0)       read(old vdev, child 1)
  *
  * Child zio's complete before their parents complete.  However, zio's
  * created with zio_vdev_child_io() may be issued before their children
  * complete.  In this case we need to make sure that the children (reads)
  * complete before the parents (writes) are *issued*.  We do this by not
  * calling zio_nowait() on each write until its corresponding read has
  * completed.
  *
  * The spa_config_lock must be held while zio's created by
  * zio_vdev_child_io() are in progress, to ensure that the vdev tree does
  * not change (e.g. due to a concurrent "zpool attach/detach"). The "null"
  * zio is needed to release the spa_config_lock after all the reads and
  * writes complete. (Note that we can't grab the config lock for each read,
  * because it is not reentrant - we could deadlock with a thread waiting
  * for a write lock.)
  *
  * Arguments: "vca" accumulates the outstanding and failed byte counts;
  * "nzio" is the null zio that parents the write; "source_vd" and
  * "source_offset" locate the data on the vdev being removed;
  * "dest_child_vd" and "dest_offset" say where to write it; "dest_id" is
  * the index of dest_child_vd within a destination mirror, or -1 when the
  * destination is not a mirror; "size" is the number of bytes to copy.
  */
 static void
 spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
     vdev_t *source_vd, uint64_t source_offset,
     vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size)
 {
 	ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);
 
 	/*
 	 * If the destination child is unwritable then there is no point
 	 * in issuing the source reads which cannot be written.
 	 * Nothing has been accounted or allocated yet at this point.
 	 */
 	if (!vdev_writeable(dest_child_vd))
 		return;
 
 	/* Undone by spa_vdev_copy_segment_write_done(). */
 	mutex_enter(&vca->vca_lock);
 	vca->vca_outstanding_bytes += size;
 	mutex_exit(&vca->vca_lock);
 
 	/*
 	 * One buffer is shared by the read and the write; it is freed in
 	 * spa_vdev_copy_segment_write_done().
 	 */
 	abd_t *abd = abd_alloc_for_io(size, B_FALSE);
 
 	vdev_t *source_child_vd = NULL;
 	if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
 		/*
 		 * Source and dest are both mirrors.  Copy from the same
 		 * child id as we are copying to (wrapping around if there
 		 * are more dest children than source children).  If the
 		 * preferred source child is unreadable select another.
 		 */
 		for (int i = 0; i < source_vd->vdev_children; i++) {
 			source_child_vd = source_vd->vdev_child[
 			    (dest_id + i) % source_vd->vdev_children];
 			if (vdev_readable(source_child_vd))
 				break;
 		}
 	} else {
 		/* Not mirror-to-mirror: read through the source vdev itself. */
 		source_child_vd = source_vd;
 	}
 
 	/*
 	 * There should always be at least one readable source child or
 	 * the pool would be in a suspended state.  Somehow selecting an
 	 * unreadable child would result in IO errors, the removal process
 	 * being cancelled, and the pool reverting to its pre-removal state.
 	 */
 	ASSERT3P(source_child_vd, !=, NULL);
 
 	/* Created but not issued; the read's done callback issues it. */
 	zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
 	    dest_child_vd, dest_offset, abd, size,
 	    ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
 	    ZIO_FLAG_CANFAIL,
 	    spa_vdev_copy_segment_write_done, vca);
 
 	/* The read is a child of the write and is issued immediately. */
 	zio_nowait(zio_vdev_child_io(write_zio, NULL,
 	    source_child_vd, source_offset, abd, size,
 	    ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
 	    ZIO_FLAG_CANFAIL,
 	    spa_vdev_copy_segment_read_done, vca));
 }
 
 /*
  * Allocate a new location for this segment, and create the zio_t's to
  * read from the old location and write to the new location.
  *
  * "segs" holds the allocated ranges still to be copied from "vd".  One
  * chunk starting at its minimum offset and at most "maxalloc" bytes long
  * is allocated, queued for copying and cleared from "segs".  "txg" is the
  * txg that the allocation, the copy zios and the new mapping entry belong
  * to, "vca" is handed to spa_vdev_copy_one_child(), and "zal" is the
  * allocation trace list passed to metaslab_alloc_dva().
  *
  * Returns 0 on success, or the error from metaslab_alloc_dva(), in which
  * case nothing has been removed from "segs".
  */
 static int
 spa_vdev_copy_segment(vdev_t *vd, zfs_range_tree_t *segs,
     uint64_t maxalloc, uint64_t txg,
     vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
 {
 	metaslab_group_t *mg = vd->vdev_mg;
 	spa_t *spa = vd->vdev_spa;
 	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
 	vdev_indirect_mapping_entry_t *entry;
 	dva_t dst = {{ 0 }};
 	uint64_t start = zfs_range_tree_min(segs);
 	ASSERT0(P2PHASE(start, 1 << spa->spa_min_ashift));
 
 	ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE);
 	ASSERT0(P2PHASE(maxalloc, 1 << spa->spa_min_ashift));
 
 	/* By default copy everything from the first to the last segment. */
 	uint64_t size = zfs_range_tree_span(segs);
 	if (zfs_range_tree_span(segs) > maxalloc) {
 		/*
 		 * We can't allocate all the segments.  Prefer to end
 		 * the allocation at the end of a segment, thus avoiding
 		 * additional split blocks.
 		 */
-		range_seg_max_t search;
+		zfs_range_seg_max_t search;
 		zfs_btree_index_t where;
 		zfs_rs_set_start(&search, segs, start + maxalloc);
 		zfs_rs_set_end(&search, segs, start + maxalloc);
 		(void) zfs_btree_find(&segs->rt_root, &search, &where);
 		zfs_range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where,
 		    &where);
 		if (rs != NULL) {
 			size = zfs_rs_get_end(rs, segs) - start;
 		} else {
 			/*
 			 * There are no segments that end before maxalloc.
 			 * I.e. the first segment is larger than maxalloc,
 			 * so we must split it.
 			 */
 			size = maxalloc;
 		}
 	}
 	ASSERT3U(size, <=, maxalloc);
 	ASSERT0(P2PHASE(size, 1 << spa->spa_min_ashift));
 
 	/*
 	 * An allocation class might not have any remaining vdevs or space
 	 */
 	metaslab_class_t *mc = mg->mg_class;
 	if (mc->mc_groups == 0)
 		mc = spa_normal_class(spa);
 	int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg,
 	    METASLAB_DONT_THROTTLE, zal, 0);
 	/* Out of space in a non-normal class: retry in the normal class. */
 	if (error == ENOSPC && mc != spa_normal_class(spa)) {
 		error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
 		    &dst, 0, NULL, txg, METASLAB_DONT_THROTTLE, zal, 0);
 	}
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Determine the ranges that are not actually needed.  Offsets are
 	 * relative to the start of the range to be copied (i.e. relative to the
 	 * local variable "start").
 	 */
 	zfs_range_tree_t *obsolete_segs = zfs_range_tree_create(NULL,
 	    ZFS_RANGE_SEG64, NULL, 0, 0);
 
 	/* Each gap between consecutive segments inside the chunk is obsolete. */
 	zfs_btree_index_t where;
 	zfs_range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where);
 	ASSERT3U(zfs_rs_get_start(rs, segs), ==, start);
 	uint64_t prev_seg_end = zfs_rs_get_end(rs, segs);
 	while ((rs = zfs_btree_next(&segs->rt_root, &where, &where)) != NULL) {
 		if (zfs_rs_get_start(rs, segs) >= start + size) {
 			break;
 		} else {
 			zfs_range_tree_add(obsolete_segs,
 			    prev_seg_end - start,
 			    zfs_rs_get_start(rs, segs) - prev_seg_end);
 		}
 		prev_seg_end = zfs_rs_get_end(rs, segs);
 	}
 	/* We don't end in the middle of an obsolete range */
 	ASSERT3U(start + size, <=, prev_seg_end);
 
 	/* The chunk is now this call's responsibility; drop it from segs. */
 	zfs_range_tree_clear(segs, start, size);
 
 	/*
 	 * We can't have any padding of the allocated size, otherwise we will
 	 * misunderstand what's allocated, and the size of the mapping. We
 	 * prevent padding by ensuring that all devices in the pool have the
 	 * same ashift, and the allocation size is a multiple of the ashift.
 	 */
 	VERIFY3U(DVA_GET_ASIZE(&dst), ==, size);
 
 	/* New mapping entry: source offset "start" -> destination "dst". */
 	entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
 	DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
 	entry->vime_mapping.vimep_dst = dst;
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
 		entry->vime_obsolete_count =
 		    zfs_range_tree_space(obsolete_segs);
 	}
 
 	/* Freed by spa_vdev_copy_segment_done() along with obsolete_segs. */
 	vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP);
 	vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
 	vcsa->vcsa_obsolete_segs = obsolete_segs;
 	vcsa->vcsa_spa = spa;
 	vcsa->vcsa_txg = txg;
 
 	/*
 	 * See comment before spa_vdev_copy_one_child().
 	 * The SCL_STATE hold is released in spa_vdev_copy_segment_done().
 	 */
 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
 	zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
 	    spa_vdev_copy_segment_done, vcsa, 0);
 	vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
 	if (dest_vd->vdev_ops == &vdev_mirror_ops) {
 		/* One write per child of the destination mirror. */
 		for (int i = 0; i < dest_vd->vdev_children; i++) {
 			vdev_t *child = dest_vd->vdev_child[i];
 			spa_vdev_copy_one_child(vca, nzio, vd, start,
 			    child, DVA_GET_OFFSET(&dst), i, size);
 		}
 	} else {
 		/* dest_id of -1 tells the callee the destination is no mirror. */
 		spa_vdev_copy_one_child(vca, nzio, vd, start,
 		    dest_vd, DVA_GET_OFFSET(&dst), -1, size);
 	}
 	zio_nowait(nzio);
 
 	/* Queue the entry for vdev_mapping_sync() in this txg. */
 	list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry);
 	ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift);
 	vdev_dirty(vd, 0, NULL, txg);
 
 	return (0);
 }
 
 /*
  * Sync task that finishes the removal of a toplevel vdev.  It runs in the
  * same txg in which the new config, marking this vdev as indirect, is
  * synced to the MOS.  "arg" is the spa_vdev_removal_t, which is destroyed
  * by the call to spa_finish_removal() made here.
  */
 static void
 vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_vdev_removal_t *svr = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
 
 	/* All per-txg progress must already have been folded in. */
 	for (int t = 0; t < TXG_SIZE; t++)
 		ASSERT0(svr->svr_bytes_done[t]);
 
 	ASSERT3U(spa->spa_removing_phys.sr_copied, ==,
 	    spa->spa_removing_phys.sr_to_copy);
 
 	vdev_destroy_spacemaps(vd, tx);
 
 	/* destroy leaf zaps, if any */
 	nvlist_t *zaps = svr->svr_zaplist;
 	ASSERT3P(zaps, !=, NULL);
 	nvpair_t *pair = nvlist_next_nvpair(zaps, NULL);
 	while (pair != NULL) {
 		vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx);
 		pair = nvlist_next_nvpair(zaps, pair);
 	}
 	fnvlist_free(zaps);
 
 	spa_finish_removal(spa, DSS_FINISHED, tx);
 	/* vd->vdev_path is not available here */
 	spa_history_log_internal(spa, "vdev remove completed", tx,
 	    "%s vdev %llu", spa_name(spa), (u_longlong_t)vd->vdev_id);
 }
 
 /*
  * Recursively add the leaf ZAP object number of "vd" and of all of its
  * descendants to "zlist".  Each entry is keyed by
  * "<VDEV_REMOVAL_ZAP_OBJS>-<object number>" and has the object number as
  * its uint64 value.  vdevs without a leaf ZAP (vdev_leaf_zap == 0)
  * contribute no entry.
  */
 static void
 vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
 {
 	ASSERT3P(zlist, !=, NULL);
 	ASSERT0(vdev_get_nparity(vd));
 
 	uint64_t leaf_zap = vd->vdev_leaf_zap;
 	if (leaf_zap != 0) {
 		char key[32];
 
 		(void) snprintf(key, sizeof (key), "%s-%llu",
 		    VDEV_REMOVAL_ZAP_OBJS, (u_longlong_t)leaf_zap);
 		fnvlist_add_uint64(zlist, key, leaf_zap);
 	}
 
 	for (uint64_t c = 0; c < vd->vdev_children; c++)
 		vdev_remove_enlist_zaps(vd->vdev_child[c], zlist);
 }
 
 /*
  * Swap the evacuated top-level vdev "vd" for an indirect vdev and schedule
  * vdev_remove_complete_sync() in "txg" to finish the removal.  Along the
  * way this collects vd's leaf ZAP object numbers in svr_zaplist for that
  * sync task and announces that the removal thread is done by clearing
  * svr_thread and broadcasting on svr_cv.
  */
 static void
 vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
 {
 	vdev_t *ivd;
 	dmu_tx_t *tx;
 	spa_t *spa = vd->vdev_spa;
 	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
 
 	/*
 	 * First, build a list of leaf zaps to be destroyed.
 	 * This is passed to the sync context thread,
 	 * which does the actual unlinking.
 	 */
 	svr->svr_zaplist = fnvlist_alloc();
 	vdev_remove_enlist_zaps(vd, svr->svr_zaplist);
 
 	/* Insert the indirect vdev as vd's parent; it is not "removing". */
 	ivd = vdev_add_parent(vd, &vdev_indirect_ops);
 	ivd->vdev_removing = 0;
 
 	/* The leaf zap is now tracked only through svr_zaplist. */
 	vd->vdev_leaf_zap = 0;
 
 	/* Detach vd, leaving the indirect vdev in its place. */
 	vdev_remove_child(ivd, vd);
 	vdev_compact_children(ivd);
 
 	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
 
 	/* Wake anyone waiting in spa_vdev_remove_suspend(). */
 	mutex_enter(&svr->svr_lock);
 	svr->svr_thread = NULL;
 	cv_broadcast(&svr->svr_cv);
 	mutex_exit(&svr->svr_lock);
 
 	/* After this, we can not use svr. */
 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 	dsl_sync_task_nowait(spa->spa_dsl_pool,
 	    vdev_remove_complete_sync, svr, tx);
 	dmu_tx_commit(tx);
 }
 
 /*
  * Complete the removal of a toplevel vdev. This is called in open
  * context by the removal thread after we have copied all vdev's data.
  *
  * The work is done in two spa_vdev_enter()/spa_vdev_exit() sections: the
  * first discards the vdev's allocation state and replaces it with an
  * indirect vdev, the second relabels the detached vdev with
  * VDEV_LABEL_REMOVE and hands it to spa_vdev_exit().  A
  * ESC_ZFS_VDEV_REMOVE_DEV event, if one could be created, is posted last.
  */
 static void
 vdev_remove_complete(spa_t *spa)
 {
 	uint64_t txg;
 
 	/*
 	 * Wait for any deferred frees to be synced before we call
 	 * vdev_metaslab_fini()
 	 */
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 	txg = spa_vdev_enter(spa);
 	vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
 	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
 	ASSERT3P(vd->vdev_trim_thread, ==, NULL);
 	ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
 	vdev_rebuild_stop_wait(vd);
 	ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
 	/*
 	 * Sample the vdev's space now; the stats are expected to be zero
 	 * once the allocation state has been discarded (asserted below).
 	 */
 	uint64_t vdev_space = spa_deflate(spa) ?
 	    vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
 
 	/* Create the event while vd is still in the tree; post it at the end. */
 	sysevent_t *ev = spa_event_create(spa, vd, NULL,
 	    ESC_ZFS_VDEV_REMOVE_DEV);
 
 	zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)txg);
 
 	ASSERT3U(0, !=, vdev_space);
 	ASSERT3U(spa->spa_nonallocating_dspace, >=, vdev_space);
 
 	/* the vdev is no longer part of the dspace */
 	spa->spa_nonallocating_dspace -= vdev_space;
 
 	/*
 	 * Discard allocation state.
 	 */
 	if (vd->vdev_mg != NULL) {
 		vdev_metaslab_fini(vd);
 		metaslab_group_destroy(vd->vdev_mg);
 		vd->vdev_mg = NULL;
 	}
 	if (vd->vdev_log_mg != NULL) {
 		ASSERT0(vd->vdev_ms_count);
 		metaslab_group_destroy(vd->vdev_log_mg);
 		vd->vdev_log_mg = NULL;
 	}
 	ASSERT0(vd->vdev_stat.vs_space);
 	ASSERT0(vd->vdev_stat.vs_dspace);
 
 	/* Also schedules vdev_remove_complete_sync() in this txg. */
 	vdev_remove_replace_with_indirect(vd, txg);
 
 	/*
 	 * We now release the locks, allowing spa_sync to run and finish the
 	 * removal via vdev_remove_complete_sync in syncing context.
 	 *
 	 * Note that we hold on to the vdev_t that has been replaced.  Since
 	 * it isn't part of the vdev tree any longer, it can't be concurrently
 	 * manipulated, even while we don't have the config lock.
 	 */
 	(void) spa_vdev_exit(spa, NULL, txg, 0);
 
 	/*
 	 * Top ZAP should have been transferred to the indirect vdev in
 	 * vdev_remove_replace_with_indirect.
 	 */
 	ASSERT0(vd->vdev_top_zap);
 
 	/*
 	 * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect.
 	 */
 	ASSERT0(vd->vdev_leaf_zap);
 
 	/* Second section: relabel the detached vdev and pass it to exit. */
 	txg = spa_vdev_enter(spa);
 	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
 	/*
 	 * Request to update the config and the config cachefile.
 	 */
 	vdev_config_dirty(spa->spa_root_vdev);
 	(void) spa_vdev_exit(spa, vd, txg, 0);
 
 	/* spa_event_create() may have returned NULL; post only a real event. */
 	if (ev != NULL)
 		spa_event_post(ev);
 }
 
 /*
  * Evacuates a segment of size at most max_alloc from the vdev
  * via repeated calls to spa_vdev_copy_segment. If an allocation
  * fails, the pool is probably too fragmented to handle such a
  * large size, so decrease max_alloc so that the caller will not try
  * this size again this txg.
  *
  * "vd" is the vdev being removed and "svr" its removal state; segments
  * are taken from svr->svr_allocd_segs under svr_lock.  "vca" is passed
  * through to spa_vdev_copy_segment().  "*max_alloc" bounds the chunk and
  * is lowered when an allocation returns ENOSPC.  "tx" supplies the txg
  * that the copies and the mapping sync task belong to.
  */
 static void
 spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
     uint64_t *max_alloc, dmu_tx_t *tx)
 {
 	uint64_t txg = dmu_tx_get_txg(tx);
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 
 	mutex_enter(&svr->svr_lock);
 
 	/*
 	 * Determine how big of a chunk to copy.  We can allocate up
 	 * to max_alloc bytes, and we can span up to vdev_removal_max_span
 	 * bytes of unallocated space at a time.  "segs" will track the
 	 * allocated segments that we are copying.  We may also be copying
 	 * free segments (of up to vdev_removal_max_span bytes).
 	 */
 	zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
 	    NULL, 0, 0);
 	for (;;) {
 		zfs_range_tree_t *rt = svr->svr_allocd_segs;
 		zfs_range_seg_t *rs = zfs_range_tree_first(rt);
 
 		/* Nothing left to copy in svr_allocd_segs. */
 		if (rs == NULL)
 			break;
 
 		uint64_t seg_length;
 
 		if (zfs_range_tree_is_empty(segs)) {
 			/* need to truncate the first seg based on max_alloc */
 			seg_length = MIN(zfs_rs_get_end(rs, rt) -
 			    zfs_rs_get_start(rs, rt), *max_alloc);
 		} else {
 			if (zfs_rs_get_start(rs, rt) - zfs_range_tree_max(segs)
 			    > vdev_removal_max_span) {
 				/*
 				 * Including this segment would cause us to
 				 * copy a larger unneeded chunk than is allowed.
 				 */
 				break;
 			} else if (zfs_rs_get_end(rs, rt) -
 			    zfs_range_tree_min(segs) > *max_alloc) {
 				/*
 				 * This additional segment would extend past
 				 * max_alloc. Rather than splitting this
 				 * segment, leave it for the next mapping.
 				 */
 				break;
 			} else {
 				seg_length = zfs_rs_get_end(rs, rt) -
 				    zfs_rs_get_start(rs, rt);
 			}
 		}
 
 		/* Move the (possibly truncated) segment into "segs". */
 		zfs_range_tree_add(segs, zfs_rs_get_start(rs, rt), seg_length);
 		zfs_range_tree_remove(svr->svr_allocd_segs,
 		    zfs_rs_get_start(rs, rt), seg_length);
 	}
 
 	if (zfs_range_tree_is_empty(segs)) {
 		mutex_exit(&svr->svr_lock);
 		zfs_range_tree_destroy(segs);
 		return;
 	}
 
 	/*
 	 * A zero slot means vdev_mapping_sync() has not been scheduled for
 	 * this txg yet (it resets the slot to 0 when it runs), so schedule
 	 * it once here.
 	 */
 	if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
 		dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
 		    svr, tx);
 	}
 
 	svr->svr_max_offset_to_sync[txg & TXG_MASK] = zfs_range_tree_max(segs);
 
 	/*
 	 * Note: this is the amount of *allocated* space
 	 * that we are taking care of each txg.
 	 */
 	svr->svr_bytes_done[txg & TXG_MASK] += zfs_range_tree_space(segs);
 
 	mutex_exit(&svr->svr_lock);
 
 	/* Copy "segs" piece by piece, halving the piece size on ENOSPC. */
 	zio_alloc_list_t zal;
 	metaslab_trace_init(&zal);
 	uint64_t thismax = SPA_MAXBLOCKSIZE;
 	while (!zfs_range_tree_is_empty(segs)) {
 		int error = spa_vdev_copy_segment(vd,
 		    segs, thismax, txg, vca, &zal);
 
 		if (error == ENOSPC) {
 			/*
 			 * Cut our segment in half, and don't try this
 			 * segment size again this txg.  Note that the
 			 * allocation size must be aligned to the highest
 			 * ashift in the pool, so that the allocation will
 			 * not be padded out to a multiple of the ashift,
 			 * which could cause us to think that this mapping
 			 * is larger than we intended.
 			 */
 			ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
 			ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
 			uint64_t attempted =
 			    MIN(zfs_range_tree_span(segs), thismax);
 			thismax = P2ROUNDUP(attempted / 2,
 			    1 << spa->spa_max_ashift);
 			/*
 			 * The minimum-size allocation can not fail.
 			 */
 			ASSERT3U(attempted, >, 1 << spa->spa_max_ashift);
 			/* Tell the caller the largest size still worth trying. */
 			*max_alloc = attempted - (1 << spa->spa_max_ashift);
 		} else {
 			ASSERT0(error);
 
 			/*
 			 * We've performed an allocation, so reset the
 			 * alloc trace list.
 			 */
 			metaslab_trace_fini(&zal);
 			metaslab_trace_init(&zal);
 		}
 	}
 	metaslab_trace_fini(&zal);
 	zfs_range_tree_destroy(segs);
 }
 
 /*
  * Returns the maximum size of a single removal mapping for this pool: the
  * zfs_remove_max_segment tunable rounded up to a multiple of the pool's
  * sector size (1 << spa_max_ashift), so that individual sectors are never
  * split whatever the tunable is set to.  Device removal requires all
  * devices to share one ashift, so spa_min_ashift and spa_max_ashift are
  * interchangeable here.  Callers should use this function rather than the
  * raw tunable.
  */
 uint64_t
 spa_remove_max_segment(spa_t *spa)
 {
 	/* Kept as int so the rounding is done in the same type as before. */
 	const int sector = 1 << spa->spa_max_ashift;
 
 	return (P2ROUNDUP(zfs_remove_max_segment, sector));
 }
 
 /*
  * The removal thread operates in open context.  It iterates over all
  * allocated space in the vdev, by loading each metaslab's spacemap.
  * For each contiguous segment of allocated space (capping the segment
  * size at SPA_MAXBLOCKSIZE), we:
  *    - Allocate space for it on another vdev.
  *    - Create a new mapping from the old location to the new location
  *      (as a record in svr_new_segments).
  *    - Initiate a physical read zio to get the data off the removing disk.
  *    - In the read zio's done callback, initiate a physical write zio to
  *      write it to the new vdev.
  * Note that all of this will take effect when a particular TXG syncs.
  * The sync thread ensures that all the phys reads and writes for the syncing
  * TXG have completed (see spa_txg_zio) and writes the new mappings to disk
  * (see vdev_mapping_sync()).
  *
  * "arg" is the spa_t whose spa_vdev_removal describes the removal.  The
  * thread ends in one of two ways: if svr_thread_exit was set (by a suspend
  * request or after copy I/O errors) it clears svr_thread and, for I/O
  * errors, cancels the removal; otherwise it calls vdev_remove_complete().
  */
 static __attribute__((noreturn)) void
 spa_vdev_remove_thread(void *arg)
 {
 	spa_t *spa = arg;
 	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
 	vdev_copy_arg_t vca;
 	uint64_t max_alloc = spa_remove_max_segment(spa);
 	uint64_t last_txg = 0;
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 	uint64_t start_offset = vdev_indirect_mapping_max_offset(vim);
 
 	ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT(vd->vdev_removing);
 	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
 	/*
 	 * NOTE(review): vim has already been passed to
 	 * vdev_indirect_mapping_max_offset() above, before this check;
 	 * confirm that helper tolerates NULL or move the assertion up.
 	 */
 	ASSERT(vim != NULL);
 
 	/* Per-thread copy accounting shared with the zio done callbacks. */
 	mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
 	vca.vca_outstanding_bytes = 0;
 	vca.vca_read_error_bytes = 0;
 	vca.vca_write_error_bytes = 0;
 
 	mutex_enter(&svr->svr_lock);
 
 	/*
 	 * Start from vim_max_offset so we pick up where we left off
 	 * if we are restarting the removal after opening the pool.
 	 */
 	uint64_t msi;
 	for (msi = start_offset >> vd->vdev_ms_shift;
 	    msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) {
 		metaslab_t *msp = vd->vdev_ms[msi];
 		ASSERT3U(msi, <=, vd->vdev_ms_count);
 
 		/* The previous metaslab must have been fully consumed. */
 		ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs));
 
 		mutex_enter(&msp->ms_sync_lock);
 		mutex_enter(&msp->ms_lock);
 
 		/*
 		 * Assert nothing in flight -- ms_*tree is empty.
 		 */
 		for (int i = 0; i < TXG_SIZE; i++) {
 			ASSERT0(zfs_range_tree_space(msp->ms_allocating[i]));
 		}
 
 		/*
 		 * If the metaslab has ever been allocated from (ms_sm!=NULL),
 		 * read the allocated segments from the space map object
 		 * into svr_allocd_segs. Since we do this while holding
 		 * svr_lock and ms_sync_lock, concurrent frees (which
 		 * would have modified the space map) will wait for us
 		 * to finish loading the spacemap, and then take the
 		 * appropriate action (see free_from_removing_vdev()).
 		 */
 		if (msp->ms_sm != NULL) {
 			VERIFY0(space_map_load(msp->ms_sm,
 			    svr->svr_allocd_segs, SM_ALLOC));
 
 			/*
 			 * Fold in the metaslab's unflushed allocations and
 			 * drop its unflushed frees and ms_freeing ranges.
 			 */
 			zfs_range_tree_walk(msp->ms_unflushed_allocs,
 			    zfs_range_tree_add, svr->svr_allocd_segs);
 			zfs_range_tree_walk(msp->ms_unflushed_frees,
 			    zfs_range_tree_remove, svr->svr_allocd_segs);
 			zfs_range_tree_walk(msp->ms_freeing,
 			    zfs_range_tree_remove, svr->svr_allocd_segs);
 
 			/*
 			 * When we are resuming from a paused removal (i.e.
 			 * when importing a pool with a removal in progress),
 			 * discard any state that we have already processed.
 			 */
 			zfs_range_tree_clear(svr->svr_allocd_segs, 0,
 			    start_offset);
 		}
 		mutex_exit(&msp->ms_lock);
 		mutex_exit(&msp->ms_sync_lock);
 
 		vca.vca_msp = msp;
 		zfs_dbgmsg("copying %llu segments for metaslab %llu",
 		    (u_longlong_t)zfs_btree_numnodes(
 		    &svr->svr_allocd_segs->rt_root),
 		    (u_longlong_t)msp->ms_id);
 
 		/* Copy this metaslab's segments, one tx per iteration. */
 		while (!svr->svr_thread_exit &&
 		    !zfs_range_tree_is_empty(svr->svr_allocd_segs)) {
 
 			mutex_exit(&svr->svr_lock);
 
 			/*
 			 * We need to periodically drop the config lock so that
 			 * writers can get in.  Additionally, we can't wait
 			 * for a txg to sync while holding a config lock
 			 * (since a waiting writer could cause a 3-way deadlock
 			 * with the sync thread, which also gets a config
 			 * lock for reader).  So we can't hold the config lock
 			 * while calling dmu_tx_assign().
 			 */
 			spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 			/*
 			 * This delay will pause the removal around the point
 			 * specified by zfs_removal_suspend_progress. We do this
 			 * solely from the test suite or during debugging.
 			 */
 			while (zfs_removal_suspend_progress &&
 			    !svr->svr_thread_exit)
 				delay(hz);
 
 			/*
 			 * Throttle: wait until the bytes in flight drop to
 			 * zfs_remove_max_copy_bytes or below.  vca_cv is
 			 * signalled by the write done callback.
 			 */
 			mutex_enter(&vca.vca_lock);
 			while (vca.vca_outstanding_bytes >
 			    zfs_remove_max_copy_bytes) {
 				cv_wait(&vca.vca_cv, &vca.vca_lock);
 			}
 			mutex_exit(&vca.vca_lock);
 
 			dmu_tx_t *tx =
 			    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 
 			VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 			uint64_t txg = dmu_tx_get_txg(tx);
 
 			/*
 			 * Reacquire the vdev_config lock.  The vdev_t
 			 * that we're removing may have changed, e.g. due
 			 * to a vdev_attach or vdev_detach.
 			 */
 			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 			vd = vdev_lookup_top(spa, svr->svr_vdev_id);
 
 			/*
 			 * max_alloc may have been lowered by an ENOSPC in
 			 * an earlier txg; start each new txg from the full
 			 * limit again.
 			 */
 			if (txg != last_txg)
 				max_alloc = spa_remove_max_segment(spa);
 			last_txg = txg;
 
 			spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx);
 
 			dmu_tx_commit(tx);
 			mutex_enter(&svr->svr_lock);
 		}
 
 		/*
 		 * Unless errors are being ignored, any failed copy I/O
 		 * stops the thread after the current metaslab.
 		 */
 		mutex_enter(&vca.vca_lock);
 		if (zfs_removal_ignore_errors == 0 &&
 		    (vca.vca_read_error_bytes > 0 ||
 		    vca.vca_write_error_bytes > 0)) {
 			svr->svr_thread_exit = B_TRUE;
 		}
 		mutex_exit(&vca.vca_lock);
 	}
 
 	mutex_exit(&svr->svr_lock);
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	/*
 	 * Wait for all copies to finish before cleaning up the vca.
 	 */
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 	ASSERT0(vca.vca_outstanding_bytes);
 
 	mutex_destroy(&vca.vca_lock);
 	cv_destroy(&vca.vca_cv);
 
 	if (svr->svr_thread_exit) {
 		/*
 		 * Stopped early: drop the uncopied segments and tell
 		 * waiters (see spa_vdev_remove_suspend()) that we are gone.
 		 */
 		mutex_enter(&svr->svr_lock);
 		zfs_range_tree_vacate(svr->svr_allocd_segs, NULL, NULL);
 		svr->svr_thread = NULL;
 		cv_broadcast(&svr->svr_cv);
 		mutex_exit(&svr->svr_lock);
 
 		/*
 		 * During the removal process an unrecoverable read or write
 		 * error was encountered.  The removal process must be
 		 * cancelled or this damage may become permanent.
 		 */
 		if (zfs_removal_ignore_errors == 0 &&
 		    (vca.vca_read_error_bytes > 0 ||
 		    vca.vca_write_error_bytes > 0)) {
 			zfs_dbgmsg("canceling removal due to IO errors: "
 			    "[read_error_bytes=%llu] [write_error_bytes=%llu]",
 			    (u_longlong_t)vca.vca_read_error_bytes,
 			    (u_longlong_t)vca.vca_write_error_bytes);
 			spa_vdev_remove_cancel_impl(spa);
 		}
 	} else {
 		/* Every metaslab was copied; finish the removal. */
 		ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs));
 		vdev_remove_complete(spa);
 	}
 
 	thread_exit();
 }
 
 /*
  * Ask the removal thread, if there is one, to exit and wait until it has
  * done so (svr_thread becomes NULL).  The exit request flag is cleared
  * again before returning.  Does nothing when no removal is active.
  */
 void
 spa_vdev_remove_suspend(spa_t *spa)
 {
 	spa_vdev_removal_t *removal = spa->spa_vdev_removal;
 
 	/* No removal in progress. */
 	if (removal == NULL)
 		return;
 
 	mutex_enter(&removal->svr_lock);
 	removal->svr_thread_exit = B_TRUE;
 	/* The thread broadcasts on svr_cv once it has cleared svr_thread. */
 	while (removal->svr_thread != NULL) {
 		cv_wait(&removal->svr_cv, &removal->svr_lock);
 	}
 	removal->svr_thread_exit = B_FALSE;
 	mutex_exit(&removal->svr_lock);
 }
 
 /*
  * Return true if the "allocating" property has been set to "off", i.e.
  * the "allocating" entry in the vdev's top ZAP reads back as 0.  A vdev
  * without a top ZAP, or whose lookup does not fill in a value, counts as
  * allocating.
  */
 static boolean_t
 vdev_prop_allocating_off(vdev_t *vd)
 {
 	uint64_t allocating = 1;
 	uint64_t objid = vd->vdev_top_zap;
 
 	/* no vdev property object => no props */
 	if (objid == 0)
 		return (B_FALSE);
 
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 
 	/* A failed lookup leaves "allocating" at its default of 1. */
 	mutex_enter(&spa->spa_props_lock);
 	(void) zap_lookup(mos, objid, "allocating", sizeof (uint64_t),
 	    1, &allocating);
 	mutex_exit(&spa->spa_props_lock);
 
 	return (allocating == 0);
 }
 
 /*
  * Check function for cancelling a removal.  Returns ENOTACTIVE when the
  * pool has no removal in progress, 0 otherwise.  "arg" is unused.
  */
 static int
 spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 
 	return (spa->spa_vdev_removal != NULL ? 0 : ENOTACTIVE);
 }
 
 /*
  * Cancel a removal by freeing all entries from the partial mapping
  * and marking the vdev as no longer being removed.
  *
  * This is the sync callback that spa_vdev_remove_cancel_impl() hands to
  * dsl_sync_task(); "arg" is unused.  The removal thread must already be
  * gone (svr_thread == NULL), which is asserted below.
  */
 static void
 spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
 	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
 	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 	objset_t *mos = spa->spa_meta_objset;
 
 	ASSERT3P(svr->svr_thread, ==, NULL);
 
 	spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
 
 	/*
 	 * If the vdev is flagged as having precise obsolete counts, drop
 	 * that flag from its top-level ZAP along with the feature count.
 	 */
 	boolean_t are_precise;
 	VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
 	if (are_precise) {
 		spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
 		VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx));
 	}
 
 	/*
 	 * If an obsolete space map exists, free it, remove its ZAP entry
 	 * and close the in-core handle.
 	 */
 	uint64_t obsolete_sm_object;
 	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
 	if (obsolete_sm_object != 0) {
 		ASSERT(vd->vdev_obsolete_sm != NULL);
 		ASSERT3U(obsolete_sm_object, ==,
 		    space_map_object(vd->vdev_obsolete_sm));
 
 		space_map_free(vd->vdev_obsolete_sm, tx);
 		VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
 		space_map_close(vd->vdev_obsolete_sm);
 		vd->vdev_obsolete_sm = NULL;
 		spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
 	}
 	/*
 	 * No txg may still hold new segments, and no txg may have an
 	 * offset to sync beyond the mapping's max offset.
 	 */
 	for (int i = 0; i < TXG_SIZE; i++) {
 		ASSERT(list_is_empty(&svr->svr_new_segments[i]));
 		ASSERT3U(svr->svr_max_offset_to_sync[i], <=,
 		    vdev_indirect_mapping_max_offset(vim));
 	}
 
 	/*
 	 * Walk the metaslabs that start below the mapping's max offset
 	 * and free the mapped segments of each one.
 	 */
 	for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
 		metaslab_t *msp = vd->vdev_ms[msi];
 
 		if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
 			break;
 
 		ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs));
 
 		mutex_enter(&msp->ms_lock);
 
 		/*
 		 * Assert nothing in flight -- ms_*tree is empty.
 		 */
 		for (int i = 0; i < TXG_SIZE; i++)
 			ASSERT0(zfs_range_tree_space(msp->ms_allocating[i]));
 		for (int i = 0; i < TXG_DEFER_SIZE; i++)
 			ASSERT0(zfs_range_tree_space(msp->ms_defer[i]));
 		ASSERT0(zfs_range_tree_space(msp->ms_freed));
 
 		if (msp->ms_sm != NULL) {
 			/*
 			 * Rebuild the allocated segments of this metaslab in
 			 * svr_allocd_segs: load the space map's allocations,
 			 * add the unflushed allocations, and remove the
 			 * unflushed frees and the ranges being freed.
 			 */
 			mutex_enter(&svr->svr_lock);
 			VERIFY0(space_map_load(msp->ms_sm,
 			    svr->svr_allocd_segs, SM_ALLOC));
 
 			zfs_range_tree_walk(msp->ms_unflushed_allocs,
 			    zfs_range_tree_add, svr->svr_allocd_segs);
 			zfs_range_tree_walk(msp->ms_unflushed_frees,
 			    zfs_range_tree_remove, svr->svr_allocd_segs);
 			zfs_range_tree_walk(msp->ms_freeing,
 			    zfs_range_tree_remove, svr->svr_allocd_segs);
 
 			/*
 			 * Clear everything past what has been synced,
 			 * because we have not allocated mappings for it yet.
 			 */
 			uint64_t syncd = vdev_indirect_mapping_max_offset(vim);
 			uint64_t sm_end = msp->ms_sm->sm_start +
 			    msp->ms_sm->sm_size;
 			if (sm_end > syncd)
 				zfs_range_tree_clear(svr->svr_allocd_segs,
 				    syncd, sm_end - syncd);
 
 			mutex_exit(&svr->svr_lock);
 		}
 		mutex_exit(&msp->ms_lock);
 
 		/*
 		 * Pass every collected segment to free_mapped_segment_cb()
 		 * and leave the tree empty, as the assertion at the top of
 		 * the loop expects for the next metaslab.
 		 */
 		mutex_enter(&svr->svr_lock);
 		zfs_range_tree_vacate(svr->svr_allocd_segs,
 		    free_mapped_segment_cb, vd);
 		mutex_exit(&svr->svr_lock);
 	}
 
 	/*
 	 * Note: this must happen after we invoke free_mapped_segment_cb,
 	 * because it adds to the obsolete_segments.
 	 */
 	zfs_range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
 
 	/* Close and free the indirect mapping object. */
 	ASSERT3U(vic->vic_mapping_object, ==,
 	    vdev_indirect_mapping_object(vd->vdev_indirect_mapping));
 	vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
 	vd->vdev_indirect_mapping = NULL;
 	vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
 	vic->vic_mapping_object = 0;
 
 	/* Close and free the indirect births object. */
 	ASSERT3U(vic->vic_births_object, ==,
 	    vdev_indirect_births_object(vd->vdev_indirect_births));
 	vdev_indirect_births_close(vd->vdev_indirect_births);
 	vd->vdev_indirect_births = NULL;
 	vdev_indirect_births_free(mos, vic->vic_births_object, tx);
 	vic->vic_births_object = 0;
 
 	/*
 	 * We may have processed some frees from the removing vdev in this
 	 * txg, thus increasing svr_bytes_done; discard that here to
 	 * satisfy the assertions in spa_vdev_removal_destroy().
 	 * Note that future txg's can not have any bytes_done, because
 	 * future TXG's are only modified from open context, and we have
 	 * already shut down the copying thread.
 	 */
 	svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0;
 	spa_finish_removal(spa, DSS_CANCELED, tx);
 
 	vd->vdev_removing = B_FALSE;
 
 	/*
 	 * Re-activate the vdev unless its "allocating" property has been
 	 * set to "off".
 	 */
 	if (!vdev_prop_allocating_off(vd)) {
 		spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
 		vdev_activate(vd);
 		spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
 	}
 
 	vdev_config_dirty(vd);
 
 	zfs_dbgmsg("canceled device removal for vdev %llu in %llu",
 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx));
 	spa_history_log_internal(spa, "vdev remove canceled", tx,
 	    "%s vdev %llu %s", spa_name(spa),
 	    (u_longlong_t)vd->vdev_id,
 	    (vd->vdev_path != NULL) ? vd->vdev_path : "-");
 }
 
 /*
  * Run the cancel check/sync callbacks as a sync task on this pool and
  * hand back whatever dsl_sync_task() returns.
  */
 static int
 spa_vdev_remove_cancel_impl(spa_t *spa)
 {
 	return (dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
 	    spa_vdev_remove_cancel_sync, NULL, 0,
 	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
 /*
  * Cancel the pool's device removal.  The removal is suspended first; if
  * there is no removal state the result is ENOTACTIVE, otherwise it is
  * the result of spa_vdev_remove_cancel_impl().
  */
 int
 spa_vdev_remove_cancel(spa_t *spa)
 {
 	spa_vdev_remove_suspend(spa);
 
 	if (spa->spa_vdev_removal != NULL)
 		return (spa_vdev_remove_cancel_impl(spa));
 
 	return (ENOTACTIVE);
 }
 
 /*
  * Fold the bytes recorded for this txg in svr_bytes_done into the
  * removal's sr_copied counter and sync the removing state.
  */
 void
 svr_sync(spa_t *spa, dmu_tx_t *tx)
 {
 	spa_vdev_removal_t *removal = spa->spa_vdev_removal;
 	int slot = dmu_tx_get_txg(tx) & TXG_MASK;
 
 	/*
 	 * Nothing to do without a removal.  The zero-bytes check is
 	 * necessary so that we do not dirty the DIRECTORY_OBJECT via
 	 * spa_sync_removing_state() when there is nothing to do.
 	 * Dirtying it every time would prevent us from
 	 * syncing-to-convergence.
 	 */
 	if (removal == NULL || removal->svr_bytes_done[slot] == 0)
 		return;
 
 	/*
 	 * Update progress accounting.
 	 */
 	spa->spa_removing_phys.sr_copied += removal->svr_bytes_done[slot];
 	removal->svr_bytes_done[slot] = 0;
 
 	spa_sync_removing_state(spa, tx);
 }
 
 /*
  * Free "vd" and put a newly allocated hole vdev with the same id under
  * the root vdev in its place.  The caller must hold spa_namespace_lock
  * and all config locks as writer.
  */
 static void
 vdev_remove_make_hole_and_free(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *root = spa->spa_root_vdev;
 	uint64_t slot = vd->vdev_id;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/* "vd" must not be touched past this point. */
 	vdev_free(vd);
 
 	vdev_t *hole = vdev_alloc_common(spa, slot, 0, &vdev_hole_ops);
 	vdev_add_child(root, hole);
 	vdev_config_dirty(root);
 
 	/*
 	 * Reassess the health of our root vdev.
 	 */
 	vdev_reopen(root);
 }
 
 /*
  * Remove a log device.  The config lock is held for the specified TXG.
  *
  * The config lock is dropped and re-taken several times along the way;
  * "*txg" is updated with the value returned by each
  * spa_vdev_config_enter().  Returns 0 on success, or the error from
  * spa_reset_logs(), in which case the metaslab group is activated again
  * and the vdev is left in place.
  */
 static int
 spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
 {
 	metaslab_group_t *mg = vd->vdev_mg;
 	spa_t *spa = vd->vdev_spa;
 	int error = 0;
 
 	ASSERT(vd->vdev_islog);
 	ASSERT(vd == vd->vdev_top);
 	ASSERT3P(vd->vdev_log_mg, ==, NULL);
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	/*
 	 * Stop allocating from this vdev.
 	 */
 	metaslab_group_passivate(mg);
 
 	/*
 	 * Wait for the youngest allocations and frees to sync,
 	 * and then wait for the deferral of those frees to finish.
 	 */
 	spa_vdev_config_exit(spa, NULL,
 	    *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
 
 	/*
 	 * Cancel any initialize or TRIM which was in progress.
 	 */
 	vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
 	vdev_trim_stop_all(vd, VDEV_TRIM_CANCELED);
 	vdev_autotrim_stop_wait(vd);
 
 	/*
 	 * Evacuate the device.  We don't hold the config lock as
 	 * writer since we need to do I/O but we do keep the
 	 * spa_namespace_lock held.  Once this completes the device
 	 * should no longer have any blocks allocated on it.
 	 */
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	if (vd->vdev_stat.vs_alloc != 0)
 		error = spa_reset_logs(spa);
 
 	*txg = spa_vdev_config_enter(spa);
 
 	/* Evacuation failed: undo the passivation and report the error. */
 	if (error != 0) {
 		metaslab_group_activate(mg);
 		ASSERT3P(vd->vdev_log_mg, ==, NULL);
 		return (error);
 	}
 	ASSERT0(vd->vdev_stat.vs_alloc);
 
 	/*
 	 * The evacuation succeeded.  Remove any remaining MOS metadata
 	 * associated with this vdev, and wait for these changes to sync.
 	 */
 	vd->vdev_removing = B_TRUE;
 
 	vdev_dirty_leaves(vd, VDD_DTL, *txg);
 	vdev_config_dirty(vd);
 
 	/*
 	 * When the log space map feature is enabled we look at
 	 * the vdev's top_zap to find the on-disk flush data of
 	 * the metaslab we just flushed. Thus, while removing a
 	 * log vdev we make sure to call vdev_metaslab_fini()
 	 * first, which removes all metaslabs of this vdev from
 	 * spa_metaslabs_by_flushed before vdev_remove_empty()
 	 * destroys the top_zap of this log vdev.
 	 *
 	 * This avoids the scenario where we flush a metaslab
 	 * from the log vdev being removed that doesn't have a
 	 * top_zap and end up failing to lookup its on-disk flush
 	 * data.
 	 *
 	 * We don't call metaslab_group_destroy() right away
 	 * though (it will be called in vdev_free() later) as
 	 * during metaslab_sync() of metaslabs from other vdevs
 	 * we may touch the metaslab group of this vdev through
 	 * metaslab_class_histogram_verify()
 	 */
 	vdev_metaslab_fini(vd);
 
 	/* Drop and re-take the config lock so the changes above sync out. */
 	spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
 	*txg = spa_vdev_config_enter(spa);
 
 	/*
 	 * Create the removal event while "vd" still exists; it is only
 	 * posted at the end, after the vdev has been freed.
 	 */
 	sysevent_t *ev = spa_event_create(spa, vd, NULL,
 	    ESC_ZFS_VDEV_REMOVE_DEV);
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/* The top ZAP should have been destroyed by vdev_remove_empty. */
 	ASSERT0(vd->vdev_top_zap);
 	/* The leaf ZAP should have been destroyed by vdev_dtl_sync. */
 	ASSERT0(vd->vdev_leaf_zap);
 
 	/* The result of the label update is deliberately ignored. */
 	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
 
 	/* Take the vdev off the state/config dirty lists if it is on them. */
 	if (list_link_active(&vd->vdev_state_dirty_node))
 		vdev_state_clean(vd);
 	if (list_link_active(&vd->vdev_config_dirty_node))
 		vdev_config_clean(vd);
 
 	ASSERT0(vd->vdev_stat.vs_alloc);
 
 	/*
 	 * Clean up the vdev namespace.
 	 */
 	vdev_remove_make_hole_and_free(vd);
 
 	if (ev != NULL)
 		spa_event_post(ev);
 
 	return (0);
 }
 
 /*
  * Check whether the top-level vdev "vd" may be removed right now.
  *
  * Returns 0 if so, otherwise:
  *   ENOTSUP   vd is not a top-level vdev, is not concrete, or the
  *             device_removal feature is not enabled
  *   EALREADY  vd is already being removed
  *   ENOSPC    there is not enough available space for vd's contents
  *   EBUSY     another removal is in progress, or vd's DTL_MISSING or
  *             DTL_OUTAGE is not empty
  *   EIO       vd is not readable
  *   EINVAL    ashifts differ, a top-level vdev has parity, or a
  *             top-level mirror has a non-leaf child
  */
 static int
 spa_vdev_remove_top_check(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	if (vd != vd->vdev_top)
 		return (SET_ERROR(ENOTSUP));
 
 	if (!vdev_is_concrete(vd))
 		return (SET_ERROR(ENOTSUP));
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * This device is already being removed
 	 */
 	if (vd->vdev_removing)
 		return (SET_ERROR(EALREADY));
 
 	metaslab_class_t *mc = vd->vdev_mg->mg_class;
 	metaslab_class_t *normal = spa_normal_class(spa);
 	if (mc != normal) {
 		/*
 		 * Space allocated from the special (or dedup) class is
 		 * included in the DMU's space usage, but it's not included
 		 * in spa_dspace (or dsl_pool_adjustedsize()).  Therefore
 		 * there is always at least as much free space in the normal
 		 * class, as is allocated from the special (and dedup) class.
 		 * As a backup check, we will return ENOSPC if this is
 		 * violated. See also spa_update_dspace().
 		 */
 		uint64_t available = metaslab_class_get_space(normal) -
 		    metaslab_class_get_alloc(normal);
 		ASSERT3U(available, >=, vd->vdev_stat.vs_alloc);
 		if (available < vd->vdev_stat.vs_alloc)
 			return (SET_ERROR(ENOSPC));
 	} else if (!vd->vdev_noalloc) {
 		/* available space in the pool's normal class */
 		uint64_t available = dsl_dir_space_available(
 		    spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
 		if (available < vd->vdev_stat.vs_dspace)
 			return (SET_ERROR(ENOSPC));
 	}
 
 	/*
 	 * There can not be a removal in progress.
 	 */
 	if (spa->spa_removing_phys.sr_state == DSS_SCANNING)
 		return (SET_ERROR(EBUSY));
 
 	/*
 	 * The device must have all its data.
 	 */
 	if (!vdev_dtl_empty(vd, DTL_MISSING) ||
 	    !vdev_dtl_empty(vd, DTL_OUTAGE))
 		return (SET_ERROR(EBUSY));
 
 	/*
 	 * The device must be healthy.
 	 */
 	if (!vdev_readable(vd))
 		return (SET_ERROR(EIO));
 
 	/*
 	 * All vdevs in normal class must have the same ashift.
 	 */
 	if (spa->spa_max_ashift != spa->spa_min_ashift) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * A removed special/dedup vdev must have same ashift as normal class.
 	 */
 	ASSERT(!vd->vdev_islog);
 	if (vd->vdev_alloc_bias != VDEV_BIAS_NONE &&
 	    vd->vdev_ashift != spa->spa_max_ashift) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * All vdevs in normal class must have the same ashift
 	 * and not be raidz or draid.
 	 */
 	vdev_t *rvd = spa->spa_root_vdev;
 	for (uint64_t id = 0; id < rvd->vdev_children; id++) {
 		vdev_t *cvd = rvd->vdev_child[id];
 
 		/*
 		 * A removed special/dedup vdev must have the same ashift
 		 * across all vdevs in its class.
 		 */
 		if (vd->vdev_alloc_bias != VDEV_BIAS_NONE &&
 		    cvd->vdev_alloc_bias == vd->vdev_alloc_bias &&
 		    cvd->vdev_ashift != vd->vdev_ashift) {
 			return (SET_ERROR(EINVAL));
 		}
 		/* Debug-only cross-check of the unbiased vdevs' ashift. */
 		if (cvd->vdev_ashift != 0 &&
 		    cvd->vdev_alloc_bias == VDEV_BIAS_NONE)
 			ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift);
 		/* The remaining checks apply to concrete vdevs only. */
 		if (!vdev_is_concrete(cvd))
 			continue;
 		if (vdev_get_nparity(cvd) != 0)
 			return (SET_ERROR(EINVAL));
 		/*
 		 * Need the mirror to be mirror of leaf vdevs only
 		 */
 		if (cvd->vdev_ops == &vdev_mirror_ops) {
 			for (uint64_t cid = 0;
 			    cid < cvd->vdev_children; cid++) {
 				if (!cvd->vdev_child[cid]->vdev_ops->
 				    vdev_op_leaf)
 					return (SET_ERROR(EINVAL));
 			}
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Initiate removal of a top-level vdev, reducing the total space in the pool.
  * The config lock is held for the specified TXG.  Once initiated,
  * evacuation of all allocated space (copying it to other vdevs) happens
  * in the background (see spa_vdev_remove_thread()), and can be canceled
  * (see spa_vdev_remove_cancel()).  If successful, the vdev will
  * be transformed to an indirect vdev (see spa_vdev_remove_complete()).
  *
  * "*txg" is passed to vdev_passivate() and is overwritten with the value
  * returned by spa_vdev_config_enter() after the config lock has been
  * dropped and re-taken.  Returns 0 once the removal has been scheduled,
  * or the error from spa_vdev_remove_top_check() / vdev_passivate().
  */
 static int
 spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	/* Set when this call passivated the vdev, so a failure can undo it. */
 	boolean_t set_noalloc = B_FALSE;
 	int error;
 
 	/*
 	 * Check for errors up-front, so that we don't waste time
 	 * passivating the metaslab group and clearing the ZIL if there
 	 * are errors.
 	 */
 	error = spa_vdev_remove_top_check(vd);
 
 	/*
 	 * Stop allocating from this vdev.  Note that we must check
 	 * that this is not the only device in the pool before
 	 * passivating, otherwise we will not be able to make
 	 * progress because we can't allocate from any vdevs.
 	 * The above check for sufficient free space serves this
 	 * purpose.
 	 */
 	if (error == 0 && !vd->vdev_noalloc) {
 		set_noalloc = B_TRUE;
 		error = vdev_passivate(vd, txg);
 	}
 
 	if (error != 0)
 		return (error);
 
 	/*
 	 * We stop any initializing and TRIM that is currently in progress
 	 * but leave the state as "active". This will allow the process to
 	 * resume if the removal is canceled sometime later.
 	 */
 
 	spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
 
 	vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE);
 	vdev_trim_stop_all(vd, VDEV_TRIM_ACTIVE);
 	vdev_autotrim_stop_wait(vd);
 
 	*txg = spa_vdev_config_enter(spa);
 
 	/*
 	 * Things might have changed while the config lock was dropped
 	 * (e.g. space usage).  Check for errors again.
 	 */
 	error = spa_vdev_remove_top_check(vd);
 
 	if (error != 0) {
 		/*
 		 * Undo our own passivation (if any) and request that the
 		 * initialize/TRIM/autotrim work stopped above be restarted.
 		 */
 		if (set_noalloc)
 			vdev_activate(vd);
 		spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
 		spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
 		spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
 		return (error);
 	}
 
 	vd->vdev_removing = B_TRUE;
 
 	vdev_dirty_leaves(vd, VDD_DTL, *txg);
 	vdev_config_dirty(vd);
 	/*
 	 * Schedule vdev_remove_initiate_sync() in this txg; the vdev id
 	 * is smuggled through the void * argument.
 	 */
 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg);
 	dsl_sync_task_nowait(spa->spa_dsl_pool,
 	    vdev_remove_initiate_sync, (void *)(uintptr_t)vd->vdev_id, tx);
 	dmu_tx_commit(tx);
 
 	return (0);
 }
 
 /*
  * Remove a device from the pool.
  *
  * Removing a device from the vdev namespace requires several steps
  * and can take a significant amount of time.  As a result we use
  * the spa_vdev_config_[enter/exit] functions which allow us to
  * grab and release the spa_config_lock while still holding the namespace
  * lock.  During each step the configuration is synced out.
  *
  * "guid" identifies the device, which may be a hot spare, an l2cache
  * device, a log vdev or another top-level vdev.  "unspare" allows a hot
  * spare to be removed even though a vdev with that guid was found in the
  * pool.
  *
  * If the caller already holds spa_namespace_lock, this function does not
  * call spa_vdev_enter()/spa_vdev_exit() itself; the log and top-level
  * vdev paths assert that this is not the case.
  *
  * Returns 0 on success or an error; when this function entered the vdev
  * state itself, the value is whatever spa_vdev_exit() returns.
  */
 int
 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
 {
 	vdev_t *vd;
 	nvlist_t **spares, **l2cache, *nv;
 	uint64_t txg = 0;
 	uint_t nspares, nl2cache;
 	int error = 0, error_log;
 	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
 	sysevent_t *ev = NULL;
 	/* Only set on paths that write a "vdev remove" history record. */
 	const char *vd_type = NULL;
 	char *vd_path = NULL;
 
 	ASSERT(spa_writeable(spa));
 
 	if (!locked)
 		txg = spa_vdev_enter(spa);
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	/* Removal is refused while the pool checkpoint feature is active. */
 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 		error = (spa_has_checkpoint(spa)) ?
 		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
 
 		if (!locked)
 			return (spa_vdev_exit(spa, NULL, txg, error));
 
 		return (error);
 	}
 
 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	if (spa->spa_spares.sav_vdevs != NULL &&
 	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
 	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
 		/*
 		 * Only remove the hot spare if it's not currently in use
 		 * in this pool.
 		 */
 		if (vd == NULL || unspare) {
 			const char *type;
 			boolean_t draid_spare = B_FALSE;
 
 			if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type)
 			    == 0 && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0)
 				draid_spare = B_TRUE;
 
 			/* A dRAID spare that is not in use is not removable. */
 			if (vd == NULL && draid_spare) {
 				error = SET_ERROR(ENOTSUP);
 			} else {
 				if (vd == NULL)
 					vd = spa_lookup_by_guid(spa,
 					    guid, B_TRUE);
 				ev = spa_event_create(spa, vd, NULL,
 				    ESC_ZFS_VDEV_REMOVE_AUX);
 
 				vd_type = VDEV_TYPE_SPARE;
 				vd_path = spa_strdup(fnvlist_lookup_string(
 				    nv, ZPOOL_CONFIG_PATH));
 				spa_vdev_remove_aux(spa->spa_spares.sav_config,
 				    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
 				spa_load_spares(spa);
 				spa->spa_spares.sav_sync = B_TRUE;
 			}
 		} else {
 			error = SET_ERROR(EBUSY);
 		}
 	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
 	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
 	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
 		vd_type = VDEV_TYPE_L2CACHE;
 		vd_path = spa_strdup(fnvlist_lookup_string(
 		    nv, ZPOOL_CONFIG_PATH));
 		/*
 		 * Cache devices can always be removed.
 		 */
 		vd = spa_lookup_by_guid(spa, guid, B_TRUE);
 
 		/*
 		 * Stop trimming the cache device. We need to release the
 		 * config lock to allow the syncing of TRIM transactions
 		 * without releasing the spa_namespace_lock. The same
 		 * strategy is employed in spa_vdev_remove_top().
 		 */
 		spa_vdev_config_exit(spa, NULL,
 		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
 		mutex_enter(&vd->vdev_trim_lock);
 		vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL);
 		mutex_exit(&vd->vdev_trim_lock);
 		txg = spa_vdev_config_enter(spa);
 
 		ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX);
 		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
 		spa_load_l2cache(spa);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	} else if (vd != NULL && vd->vdev_islog) {
 		ASSERT(!locked);
 		vd_type = VDEV_TYPE_LOG;
 		/* Copy the path now: the vdev is freed on success. */
 		vd_path = spa_strdup((vd->vdev_path != NULL) ?
 		    vd->vdev_path : "-");
 		error = spa_vdev_remove_log(vd, &txg);
 	} else if (vd != NULL) {
 		ASSERT(!locked);
 		error = spa_vdev_remove_top(vd, &txg);
 	} else {
 		/*
 		 * There is no vdev of any kind with the specified guid.
 		 */
 		error = SET_ERROR(ENOENT);
 	}
 
 	/*
 	 * Remember the removal's own result: spa_vdev_exit() below may
 	 * replace "error", and the history record depends on the former.
 	 */
 	error_log = error;
 
 	if (!locked)
 		error = spa_vdev_exit(spa, NULL, txg, error);
 
 	/*
 	 * Logging must be done outside the spa config lock. Otherwise,
 	 * this code path could end up holding the spa config lock while
 	 * waiting for a txg_sync so it can write to the internal log.
 	 * Doing that would prevent the txg sync from actually happening,
 	 * causing a deadlock.
 	 */
 	if (error_log == 0 && vd_type != NULL && vd_path != NULL) {
 		spa_history_log_internal(spa, "vdev remove", NULL,
 		    "%s vdev (%s) %s", spa_name(spa), vd_type, vd_path);
 	}
 	if (vd_path != NULL)
 		spa_strfree(vd_path);
 
 	if (ev != NULL)
 		spa_event_post(ev);
 
 	return (error);
 }
 
 /*
  * Fill in "prs" from the pool's removing state.  Returns ENOENT when the
  * state is DSS_NONE (only prs_state has been written in that case) and
  * 0 otherwise.
  */
 int
 spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
 {
 	prs->prs_state = spa->spa_removing_phys.sr_state;
 
 	if (prs->prs_state == DSS_NONE)
 		return (SET_ERROR(ENOENT));
 
 	prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev;
 	prs->prs_start_time = spa->spa_removing_phys.sr_start_time;
 	prs->prs_end_time = spa->spa_removing_phys.sr_end_time;
 	prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy;
 	prs->prs_copied = spa->spa_removing_phys.sr_copied;
 
 	/*
 	 * Follow the chain of previous indirect vdevs, which ends at an
 	 * id of -1, and add up the size of each one's mapping.
 	 */
 	prs->prs_mapping_memory = 0;
 	for (uint64_t id = spa->spa_removing_phys.sr_prev_indirect_vdev;
 	    id != -1; ) {
 		vdev_t *ivd = spa->spa_root_vdev->vdev_child[id];
 
 		ASSERT3P(ivd->vdev_ops, ==, &vdev_indirect_ops);
 		prs->prs_mapping_memory +=
 		    vdev_indirect_mapping_size(ivd->vdev_indirect_mapping);
 		id = ivd->vdev_indirect_config.vic_prev_indirect_vdev;
 	}
 
 	return (0);
 }
 
 /* Tunables declared through ZFS_MODULE_PARAM. */
 ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_ignore_errors, INT, ZMOD_RW,
 	"Ignore hard IO errors when removing device");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_, remove_max_segment, UINT, ZMOD_RW,
 	"Largest contiguous segment to allocate when removing device");
 
 ZFS_MODULE_PARAM(zfs_vdev, vdev_, removal_max_span, UINT, ZMOD_RW,
 	"Largest span of free chunks a remap segment can span");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_suspend_progress, UINT, ZMOD_RW,
 	"Pause device removal after this many bytes are copied "
 	"(debug use only - causes removal to hang)");
 
 /* Functions made available through EXPORT_SYMBOL. */
 EXPORT_SYMBOL(free_from_removing_vdev);
 EXPORT_SYMBOL(spa_removal_get_stats);
 EXPORT_SYMBOL(spa_remove_init);
 EXPORT_SYMBOL(spa_restart_removal);
 EXPORT_SYMBOL(spa_vdev_removal_destroy);
 EXPORT_SYMBOL(spa_vdev_remove);
 EXPORT_SYMBOL(spa_vdev_remove_cancel);
 EXPORT_SYMBOL(spa_vdev_remove_suspend);
 EXPORT_SYMBOL(svr_sync);
diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c
index d13753f81a69..1ca0b23c0ee4 100644
--- a/module/zfs/vdev_trim.c
+++ b/module/zfs/vdev_trim.c
@@ -1,1791 +1,1791 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2016, 2024 by Delphix. All rights reserved.
  * Copyright (c) 2019 by Lawrence Livermore National Security, LLC.
  * Copyright (c) 2021 Hewlett Packard Enterprise Development LP
  * Copyright 2023 RackTop Systems, Inc.
  */
 
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/txg.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_trim.h>
 #include <sys/metaslab_impl.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zap.h>
 #include <sys/dmu_tx.h>
 #include <sys/arc_impl.h>
 
 /*
  * TRIM is a feature which is used to notify an SSD that some previously
  * written space is no longer allocated by the pool.  This is useful because
  * writes to an SSD must be performed to blocks which have first been erased.
  * Ensuring the SSD always has a supply of erased blocks for new writes
  * helps prevent the performance from deteriorating.
  *
  * There are two supported TRIM methods; manual and automatic.
  *
  * Manual TRIM:
  *
  * A manual TRIM is initiated by running the 'zpool trim' command.  A single
  * 'vdev_trim' thread is created for each leaf vdev, and it is responsible for
  * managing that vdev TRIM process.  This involves iterating over all the
  * metaslabs, calculating the unallocated space ranges, and then issuing the
  * required TRIM I/Os.
  *
  * While a metaslab is being actively trimmed it is not eligible to perform
  * new allocations.  After traversing all of the metaslabs the thread is
  * terminated.  Finally, both the requested options and current progress of
  * the TRIM are regularly written to the pool.  This allows the TRIM to be
  * suspended and resumed as needed.
  *
  * Automatic TRIM:
  *
  * An automatic TRIM is enabled by setting the 'autotrim' pool property
  * to 'on'.  When enabled, a 'vdev_autotrim' thread is created for each
  * top-level (not leaf) vdev in the pool.  These threads perform the same
  * core TRIM process as a manual TRIM, but with a few key differences.
  *
  * 1) Automatic TRIM happens continuously in the background and operates
  *    solely on recently freed blocks (ms_trim not ms_allocatable).
  *
  * 2) Each thread is associated with a top-level (not leaf) vdev.  This has
  *    the benefit of simplifying the threading model, it makes it easier
  *    to coordinate administrative commands, and it ensures only a single
  *    metaslab is disabled at a time.  Unlike manual TRIM, this means each
  *    'vdev_autotrim' thread is responsible for issuing TRIM I/Os for its
  *    children.
  *
  * 3) There is no automatic TRIM progress information stored on disk, nor
  *    is it reported by 'zpool status'.
  *
  * While the automatic TRIM process is highly effective it is more likely
  * than a manual TRIM to encounter tiny ranges.  Ranges less than or equal to
  * 'zfs_trim_extent_bytes_min' (32k) are considered too small to efficiently
  * TRIM and are skipped.  This means small amounts of freed space may not
  * be automatically trimmed.
  *
  * Furthermore, devices with attached hot spares and devices being actively
  * replaced are skipped.  This is done to avoid adding additional stress to
  * a potentially unhealthy device and to minimize the required rebuild time.
  *
  * For this reason it may be beneficial to occasionally manually TRIM a pool
  * even when automatic TRIM is enabled.
  */
 
 /*
  * Maximum size of TRIM I/O, ranges will be chunked into 128MiB lengths.
  */
 static unsigned int zfs_trim_extent_bytes_max = 128 * 1024 * 1024;
 
 /*
  * Minimum size of TRIM I/O, extents smaller than 32KiB will be skipped.
  */
 static unsigned int zfs_trim_extent_bytes_min = 32 * 1024;
 
 /*
  * Skip uninitialized metaslabs during the TRIM process.  This option is
  * useful for pools constructed from large thinly-provisioned devices where
  * TRIM operations are slow.  As a pool ages an increasing fraction of
  * the pool's metaslabs will be initialized, progressively degrading the
  * usefulness of this option.  This setting is stored when starting a
  * manual TRIM and will persist for the duration of the requested TRIM.
  */
 unsigned int zfs_trim_metaslab_skip = 0;
 
 /*
  * Maximum number of queued TRIM I/Os per leaf vdev.  The number of
  * concurrent TRIM I/Os issued to the device is controlled by the
  * zfs_vdev_trim_min_active and zfs_vdev_trim_max_active module options.
  */
 static unsigned int zfs_trim_queue_limit = 10;
 
 /*
  * The minimum number of transaction groups between automatic trims of a
  * metaslab.  This setting represents a trade-off between issuing more
  * efficient TRIM operations, by allowing them to be aggregated longer,
  * and issuing them promptly so the trimmed space is available.  Note
  * that this value is a minimum; metaslabs can be trimmed less frequently
  * when there are a large number of ranges which need to be trimmed.
  *
  * Increasing this value will allow frees to be aggregated for a longer
  * time.  This can result in larger TRIM operations, and increased memory
  * usage in order to track the ranges to be trimmed.  Decreasing this value
  * has the opposite effect.  The default value of 32 was determined through
  * testing to be a reasonable compromise.
  */
 static unsigned int zfs_trim_txg_batch = 32;
 
 /*
  * The trim_args structure is a control structure which describes how a
  * leaf vdev should be trimmed.  The core elements are the vdev, the
  * metaslab being trimmed and a range tree containing the extents to TRIM.
  * All provided ranges must be within the metaslab.
  */
 typedef struct trim_args {
 	/*
 	 * These fields are set by the caller of vdev_trim_ranges().
 	 */
 	vdev_t		*trim_vdev;		/* Leaf vdev to TRIM */
 	metaslab_t	*trim_msp;		/* Disabled metaslab */
 	zfs_range_tree_t	*trim_tree;	/* TRIM ranges (in metaslab) */
 	trim_type_t	trim_type;		/* Manual or auto TRIM */
 	uint64_t	trim_extent_bytes_max;	/* Maximum TRIM I/O size */
 	uint64_t	trim_extent_bytes_min;	/* Minimum TRIM I/O size */
 	enum trim_flag	trim_flags;		/* TRIM flags (secure) */
 
 	/*
 	 * These fields are updated by vdev_trim_ranges().
 	 */
 	hrtime_t	trim_start_time;	/* Start time */
 	uint64_t	trim_bytes_done;	/* Bytes trimmed */
 } trim_args_t;
 
 /*
  * Determines whether a vdev_trim_thread() should be stopped: an exit was
  * requested, the vdev is not writeable or has been detached, or its
  * top-level vdev is being removed or expanded.
  */
 static boolean_t
 vdev_trim_should_stop(vdev_t *vd)
 {
 	if (vd->vdev_trim_exit_wanted || !vdev_writeable(vd))
 		return (B_TRUE);
 
 	if (vd->vdev_detached)
 		return (B_TRUE);
 
 	vdev_t *tvd = vd->vdev_top;
 
 	return (tvd->vdev_removing || tvd->vdev_rz_expanding);
 }
 
 /*
  * Determines whether a vdev_autotrim_thread() should be stopped: an exit
  * was requested, the top-level vdev is not writeable, is being removed
  * or expanded, or autotrim is off for the pool.
  */
 static boolean_t
 vdev_autotrim_should_stop(vdev_t *tvd)
 {
 	if (tvd->vdev_autotrim_exit_wanted || !vdev_writeable(tvd))
 		return (B_TRUE);
 
 	if (tvd->vdev_removing || tvd->vdev_rz_expanding)
 		return (B_TRUE);
 
 	return (spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF);
 }
 
 /*
  * Wait on vdev_autotrim_kick_cv up to num_of_kick times, stopping early
  * once vdev_autotrim_exit_wanted is set.  Returns the value of
  * vdev_autotrim_exit_wanted as sampled under vdev_autotrim_lock after
  * the waiting is over.
  */
 static boolean_t
 vdev_autotrim_wait_kick(vdev_t *vd, int num_of_kick)
 {
 	int remaining = num_of_kick;
 
 	mutex_enter(&vd->vdev_autotrim_lock);
 	while (remaining > 0 && !vd->vdev_autotrim_exit_wanted) {
 		cv_wait_idle(&vd->vdev_autotrim_kick_cv,
 		    &vd->vdev_autotrim_lock);
 		remaining--;
 	}
 	boolean_t aborted = vd->vdev_autotrim_exit_wanted;
 	mutex_exit(&vd->vdev_autotrim_lock);
 
 	return (aborted);
 }
 
 /*
  * The sync task for updating the on-disk state of a manual TRIM.  This
  * is scheduled by vdev_trim_change_state().
  *
  * "arg" points to a heap-allocated uint64_t holding the vdev's guid; it
  * is freed here.  Nothing is written when the vdev can no longer be
  * found, is not concrete, or its top-level vdev is being removed or
  * expanded.
  */
 static void
 vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx)
 {
 	/*
 	 * We pass in the guid instead of the vdev_t since the vdev may
 	 * have been freed prior to the sync task being processed.  This
 	 * happens when a vdev is detached as we call spa_vdev_config_exit(),
 	 * stop the trimming thread, schedule the sync task, and free
 	 * the vdev. Later when the scheduled sync task is invoked, it would
 	 * find that the vdev has been freed.
 	 */
 	uint64_t guid = *(uint64_t *)arg;
 	uint64_t txg = dmu_tx_get_txg(tx);
 	kmem_free(arg, sizeof (uint64_t));
 
 	vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
 	if (vd == NULL || vd->vdev_top->vdev_removing ||
 	    !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding)
 		return;
 
 	/* Consume the offset recorded for this txg. */
 	uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK];
 	vd->vdev_trim_offset[txg & TXG_MASK] = 0;
 
 	VERIFY3U(vd->vdev_leaf_zap, !=, 0);
 
 	objset_t *mos = vd->vdev_spa->spa_meta_objset;
 
 	/*
 	 * A value of UINT64_MAX in the fields below is a "reset" sentinel
 	 * and is written out as 0.
 	 */
 	if (last_offset > 0 || vd->vdev_trim_last_offset == UINT64_MAX) {
 
 		if (vd->vdev_trim_last_offset == UINT64_MAX)
 			last_offset = 0;
 
 		vd->vdev_trim_last_offset = last_offset;
 		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
 		    VDEV_LEAF_ZAP_TRIM_LAST_OFFSET,
 		    sizeof (last_offset), 1, &last_offset, tx));
 	}
 
 	if (vd->vdev_trim_action_time > 0) {
 		uint64_t val = (uint64_t)vd->vdev_trim_action_time;
 		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
 		    VDEV_LEAF_ZAP_TRIM_ACTION_TIME, sizeof (val),
 		    1, &val, tx));
 	}
 
 	if (vd->vdev_trim_rate > 0) {
 		uint64_t rate = (uint64_t)vd->vdev_trim_rate;
 
 		if (rate == UINT64_MAX)
 			rate = 0;
 
 		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
 		    VDEV_LEAF_ZAP_TRIM_RATE, sizeof (rate), 1, &rate, tx));
 	}
 
 	/* Unlike the fields above, these three are written every time. */
 	uint64_t partial = vd->vdev_trim_partial;
 	if (partial == UINT64_MAX)
 		partial = 0;
 
 	VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL,
 	    sizeof (partial), 1, &partial, tx));
 
 	uint64_t secure = vd->vdev_trim_secure;
 	if (secure == UINT64_MAX)
 		secure = 0;
 
 	VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE,
 	    sizeof (secure), 1, &secure, tx));
 
 
 	uint64_t trim_state = vd->vdev_trim_state;
 	VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE,
 	    sizeof (trim_state), 1, &trim_state, tx));
 }
 
 /*
  * Update the on-disk state of a manual TRIM.  This is called to request
  * that a TRIM be started/suspended/canceled/completed.  The caller must
  * hold vdev_trim_lock.  Nothing happens when new_state equals the current
  * state.
  *
  * The rate, partial and secure options are only applied when new_state
  * is VDEV_TRIM_ACTIVE, and a zero value leaves the corresponding option
  * unchanged.
  */
 static void
 vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
     uint64_t rate, boolean_t partial, boolean_t secure)
 {
 	ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
 	spa_t *spa = vd->vdev_spa;
 
 	if (new_state == vd->vdev_trim_state)
 		return;
 
 	/*
 	 * Copy the vd's guid, this will be freed by the sync task.
 	 */
 	uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
 	*guid = vd->vdev_guid;
 
 	/*
 	 * Record the time of this state change.  When the TRIM is currently
 	 * suspended the original action time is preserved instead, so a
 	 * resumed (or canceled) TRIM keeps the time it was suspended at.
 	 */
 	if (vd->vdev_trim_state != VDEV_TRIM_SUSPENDED) {
 		vd->vdev_trim_action_time = gethrestime_sec();
 	}
 
 	/*
 	 * If we're activating, then preserve the requested rate and trim
 	 * method.  Setting the last offset, rate, partial and secure fields
 	 * to UINT64_MAX is used as a sentinel to indicate they should be
 	 * reset to default values; this is only done when starting over
 	 * after a completed or canceled TRIM.
 	 */
 	if (new_state == VDEV_TRIM_ACTIVE) {
 		if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE ||
 		    vd->vdev_trim_state == VDEV_TRIM_CANCELED) {
 			vd->vdev_trim_last_offset = UINT64_MAX;
 			vd->vdev_trim_rate = UINT64_MAX;
 			vd->vdev_trim_partial = UINT64_MAX;
 			vd->vdev_trim_secure = UINT64_MAX;
 		}
 
 		if (rate != 0)
 			vd->vdev_trim_rate = rate;
 
 		if (partial != 0)
 			vd->vdev_trim_partial = partial;
 
 		if (secure != 0)
 			vd->vdev_trim_secure = secure;
 	}
 
 	/* Remember the previous state; it selects the event reported below. */
 	vdev_trim_state_t old_state = vd->vdev_trim_state;
 	boolean_t resumed = (old_state == VDEV_TRIM_SUSPENDED);
 	vd->vdev_trim_state = new_state;
 
 	/* Queue the sync task which writes the new state to the leaf zap. */
 	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_trim_zap_update_sync,
 	    guid, tx);
 
 	/* Post an event and a pool history record for the transition. */
 	switch (new_state) {
 	case VDEV_TRIM_ACTIVE:
 		spa_event_notify(spa, vd, NULL,
 		    resumed ? ESC_ZFS_TRIM_RESUME : ESC_ZFS_TRIM_START);
 		spa_history_log_internal(spa, "trim", tx,
 		    "vdev=%s activated", vd->vdev_path);
 		break;
 	case VDEV_TRIM_SUSPENDED:
 		spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_SUSPEND);
 		spa_history_log_internal(spa, "trim", tx,
 		    "vdev=%s suspended", vd->vdev_path);
 		break;
 	case VDEV_TRIM_CANCELED:
 		/* A cancel is only reported if a TRIM was in progress. */
 		if (old_state == VDEV_TRIM_ACTIVE ||
 		    old_state == VDEV_TRIM_SUSPENDED) {
 			spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL);
 			spa_history_log_internal(spa, "trim", tx,
 			    "vdev=%s canceled", vd->vdev_path);
 		}
 		break;
 	case VDEV_TRIM_COMPLETE:
 		spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH);
 		spa_history_log_internal(spa, "trim", tx,
 		    "vdev=%s complete", vd->vdev_path);
 		break;
 	default:
 		panic("invalid state %llu", (unsigned long long)new_state);
 	}
 
 	dmu_tx_commit(tx);
 
 	/* Every transition other than activation wakes the pool's waiters. */
 	if (new_state != VDEV_TRIM_ACTIVE)
 		spa_notify_waiters(spa);
 }
 
 /*
  * The zio_done_func_t done callback for each manual TRIM issued.  It is
  * responsible for updating the TRIM stats, rolling back the recorded
  * offset so that TRIM I/Os which failed because the vdev was unavailable
  * get reissued, and limiting the number of in flight TRIM I/Os.
  *
  * It also drops the SCL_STATE_ALL config lock which vdev_trim_range()
  * took, with the vdev as the tag, before issuing the zio.
  */
 static void
 vdev_trim_cb(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 
 	mutex_enter(&vd->vdev_trim_io_lock);
 	if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
 		/*
 		 * The I/O failed because the vdev was unavailable; roll the
 		 * last offset back. (This works because spa_sync waits on
 		 * spa_txg_zio before it runs sync tasks.)
 		 */
 		uint64_t *offset =
 		    &vd->vdev_trim_offset[zio->io_txg & TXG_MASK];
 		*offset = MIN(*offset, zio->io_offset);
 	} else {
 		/*
 		 * Any other outcome counts towards the bytes done, whether
 		 * or not the TRIM succeeded.  NOTE(review): the trailing
 		 * spa_iostats_trim_add() arguments look like (count, bytes)
 		 * pairs for succeeded, skipped and failed TRIMs -- confirm
 		 * against its definition.
 		 */
 		if (zio->io_error != 0) {
 			vd->vdev_stat.vs_trim_errors++;
 			spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL,
 			    0, 0, 0, 0, 1, zio->io_orig_size);
 		} else {
 			spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL,
 			    1, zio->io_orig_size, 0, 0, 0, 0);
 		}
 
 		vd->vdev_trim_bytes_done += zio->io_orig_size;
 	}
 
 	/* Release the in flight slot and wake threads waiting on it. */
 	ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_MANUAL], >, 0);
 	vd->vdev_trim_inflight[TRIM_TYPE_MANUAL]--;
 	cv_broadcast(&vd->vdev_trim_io_cv);
 	mutex_exit(&vd->vdev_trim_io_lock);
 
 	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
 }
 
 /*
  * Completion callback (zio_done_func_t) for every automatic TRIM zio.
  * It accounts the result in the TRIM statistics, gives back the in flight
  * slot, and drops the SCL_STATE_ALL hold tagged with the vdev.  A failed
  * automatic TRIM is only counted; it is best effort and never retried.
  */
 static void
 vdev_autotrim_cb(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 
 	mutex_enter(&vd->vdev_trim_io_lock);
 
 	if (zio->io_error == 0) {
 		/* Success: one TRIM of io_orig_size bytes completed. */
 		spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO,
 		    1, zio->io_orig_size, 0, 0, 0, 0);
 	} else {
 		/* Failure: bump the vdev error counter and the stats. */
 		vd->vdev_stat.vs_trim_errors += 1;
 		spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO,
 		    0, 0, 0, 0, 1, zio->io_orig_size);
 	}
 
 	/* Free the in flight slot and wake anyone throttled on it. */
 	ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_AUTO], >, 0);
 	vd->vdev_trim_inflight[TRIM_TYPE_AUTO] -= 1;
 	cv_broadcast(&vd->vdev_trim_io_cv);
 
 	mutex_exit(&vd->vdev_trim_io_lock);
 
 	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
 }
 
 /*
  * Completion callback (zio_done_func_t) for every TRIM zio issued through
  * vdev_trim_simple().  It accounts the result in the TRIM statistics,
  * gives back the in flight slot, and drops the SCL_STATE_ALL hold tagged
  * with the vdev.  A failed simple TRIM is only counted; it is best effort
  * and never retried.
  */
 static void
 vdev_trim_simple_cb(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 
 	mutex_enter(&vd->vdev_trim_io_lock);
 
 	if (zio->io_error == 0) {
 		/* Success: one TRIM of io_orig_size bytes completed. */
 		spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE,
 		    1, zio->io_orig_size, 0, 0, 0, 0);
 	} else {
 		/* Failure: bump the vdev error counter and the stats. */
 		vd->vdev_stat.vs_trim_errors += 1;
 		spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE,
 		    0, 0, 0, 0, 1, zio->io_orig_size);
 	}
 
 	/* Free the in flight slot and wake anyone throttled on it. */
 	ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE], >, 0);
 	vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE] -= 1;
 	cv_broadcast(&vd->vdev_trim_io_cv);
 
 	mutex_exit(&vd->vdev_trim_io_lock);
 
 	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
 }
 /*
  * Returns the mean TRIM throughput, in bytes per second, achieved since
  * ta->trim_start_time was recorded: the bytes accounted so far divided by
  * the elapsed time.
  */
 static uint64_t
 vdev_trim_calculate_rate(trim_args_t *ta)
 {
 	/* Elapsed milliseconds; the extra one keeps the divisor non-zero. */
 	uint64_t elapsed_ms =
 	    NSEC2MSEC(gethrtime() - ta->trim_start_time) + 1;
 	/* Scale by 1000 first so the quotient is per second, not per ms. */
 	uint64_t scaled_bytes = ta->trim_bytes_done * 1000;
 
 	return (scaled_bytes / elapsed_ms);
 }
 
 /*
  * Issues a physical TRIM and takes care of rate limiting (bytes/sec)
  * and number of concurrent TRIM I/Os.
  *
  * ta describes the vdev, the trim type and the zio trim flags; start and
  * size give the physical extent to TRIM.  Returns 0 once the zio has been
  * issued (completion is asynchronous), or EINTR when the manual or
  * automatic TRIM has been asked to stop, in which case nothing is issued.
  */
 static int
 vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size)
 {
 	vdev_t *vd = ta->trim_vdev;
 	spa_t *spa = vd->vdev_spa;
 	void *cb;
 
 	mutex_enter(&vd->vdev_trim_io_lock);
 
 	/*
 	 * Limit manual TRIM I/Os to the requested rate.  This does not
 	 * apply to automatic TRIM since no per vdev rate can be specified.
 	 * The wait is re-evaluated every 10ms and is abandoned as soon as
 	 * the TRIM is asked to stop.
 	 */
 	if (ta->trim_type == TRIM_TYPE_MANUAL) {
 		while (vd->vdev_trim_rate != 0 && !vdev_trim_should_stop(vd) &&
 		    vdev_trim_calculate_rate(ta) > vd->vdev_trim_rate) {
 			cv_timedwait_idle(&vd->vdev_trim_io_cv,
 			    &vd->vdev_trim_io_lock, ddi_get_lbolt() +
 			    MSEC_TO_TICK(10));
 		}
 	}
 	/* Accounted up front so it feeds the next rate calculation. */
 	ta->trim_bytes_done += size;
 
 	/*
 	 * Limit in flight trimming I/Os.  The limit applies to the sum of
 	 * all three vdev_trim_inflight[] counters.
 	 */
 	while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] +
 	    vd->vdev_trim_inflight[2] >= zfs_trim_queue_limit) {
 		cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
 	}
 	vd->vdev_trim_inflight[ta->trim_type]++;
 	mutex_exit(&vd->vdev_trim_io_lock);
 
 	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 	uint64_t txg = dmu_tx_get_txg(tx);
 
 	/*
 	 * SCL_STATE_ALL is taken with the vdev as the tag and is released
 	 * either on the early exit below or by the zio done callback.
 	 */
 	spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
 	mutex_enter(&vd->vdev_trim_lock);
 
 	if (ta->trim_type == TRIM_TYPE_MANUAL &&
 	    vd->vdev_trim_offset[txg & TXG_MASK] == 0) {
 		uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
 		*guid = vd->vdev_guid;
 
 		/* This is the first write of this txg. */
 		dsl_sync_task_nowait(spa_get_dsl(spa),
 		    vdev_trim_zap_update_sync, guid, tx);
 	}
 
 	/*
 	 * We know the vdev_t will still be around since all consumers of
 	 * vdev_free must stop the trimming first.
 	 *
 	 * If a stop has been requested, undo the in flight accounting and
 	 * release everything acquired above instead of issuing the zio.
 	 */
 	if ((ta->trim_type == TRIM_TYPE_MANUAL &&
 	    vdev_trim_should_stop(vd)) ||
 	    (ta->trim_type == TRIM_TYPE_AUTO &&
 	    vdev_autotrim_should_stop(vd->vdev_top))) {
 		mutex_enter(&vd->vdev_trim_io_lock);
 		vd->vdev_trim_inflight[ta->trim_type]--;
 		mutex_exit(&vd->vdev_trim_io_lock);
 		spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
 		mutex_exit(&vd->vdev_trim_lock);
 		dmu_tx_commit(tx);
 		return (SET_ERROR(EINTR));
 	}
 	mutex_exit(&vd->vdev_trim_lock);
 
 	/* Record how far this txg's manual TRIM will have progressed. */
 	if (ta->trim_type == TRIM_TYPE_MANUAL)
 		vd->vdev_trim_offset[txg & TXG_MASK] = start + size;
 
 	/* Select the done callback matching the trim type. */
 	if (ta->trim_type == TRIM_TYPE_MANUAL) {
 		cb = vdev_trim_cb;
 	} else if (ta->trim_type == TRIM_TYPE_AUTO) {
 		cb = vdev_autotrim_cb;
 	} else {
 		cb = vdev_trim_simple_cb;
 	}
 
 	zio_nowait(zio_trim(spa->spa_txg_zio[txg & TXG_MASK], vd,
 	    start, size, cb, NULL, ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL,
 	    ta->trim_flags));
 	/*
 	 * vdev_trim_cb, vdev_autotrim_cb and vdev_trim_simple_cb release
 	 * SCL_STATE_ALL
 	 */
 
 	dmu_tx_commit(tx);
 
 	return (0);
 }
 
 /*
  * Issues TRIM I/Os for all ranges in the provided ta->trim_tree range tree.
  * Additional parameters describing how the TRIM should be performed must
  * be set in the trim_args structure.  See the trim_args definition for
  * additional information.
  *
  * The start time and byte counter in ta are reset on entry.  Returns 0,
  * or the first error returned by vdev_trim_range(), after which no
  * further ranges are issued.
  */
 static int
 vdev_trim_ranges(trim_args_t *ta)
 {
 	vdev_t *vd = ta->trim_vdev;
 	zfs_btree_t *t = &ta->trim_tree->rt_root;
 	zfs_btree_index_t idx;
 	uint64_t extent_bytes_max = ta->trim_extent_bytes_max;
 	uint64_t extent_bytes_min = ta->trim_extent_bytes_min;
 	spa_t *spa = vd->vdev_spa;
 	int error = 0;
 
 	ta->trim_start_time = gethrtime();
 	ta->trim_bytes_done = 0;
 
 	for (zfs_range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL;
 	    rs = zfs_btree_next(t, &idx, &idx)) {
 		uint64_t size = zfs_rs_get_end(rs, ta->trim_tree) -
 		    zfs_rs_get_start(rs, ta->trim_tree);
 
 		/*
 		 * Ranges below the minimum extent size are not trimmed;
 		 * they are only recorded in the TRIM statistics.  A
 		 * minimum of zero disables this filter.
 		 */
 		if (extent_bytes_min && size < extent_bytes_min) {
 			spa_iostats_trim_add(spa, ta->trim_type,
 			    0, 0, 1, size, 0, 0);
 			continue;
 		}
 
 		/* Split range into legally-sized physical chunks */
 		uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1;
 
 		/*
 		 * Each chunk is at most extent_bytes_max long; the final one
 		 * covers whatever remains.  Offsets are shifted by
 		 * VDEV_LABEL_START_SIZE before being handed down.
 		 */
 		for (uint64_t w = 0; w < writes_required; w++) {
 			error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE +
 			    zfs_rs_get_start(rs, ta->trim_tree) +
 			    (w *extent_bytes_max), MIN(size -
 			    (w * extent_bytes_max), extent_bytes_max));
 			if (error != 0) {
 				goto done;
 			}
 		}
 	}
 
 done:
 	/*
 	 * Make sure all TRIMs for this metaslab have completed before
 	 * returning. TRIM zios have lower priority over regular or syncing
 	 * zios, so all TRIM zios for this metaslab must complete before the
 	 * metaslab is re-enabled. Otherwise it's possible write zios to
 	 * this metaslab could cut ahead of still queued TRIM zios for this
 	 * metaslab causing corruption if the ranges overlap.
 	 *
 	 * NOTE(review): only vdev_trim_inflight[0] is waited on here,
 	 * whatever ta->trim_type is -- presumably index 0 is
 	 * TRIM_TYPE_MANUAL and other trim types are drained by their
 	 * callers; confirm.
 	 */
 	mutex_enter(&vd->vdev_trim_io_lock);
 	while (vd->vdev_trim_inflight[0] > 0) {
 		cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
 	}
 	mutex_exit(&vd->vdev_trim_io_lock);
 
 	return (error);
 }
 
 /*
  * vdev_xlate_walk() callback used by vdev_trim_calculate_progress().
  * arg points to a uint64_t which is raised to the end offset of
  * physical_rs whenever that end lies beyond the value stored so far, so
  * after the walk it holds the largest physical end offset seen.
  */
 static void
-vdev_trim_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
+vdev_trim_xlate_last_rs_end(void *arg, zfs_range_seg64_t *physical_rs)
 {
 	uint64_t *last_rs_end = (uint64_t *)arg;
 
 	if (physical_rs->rs_end > *last_rs_end)
 		*last_rs_end = physical_rs->rs_end;
 }
 
 /*
  * vdev_xlate_walk() callback used by vdev_trim_calculate_progress().
  * arg is the vdev_t whose progress counters are being rebuilt.  The
  * size of physical_rs is added to the estimate, and the part of it that
  * lies below vdev_trim_last_offset is added to the bytes done.
  */
 static void
-vdev_trim_xlate_progress(void *arg, range_seg64_t *physical_rs)
+vdev_trim_xlate_progress(void *arg, zfs_range_seg64_t *physical_rs)
 {
 	vdev_t *vd = (vdev_t *)arg;
 
 	uint64_t size = physical_rs->rs_end - physical_rs->rs_start;
 	vd->vdev_trim_bytes_est += size;
 
 	if (vd->vdev_trim_last_offset >= physical_rs->rs_end) {
 		/* The whole segment lies below the last trimmed offset. */
 		vd->vdev_trim_bytes_done += size;
 	} else if (vd->vdev_trim_last_offset > physical_rs->rs_start &&
 	    vd->vdev_trim_last_offset <= physical_rs->rs_end) {
 		/* The last offset falls inside it: count the leading part. */
 		vd->vdev_trim_bytes_done +=
 		    vd->vdev_trim_last_offset - physical_rs->rs_start;
 	}
 }
 
 /*
  * Calculates the progress of a manual TRIM.  The vdev's
  * vdev_trim_bytes_est and vdev_trim_bytes_done counters are rebuilt from
  * scratch by comparing vdev_trim_last_offset against every metaslab of
  * the top-level vdev.  The caller must hold SCL_CONFIG as reader or
  * writer.
  */
 static void
 vdev_trim_calculate_progress(vdev_t *vd)
 {
 	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
 	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
 	ASSERT(vd->vdev_leaf_zap != 0);
 
 	vd->vdev_trim_bytes_est = 0;
 	vd->vdev_trim_bytes_done = 0;
 
 	for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
 		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
 		mutex_enter(&msp->ms_lock);
 
 		/*
 		 * The unallocated space of the metaslab, divided by the
 		 * value vdev_get_ndisks() reports for the top-level vdev.
 		 */
 		uint64_t ms_free = (msp->ms_size -
 		    metaslab_allocated_space(msp)) /
 		    vdev_get_ndisks(vd->vdev_top);
 
 		/*
 		 * Convert the metaslab range to a physical range
 		 * on our vdev. We use this to determine if we are
 		 * in the middle of this metaslab range.
 		 */
-		range_seg64_t logical_rs, physical_rs, remain_rs;
+		zfs_range_seg64_t logical_rs, physical_rs, remain_rs;
 		logical_rs.rs_start = msp->ms_start;
 		logical_rs.rs_end = msp->ms_start + msp->ms_size;
 
 		/* Metaslab space after this offset has not been trimmed. */
 		vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
 		if (vd->vdev_trim_last_offset <= physical_rs.rs_start) {
 			vd->vdev_trim_bytes_est += ms_free;
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		/*
 		 * Metaslab space before this offset has been trimmed.  When
 		 * the translation left a remainder, walk it to find the
 		 * largest physical end offset of the metaslab.
 		 */
 		uint64_t last_rs_end = physical_rs.rs_end;
 		if (!vdev_xlate_is_empty(&remain_rs)) {
 			vdev_xlate_walk(vd, &remain_rs,
 			    vdev_trim_xlate_last_rs_end, &last_rs_end);
 		}
 
 		if (vd->vdev_trim_last_offset > last_rs_end) {
 			vd->vdev_trim_bytes_done += ms_free;
 			vd->vdev_trim_bytes_est += ms_free;
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		/*
 		 * If we get here, we're in the middle of trimming this
 		 * metaslab.  Load it and walk the free tree for more
 		 * accurate progress estimation.
 		 */
 		VERIFY0(metaslab_load(msp));
 
 		zfs_range_tree_t *rt = msp->ms_allocatable;
 		zfs_btree_t *bt = &rt->rt_root;
 		zfs_btree_index_t idx;
 		for (zfs_range_seg_t *rs = zfs_btree_first(bt, &idx);
 		    rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) {
 			logical_rs.rs_start = zfs_rs_get_start(rs, rt);
 			logical_rs.rs_end = zfs_rs_get_end(rs, rt);
 
 			vdev_xlate_walk(vd, &logical_rs,
 			    vdev_trim_xlate_progress, vd);
 		}
 		mutex_exit(&msp->ms_lock);
 	}
 }
 
 /*
  * Reads the vdev's manual TRIM bookkeeping back from its leaf zap.  When
  * the in-memory state is active or suspended the last offset, rate,
  * partial and secure values are looked up in that order; an entry that
  * does not exist reads as zero, and any other lookup failure stops the
  * remaining lookups.  The progress counters are recalculated in every
  * case.  Returns 0 or the error of the failed lookup.
  */
 static int
 vdev_trim_load(vdev_t *vd)
 {
 	int err = 0;
 	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
 	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
 	ASSERT(vd->vdev_leaf_zap != 0);
 
 	/* The options are only loaded for a TRIM that is in progress. */
 	if (vd->vdev_trim_state != VDEV_TRIM_ACTIVE &&
 	    vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)
 		goto out;
 
 	err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 	    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_LAST_OFFSET,
 	    sizeof (vd->vdev_trim_last_offset), 1,
 	    &vd->vdev_trim_last_offset);
 	if (err == ENOENT) {
 		vd->vdev_trim_last_offset = 0;
 		err = 0;
 	}
 	if (err != 0)
 		goto out;
 
 	err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 	    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_RATE,
 	    sizeof (vd->vdev_trim_rate), 1,
 	    &vd->vdev_trim_rate);
 	if (err == ENOENT) {
 		vd->vdev_trim_rate = 0;
 		err = 0;
 	}
 	if (err != 0)
 		goto out;
 
 	err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 	    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL,
 	    sizeof (vd->vdev_trim_partial), 1,
 	    &vd->vdev_trim_partial);
 	if (err == ENOENT) {
 		vd->vdev_trim_partial = 0;
 		err = 0;
 	}
 	if (err != 0)
 		goto out;
 
 	err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 	    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE,
 	    sizeof (vd->vdev_trim_secure), 1,
 	    &vd->vdev_trim_secure);
 	if (err == ENOENT) {
 		vd->vdev_trim_secure = 0;
 		err = 0;
 	}
 
 out:
 	vdev_trim_calculate_progress(vd);
 
 	return (err);
 }
 
 /*
  * vdev_xlate_walk() callback used by vdev_trim_range_add().  arg is the
  * trim_args_t of the TRIM in progress.  The physical segment is added to
  * ta->trim_tree; for a manual TRIM the part of it which lies below
  * vdev_trim_last_offset is left out.  physical_rs may be modified.
  */
 static void
-vdev_trim_xlate_range_add(void *arg, range_seg64_t *physical_rs)
+vdev_trim_xlate_range_add(void *arg, zfs_range_seg64_t *physical_rs)
 {
 	trim_args_t *ta = arg;
 	vdev_t *vd = ta->trim_vdev;
 
 	/*
 	 * Only a manual trim will be traversing the vdev sequentially.
 	 * For an auto trim all valid ranges should be added.
 	 */
 	if (ta->trim_type == TRIM_TYPE_MANUAL) {
 
 		/* Only add segments that we have not visited yet */
 		if (physical_rs->rs_end <= vd->vdev_trim_last_offset)
 			return;
 
 		/* Pick up where we left off mid-range. */
 		if (vd->vdev_trim_last_offset > physical_rs->rs_start) {
 			ASSERT3U(physical_rs->rs_end, >,
 			    vd->vdev_trim_last_offset);
 			physical_rs->rs_start = vd->vdev_trim_last_offset;
 		}
 	}
 
 	ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);
 
 	/* The range tree takes a start offset and a length. */
 	zfs_range_tree_add(ta->trim_tree, physical_rs->rs_start,
 	    physical_rs->rs_end - physical_rs->rs_start);
 }
 
 /*
  * Convert the logical range into physical ranges and add them to the
  * range tree passed in the trim_args_t.
  *
  * The signature (arg, start, size) is the one zfs_range_tree_walk()
  * calls back with; arg is the trim_args_t and start/size describe the
  * logical range.  ta->trim_vdev must be a leaf vdev.
  */
 static void
 vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
 {
 	trim_args_t *ta = arg;
 	vdev_t *vd = ta->trim_vdev;
-	range_seg64_t logical_rs;
+	zfs_range_seg64_t logical_rs;
 	logical_rs.rs_start = start;
 	logical_rs.rs_end = start + size;
 
 	/*
 	 * Every range to be trimmed must be part of ms_allocatable.
 	 * When ZFS_DEBUG_TRIM is set load the metaslab to verify this
 	 * is always the case.
 	 */
 	if (zfs_flags & ZFS_DEBUG_TRIM) {
 		metaslab_t *msp = ta->trim_msp;
 		VERIFY0(metaslab_load(msp));
 		VERIFY3B(msp->ms_loaded, ==, B_TRUE);
 		VERIFY(zfs_range_tree_contains(msp->ms_allocatable, start,
 		    size));
 	}
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 	/* Each resulting physical segment lands in ta->trim_tree. */
 	vdev_xlate_walk(vd, &logical_rs, vdev_trim_xlate_range_add, arg);
 }
 
 /*
  * Each manual TRIM thread is responsible for trimming the unallocated
  * space for each leaf vdev.  This is accomplished by sequentially iterating
  * over its top-level metaslabs and issuing TRIM I/O for the space described
  * by its ms_allocatable.  While a metaslab is undergoing trimming it is
  * not eligible for new allocations.
  *
  * arg is the leaf vdev_t to trim.  The thread never returns: once done it
  * clears vd->vdev_trim_thread, wakes any waiter on vdev_trim_cv and exits.
  */
 static __attribute__((noreturn)) void
 vdev_trim_thread(void *arg)
 {
 	vdev_t *vd = arg;
 	spa_t *spa = vd->vdev_spa;
 	trim_args_t ta;
 	int error = 0;
 
 	/*
 	 * The VDEV_LEAF_ZAP_TRIM_* entries may have been updated by
 	 * vdev_trim().  Wait for the updated values to be reflected
 	 * in the zap in order to start with the requested settings.
 	 */
 	txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
 
 	ASSERT(vdev_is_concrete(vd));
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	/* Start from defaults, then pick up whatever is stored on disk. */
 	vd->vdev_trim_last_offset = 0;
 	vd->vdev_trim_rate = 0;
 	vd->vdev_trim_partial = 0;
 	vd->vdev_trim_secure = 0;
 
 	VERIFY0(vdev_trim_load(vd));
 
 	ta.trim_vdev = vd;
 	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
 	ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min;
 	ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
 	ta.trim_type = TRIM_TYPE_MANUAL;
 	ta.trim_flags = 0;
 
 	/*
 	 * When a secure TRIM has been requested infer that the intent
 	 * is that everything must be trimmed.  Override the default
 	 * minimum TRIM size to prevent ranges from being skipped.
 	 */
 	if (vd->vdev_trim_secure) {
 		ta.trim_flags |= ZIO_TRIM_SECURE;
 		ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
 	}
 
 	/*
 	 * SCL_CONFIG is held as reader at the top of every iteration and
 	 * is dropped while a metaslab is being trimmed.
 	 */
 	uint64_t ms_count = 0;
 	for (uint64_t i = 0; !vd->vdev_detached &&
 	    i < vd->vdev_top->vdev_ms_count; i++) {
 		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
 
 		/*
 		 * If we've expanded the top-level vdev or it's our
 		 * first pass, calculate our progress.
 		 */
 		if (vd->vdev_top->vdev_ms_count != ms_count) {
 			vdev_trim_calculate_progress(vd);
 			ms_count = vd->vdev_top->vdev_ms_count;
 		}
 
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		metaslab_disable(msp);
 		mutex_enter(&msp->ms_lock);
 		VERIFY0(metaslab_load(msp));
 
 		/*
 		 * If a partial TRIM was requested skip metaslabs which have
 		 * never been initialized and thus have never been written.
 		 */
 		if (msp->ms_sm == NULL && vd->vdev_trim_partial) {
 			mutex_exit(&msp->ms_lock);
 			metaslab_enable(msp, B_FALSE, B_FALSE);
 			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 			vdev_trim_calculate_progress(vd);
 			continue;
 		}
 
 		/*
 		 * Collect the physical ranges to TRIM from ms_allocatable.
 		 * ms_trim is emptied as well.
 		 */
 		ta.trim_msp = msp;
 		zfs_range_tree_walk(msp->ms_allocatable, vdev_trim_range_add,
 		    &ta);
 		zfs_range_tree_vacate(msp->ms_trim, NULL, NULL);
 		mutex_exit(&msp->ms_lock);
 
 		error = vdev_trim_ranges(&ta);
 		metaslab_enable(msp, B_TRUE, B_FALSE);
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 		/* Empty the tree so it can be reused for the next metaslab. */
 		zfs_range_tree_vacate(ta.trim_tree, NULL, NULL);
 		if (error != 0)
 			break;
 	}
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	zfs_range_tree_destroy(ta.trim_tree);
 
 	/*
 	 * Unless an exit was requested, record the outcome: complete when
 	 * the vdev is still writeable, canceled when it has faulted.  In
 	 * any other case the state is left unchanged.
 	 */
 	mutex_enter(&vd->vdev_trim_lock);
 	if (!vd->vdev_trim_exit_wanted) {
 		if (vdev_writeable(vd)) {
 			vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE,
 			    vd->vdev_trim_rate, vd->vdev_trim_partial,
 			    vd->vdev_trim_secure);
 		} else if (vd->vdev_faulted) {
 			vdev_trim_change_state(vd, VDEV_TRIM_CANCELED,
 			    vd->vdev_trim_rate, vd->vdev_trim_partial,
 			    vd->vdev_trim_secure);
 		}
 	}
 	ASSERT(vd->vdev_trim_thread != NULL || vd->vdev_trim_inflight[0] == 0);
 
 	/*
 	 * Drop the vdev_trim_lock while we sync out the txg since it's
 	 * possible that a device might be trying to come online and must
 	 * check to see if it needs to restart a trim. That thread will be
 	 * holding the spa_config_lock which would prevent the txg_wait_synced
 	 * from completing.
 	 */
 	mutex_exit(&vd->vdev_trim_lock);
 	txg_wait_synced(spa_get_dsl(spa), 0);
 	mutex_enter(&vd->vdev_trim_lock);
 
 	/* Announce that the thread is gone to vdev_trim_stop_wait_impl(). */
 	vd->vdev_trim_thread = NULL;
 	cv_broadcast(&vd->vdev_trim_cv);
 	mutex_exit(&vd->vdev_trim_lock);
 
 	thread_exit();
 }
 
 /*
  * Initiates a manual TRIM for the vdev_t.  Callers must hold vdev_trim_lock,
  * the vdev_t must be a leaf and cannot already be manually trimming.
  *
  * The state is changed to VDEV_TRIM_ACTIVE with the given rate, partial
  * and secure options, and a vdev_trim_thread() is created for the vdev.
  */
 void
 vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure)
 {
 	ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT3P(vd->vdev_trim_thread, ==, NULL);
 	ASSERT(!vd->vdev_detached);
 	ASSERT(!vd->vdev_trim_exit_wanted);
 	ASSERT(!vd->vdev_top->vdev_removing);
 	/*
 	 * NOTE(review): this checks the flag on the leaf itself, whereas
 	 * vdev_trim_restart() and vdev_trim_zap_update_sync() test
 	 * vd->vdev_top->vdev_rz_expanding -- confirm which is intended.
 	 */
 	ASSERT(!vd->vdev_rz_expanding);
 
 	vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure);
 	vd->vdev_trim_thread = thread_create(NULL, 0,
 	    vdev_trim_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
 }
 
 /*
  * Blocks, with vdev_trim_lock held, until the vdev no longer has a
  * trimming thread, then clears the pending exit request.
  */
 static void
 vdev_trim_stop_wait_impl(vdev_t *vd)
 {
 	ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
 
 	/* The exiting thread broadcasts vdev_trim_cv after clearing itself. */
 	for (;;) {
 		if (vd->vdev_trim_thread == NULL)
 			break;
 
 		cv_wait(&vd->vdev_trim_cv, &vd->vdev_trim_lock);
 	}
 
 	ASSERT3P(vd->vdev_trim_thread, ==, NULL);
 	vd->vdev_trim_exit_wanted = B_FALSE;
 }
 
 /*
  * Drains vd_list, waiting in turn for the trim thread of every vdev on
  * it to exit.  The list is empty on return.
  */
 void
 vdev_trim_stop_wait(spa_t *spa, list_t *vd_list)
 {
 	(void) spa;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 	    spa->spa_export_thread == curthread);
 
 	for (vdev_t *stopping = list_remove_head(vd_list); stopping != NULL;
 	    stopping = list_remove_head(vd_list)) {
 		mutex_enter(&stopping->vdev_trim_lock);
 		vdev_trim_stop_wait_impl(stopping);
 		mutex_exit(&stopping->vdev_trim_lock);
 	}
 }
 
 /*
  * Stop trimming a device, with the resultant trimming state being tgt_state.
  * For blocking behavior pass NULL for vd_list.  Otherwise, when a list_t is
  * provided the stopping vdev is inserted in to the list.  Callers are then
  * required to call vdev_trim_stop_wait() to block for all the trim threads
  * to exit.  The caller must hold vdev_trim_lock and must not be writing to
  * the spa config, as the trimming thread may try to enter the config as a
  * reader before exiting.
  */
 void
 vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list)
 {
 	ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
 	ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 	ASSERT(vdev_is_concrete(vd));
 
 	/*
 	 * Allow cancel requests to proceed even if the trim thread has
 	 * stopped.
 	 */
 	if (vd->vdev_trim_thread == NULL && tgt_state != VDEV_TRIM_CANCELED)
 		return;
 
 	/* Record the target state, then flag that an exit is wanted. */
 	vdev_trim_change_state(vd, tgt_state, 0, 0, 0);
 	vd->vdev_trim_exit_wanted = B_TRUE;
 
 	if (vd_list == NULL) {
 		/* Blocking mode: wait here for the thread to exit. */
 		vdev_trim_stop_wait_impl(vd);
 	} else {
 		/* Deferred mode: the caller waits on the whole list later. */
 		ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 		    vd->vdev_spa->spa_export_thread == curthread);
 		list_insert_tail(vd_list, vd);
 	}
 }
 
 /*
  * Walks the vdev tree rooted at vd and asks every concrete leaf to stop
  * trimming with the resulting state tgt_state.  vd_list is handed on to
  * vdev_trim_stop() unchanged.  The walk does not descend below a
  * concrete leaf.
  */
 static void
 vdev_trim_stop_all_impl(vdev_t *vd, vdev_trim_state_t tgt_state,
     list_t *vd_list)
 {
 	if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
 		/* Not a concrete leaf: recurse into each child instead. */
 		for (uint64_t c = 0; c < vd->vdev_children; c++) {
 			vdev_trim_stop_all_impl(vd->vdev_child[c], tgt_state,
 			    vd_list);
 		}
 		return;
 	}
 
 	mutex_enter(&vd->vdev_trim_lock);
 	vdev_trim_stop(vd, tgt_state, vd_list);
 	mutex_exit(&vd->vdev_trim_lock);
 }
 
 /*
  * Convenience function to stop trimming of a vdev tree and set all trim
  * thread pointers to NULL.
  *
  * Stop requests are first sent to every concrete leaf below vd and to the
  * pool's cache devices, and only then are the trim threads waited for.
  * tgt_state is the trim state the stopped vdevs are left in.
  */
 void
 vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state)
 {
 	spa_t *spa = vd->vdev_spa;
 	list_t vd_list;
 	vdev_t *vd_l2cache;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 	    spa->spa_export_thread == curthread);
 
 	/* Stopping vdevs are linked through their vdev_trim_node. */
 	list_create(&vd_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_trim_node));
 
 	vdev_trim_stop_all_impl(vd, tgt_state, &vd_list);
 
 	/*
 	 * Iterate over cache devices and request stop trimming the
 	 * whole device in case we export the pool or remove the cache
 	 * device prematurely.
 	 */
 	for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
 		vd_l2cache = spa->spa_l2cache.sav_vdevs[i];
 		vdev_trim_stop_all_impl(vd_l2cache, tgt_state, &vd_list);
 	}
 
 	/* Block until every listed trim thread has exited. */
 	vdev_trim_stop_wait(spa, &vd_list);
 
 	if (vd->vdev_spa->spa_sync_on) {
 		/* Make sure that our state has been synced to disk */
 		txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
 	}
 
 	list_destroy(&vd_list);
 }
 
 /*
  * Conditionally restarts a manual TRIM given its on-disk state.
  *
  * For vd and, recursively, all of its children: when the vdev has a leaf
  * zap, the trim state and action time are read from it.  A suspended or
  * offline vdev only has its progress loaded for reporting; an active one
  * is restarted provided it is writeable, has no trim thread yet, and its
  * top-level vdev is neither being removed nor expanding.
  */
 void
 vdev_trim_restart(vdev_t *vd)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 	    vd->vdev_spa->spa_load_thread == curthread);
 	ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
 
 	if (vd->vdev_leaf_zap != 0) {
 		mutex_enter(&vd->vdev_trim_lock);
 		/* A missing entry leaves the VDEV_TRIM_NONE default. */
 		uint64_t trim_state = VDEV_TRIM_NONE;
 		int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE,
 		    sizeof (trim_state), 1, &trim_state);
 		ASSERT(err == 0 || err == ENOENT);
 		vd->vdev_trim_state = trim_state;
 
 		/* Likewise, a missing action time reads as zero. */
 		uint64_t timestamp = 0;
 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_ACTION_TIME,
 		    sizeof (timestamp), 1, &timestamp);
 		ASSERT(err == 0 || err == ENOENT);
 		vd->vdev_trim_action_time = timestamp;
 
 		if ((vd->vdev_trim_state == VDEV_TRIM_SUSPENDED ||
 		    vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) {
 			/* load progress for reporting, but don't resume */
 			VERIFY0(vdev_trim_load(vd));
 		} else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE &&
 		    vdev_writeable(vd) && !vd->vdev_top->vdev_removing &&
 		    !vd->vdev_top->vdev_rz_expanding &&
 		    vd->vdev_trim_thread == NULL) {
 			/* Resume with the options that were stored on disk. */
 			VERIFY0(vdev_trim_load(vd));
 			vdev_trim(vd, vd->vdev_trim_rate,
 			    vd->vdev_trim_partial, vd->vdev_trim_secure);
 		}
 
 		mutex_exit(&vd->vdev_trim_lock);
 	}
 
 	for (uint64_t i = 0; i < vd->vdev_children; i++) {
 		vdev_trim_restart(vd->vdev_child[i]);
 	}
 }
 
 /*
  * Used by the automatic TRIM when ZFS_DEBUG_TRIM is set to verify that
  * every TRIM range is contained within ms_allocatable.
  */
 static void
 vdev_trim_range_verify(void *arg, uint64_t start, uint64_t size)
 {
 	trim_args_t *ta = arg;
 	metaslab_t *msp = ta->trim_msp;
 
 	VERIFY3B(msp->ms_loaded, ==, B_TRUE);
 	VERIFY3U(msp->ms_disabled, >, 0);
 	VERIFY(zfs_range_tree_contains(msp->ms_allocatable, start, size));
 }
 
 /*
  * Each automatic TRIM thread is responsible for managing the trimming of a
  * top-level vdev in the pool.  No automatic TRIM state is maintained on-disk.
  *
  * N.B. This behavior is different from a manual TRIM where a thread
  * is created for each leaf vdev, instead of each top-level vdev.
  */
 static __attribute__((noreturn)) void
 vdev_autotrim_thread(void *arg)
 {
 	vdev_t *vd = arg;
 	spa_t *spa = vd->vdev_spa;
 	int shift = 0;
 
 	mutex_enter(&vd->vdev_autotrim_lock);
 	ASSERT3P(vd->vdev_top, ==, vd);
 	ASSERT3P(vd->vdev_autotrim_thread, !=, NULL);
 	mutex_exit(&vd->vdev_autotrim_lock);
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	while (!vdev_autotrim_should_stop(vd)) {
 		int txgs_per_trim = MAX(zfs_trim_txg_batch, 1);
 		uint64_t extent_bytes_max = zfs_trim_extent_bytes_max;
 		uint64_t extent_bytes_min = zfs_trim_extent_bytes_min;
 
 		/*
 		 * All of the metaslabs are divided in to groups of size
 		 * num_metaslabs / zfs_trim_txg_batch.  Each of these groups
 		 * is composed of metaslabs which are spread evenly over the
 		 * device.
 		 *
 		 * For example, when zfs_trim_txg_batch = 32 (default) then
 		 * group 0 will contain metaslabs 0, 32, 64, ...;
 		 * group 1 will contain metaslabs 1, 33, 65, ...;
 		 * group 2 will contain metaslabs 2, 34, 66, ...; and so on.
 		 *
 		 * On each pass through the while() loop one of these groups
 		 * is selected.  This is accomplished by using a shift value
 		 * to select the starting metaslab, then striding over the
 		 * metaslabs using the zfs_trim_txg_batch size.  This is
 		 * done to accomplish two things.
 		 *
 		 * 1) By dividing the metaslabs in to groups, and making sure
 		 *    that each group takes a minimum of one txg to process.
 		 *    Then zfs_trim_txg_batch controls the minimum number of
 		 *    txgs which must occur before a metaslab is revisited.
 		 *
 		 * 2) Selecting non-consecutive metaslabs distributes the
 		 *    TRIM commands for a group evenly over the entire device.
 		 *    This can be advantageous for certain types of devices.
 		 */
 		for (uint64_t i = shift % txgs_per_trim; i < vd->vdev_ms_count;
 		    i += txgs_per_trim) {
 			metaslab_t *msp = vd->vdev_ms[i];
 			zfs_range_tree_t *trim_tree;
 			boolean_t issued_trim = B_FALSE;
 			boolean_t wait_aborted = B_FALSE;
 
 			spa_config_exit(spa, SCL_CONFIG, FTAG);
 			metaslab_disable(msp);
 			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 			mutex_enter(&msp->ms_lock);
 
 			/*
 			 * Skip the metaslab when it has never been allocated
 			 * or when there are no recent frees to trim.
 			 */
 			if (msp->ms_sm == NULL ||
 			    zfs_range_tree_is_empty(msp->ms_trim)) {
 				mutex_exit(&msp->ms_lock);
 				metaslab_enable(msp, B_FALSE, B_FALSE);
 				continue;
 			}
 
 			/*
 			 * Skip the metaslab when it has already been disabled.
 			 * This may happen when a manual TRIM or initialize
 			 * operation is running concurrently.  In the case
 			 * of a manual TRIM, the ms_trim tree will have been
 			 * vacated.  Only ranges added after the manual TRIM
 			 * disabled the metaslab will be included in the tree.
 			 * These will be processed when the automatic TRIM
 			 * next revisits this metaslab.
 			 */
 			if (msp->ms_disabled > 1) {
 				mutex_exit(&msp->ms_lock);
 				metaslab_enable(msp, B_FALSE, B_FALSE);
 				continue;
 			}
 
 			/*
 			 * Allocate an empty range tree which is swapped in
 			 * for the existing ms_trim tree while it is processed.
 			 */
 			trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
 			    NULL, 0, 0);
 			zfs_range_tree_swap(&msp->ms_trim, &trim_tree);
 			ASSERT(zfs_range_tree_is_empty(msp->ms_trim));
 
 			/*
 			 * There are two cases when constructing the per-vdev
 			 * trim trees for a metaslab.  If the top-level vdev
 			 * has no children then it is also a leaf and should
 			 * be trimmed.  Otherwise our children are the leaves
 			 * and a trim tree should be constructed for each.
 			 */
 			trim_args_t *tap;
 			uint64_t children = vd->vdev_children;
 			if (children == 0) {
 				children = 1;
 				tap = kmem_zalloc(sizeof (trim_args_t) *
 				    children, KM_SLEEP);
 				tap[0].trim_vdev = vd;
 			} else {
 				tap = kmem_zalloc(sizeof (trim_args_t) *
 				    children, KM_SLEEP);
 
 				for (uint64_t c = 0; c < children; c++) {
 					tap[c].trim_vdev = vd->vdev_child[c];
 				}
 			}
 
 			for (uint64_t c = 0; c < children; c++) {
 				trim_args_t *ta = &tap[c];
 				vdev_t *cvd = ta->trim_vdev;
 
 				ta->trim_msp = msp;
 				ta->trim_extent_bytes_max = extent_bytes_max;
 				ta->trim_extent_bytes_min = extent_bytes_min;
 				ta->trim_type = TRIM_TYPE_AUTO;
 				ta->trim_flags = 0;
 
 				if (cvd->vdev_detached ||
 				    !vdev_writeable(cvd) ||
 				    !cvd->vdev_has_trim ||
 				    cvd->vdev_trim_thread != NULL) {
 					continue;
 				}
 
 				/*
 				 * When a device has an attached hot spare, or
 				 * is being replaced it will not be trimmed.
 				 * This is done to avoid adding additional
 				 * stress to a potentially unhealthy device,
 				 * and to minimize the required rebuild time.
 				 */
 				if (!cvd->vdev_ops->vdev_op_leaf)
 					continue;
 
 				ta->trim_tree = zfs_range_tree_create(NULL,
 				    ZFS_RANGE_SEG64, NULL, 0, 0);
 				zfs_range_tree_walk(trim_tree,
 				    vdev_trim_range_add, ta);
 			}
 
 			mutex_exit(&msp->ms_lock);
 			spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 			/*
 			 * Issue the TRIM I/Os for all ranges covered by the
 			 * TRIM trees.  These ranges are safe to TRIM because
 			 * no new allocations will be performed until the call
 			 * to metaslab_enabled() below.
 			 */
 			for (uint64_t c = 0; c < children; c++) {
 				trim_args_t *ta = &tap[c];
 
 				/*
 				 * Always yield to a manual TRIM if one has
 				 * been started for the child vdev.
 				 */
 				if (ta->trim_tree == NULL ||
 				    ta->trim_vdev->vdev_trim_thread != NULL) {
 					continue;
 				}
 
 				/*
 				 * After this point metaslab_enable() must be
 				 * called with the sync flag set.  This is done
 				 * here because vdev_trim_ranges() is allowed
 				 * to be interrupted (EINTR) before issuing all
 				 * of the required TRIM I/Os.
 				 */
 				issued_trim = B_TRUE;
 
 				int error = vdev_trim_ranges(ta);
 				if (error)
 					break;
 			}
 
 			/*
 			 * Verify every range which was trimmed is still
 			 * contained within the ms_allocatable tree.
 			 */
 			if (zfs_flags & ZFS_DEBUG_TRIM) {
 				mutex_enter(&msp->ms_lock);
 				VERIFY0(metaslab_load(msp));
 				VERIFY3P(tap[0].trim_msp, ==, msp);
 				zfs_range_tree_walk(trim_tree,
 				    vdev_trim_range_verify, &tap[0]);
 				mutex_exit(&msp->ms_lock);
 			}
 
 			zfs_range_tree_vacate(trim_tree, NULL, NULL);
 			zfs_range_tree_destroy(trim_tree);
 
 			/*
 			 * Wait for couples of kicks, to ensure the trim io is
 			 * synced. If the wait is aborted due to
 			 * vdev_autotrim_exit_wanted, we need to signal
 			 * metaslab_enable() to wait for sync.
 			 */
 			if (issued_trim) {
 				wait_aborted = vdev_autotrim_wait_kick(vd,
 				    TXG_CONCURRENT_STATES + TXG_DEFER_SIZE);
 			}
 
 			metaslab_enable(msp, wait_aborted, B_FALSE);
 			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 			for (uint64_t c = 0; c < children; c++) {
 				trim_args_t *ta = &tap[c];
 
 				if (ta->trim_tree == NULL)
 					continue;
 
 				zfs_range_tree_vacate(ta->trim_tree, NULL,
 				    NULL);
 				zfs_range_tree_destroy(ta->trim_tree);
 			}
 
 			kmem_free(tap, sizeof (trim_args_t) * children);
 
 			if (vdev_autotrim_should_stop(vd))
 				break;
 		}
 
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 		vdev_autotrim_wait_kick(vd, 1);
 
 		shift++;
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	}
 
 	for (uint64_t c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 		mutex_enter(&cvd->vdev_trim_io_lock);
 
 		while (cvd->vdev_trim_inflight[1] > 0) {
 			cv_wait(&cvd->vdev_trim_io_cv,
 			    &cvd->vdev_trim_io_lock);
 		}
 		mutex_exit(&cvd->vdev_trim_io_lock);
 	}
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	/*
 	 * When exiting because the autotrim property was set to off, then
 	 * abandon any unprocessed ms_trim ranges to reclaim the memory.
 	 */
 	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_OFF) {
 		for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
 			metaslab_t *msp = vd->vdev_ms[i];
 
 			mutex_enter(&msp->ms_lock);
 			zfs_range_tree_vacate(msp->ms_trim, NULL, NULL);
 			mutex_exit(&msp->ms_lock);
 		}
 	}
 
 	mutex_enter(&vd->vdev_autotrim_lock);
 	ASSERT(vd->vdev_autotrim_thread != NULL);
 	vd->vdev_autotrim_thread = NULL;
 	cv_broadcast(&vd->vdev_autotrim_cv);
 	mutex_exit(&vd->vdev_autotrim_lock);
 
 	thread_exit();
 }
 
 /*
  * Launch an autotrim thread for every top-level vdev (each child of the
  * root vdev) which does not already have one.  A vdev is passed over when
  * it is not writeable, is being removed, or has vdev_rz_expanding set.
  */
 void
 vdev_autotrim(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *top = rvd->vdev_child[c];
 
 		mutex_enter(&top->vdev_autotrim_lock);
 
 		/* Ineligible, or a thread is already running. */
 		if (!vdev_writeable(top) || top->vdev_removing ||
 		    top->vdev_autotrim_thread != NULL ||
 		    top->vdev_rz_expanding) {
 			mutex_exit(&top->vdev_autotrim_lock);
 			continue;
 		}
 
 		ASSERT3P(top->vdev_top, ==, top);
 
 		top->vdev_autotrim_thread = thread_create(NULL, 0,
 		    vdev_autotrim_thread, top, 0, &p0, TS_RUN, maxclsyspri);
 		ASSERT(top->vdev_autotrim_thread != NULL);
 
 		mutex_exit(&top->vdev_autotrim_lock);
 	}
 }
 
 /*
  * Wait for the vdev_autotrim_thread associated with the passed top-level
  * vdev to be terminated (canceled or stopped).
  *
  * When a thread is running, vdev_autotrim_exit_wanted is set and the kick
  * condition variable is broadcast so a thread sleeping between passes
  * notices the request.  The caller then sleeps on vdev_autotrim_cv until
  * the thread has cleared vdev_autotrim_thread.  Returns immediately when
  * no thread is running.
  */
 void
 vdev_autotrim_stop_wait(vdev_t *tvd)
 {
 	mutex_enter(&tvd->vdev_autotrim_lock);
 	if (tvd->vdev_autotrim_thread != NULL) {
 		tvd->vdev_autotrim_exit_wanted = B_TRUE;
 		cv_broadcast(&tvd->vdev_autotrim_kick_cv);
 
 		/*
 		 * Re-check the predicate after every wakeup.  A condition
 		 * variable wait may return without the thread having
 		 * exited, so a single cv_wait() is not sufficient.
 		 */
 		while (tvd->vdev_autotrim_thread != NULL) {
 			cv_wait(&tvd->vdev_autotrim_cv,
 			    &tvd->vdev_autotrim_lock);
 		}
 
 		ASSERT3P(tvd->vdev_autotrim_thread, ==, NULL);
 		tvd->vdev_autotrim_exit_wanted = B_FALSE;
 	}
 	mutex_exit(&tvd->vdev_autotrim_lock);
 }
 
 /*
  * Wake the autotrim thread, if one is running, of every child of the root
  * vdev by broadcasting on its vdev_autotrim_kick_cv.  The caller is
  * asserted to hold SCL_CONFIG as reader.
  */
 void
 vdev_autotrim_kick(spa_t *spa)
 {
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *top = rvd->vdev_child[c];
 
 		mutex_enter(&top->vdev_autotrim_lock);
 		if (top->vdev_autotrim_thread != NULL) {
 			cv_broadcast(&top->vdev_autotrim_kick_cv);
 		}
 		mutex_exit(&top->vdev_autotrim_lock);
 	}
 }
 
 /*
  * Stop every autotrim thread in the pool.  Each child of the root vdev is
  * handed to vdev_autotrim_stop_wait() in turn, so this returns only after
  * all of them have been waited on.
  */
 void
 vdev_autotrim_stop_all(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t c = 0;
 
 	while (c < rvd->vdev_children) {
 		vdev_autotrim_stop_wait(rvd->vdev_child[c]);
 		c++;
 	}
 }
 
 /*
  * Start the pool's autotrim threads again, but only when spa_autotrim is
  * set.  The caller must hold spa_namespace_lock or be the spa load thread.
  */
 void
 vdev_autotrim_restart(spa_t *spa)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 	    spa->spa_load_thread == curthread);
 
 	/* Nothing to restart while spa_autotrim is clear. */
 	if (!spa->spa_autotrim)
 		return;
 
 	vdev_autotrim(spa);
 }
 
 /*
  * Thread body which TRIMs an entire L2ARC (cache) device.
  *
  * arg is the cache vdev.  A single range covering [0, vdev_get_min_asize())
  * is placed in a private range tree and issued through vdev_trim_ranges()
  * as TRIM_TYPE_MANUAL.  Once all in-flight manual TRIM I/Os have drained
  * the state is moved to VDEV_TRIM_COMPLETE (unless an exit was requested
  * or the vdev is no longer writeable), the on-disk L2ARC device header is
  * zeroed and rewritten, vdev_trim_thread is cleared, and waiters on
  * vdev_trim_cv are woken.  The function never returns; it ends in
  * thread_exit().
  */
 static __attribute__((noreturn)) void
 vdev_trim_l2arc_thread(void *arg)
 {
 	vdev_t		*vd = arg;
 	spa_t		*spa = vd->vdev_spa;
 	l2arc_dev_t	*dev = l2arc_vdev_get(vd);
 	trim_args_t	ta = {0};
-	range_seg64_t 	physical_rs;
+	zfs_range_seg64_t 	physical_rs;
 
 	ASSERT(vdev_is_concrete(vd));
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	/* Start from a clean slate: no saved offset, rate or flags. */
 	vd->vdev_trim_last_offset = 0;
 	vd->vdev_trim_rate = 0;
 	vd->vdev_trim_partial = 0;
 	vd->vdev_trim_secure = 0;
 
 	ta.trim_vdev = vd;
 	ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
 	ta.trim_type = TRIM_TYPE_MANUAL;
 	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
 	ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
 	ta.trim_flags = 0;
 
 	/*
 	 * The range to TRIM spans the whole device; the progress counters
 	 * are reset and the estimate is set to the same size.
 	 */
 	physical_rs.rs_start = vd->vdev_trim_bytes_done = 0;
 	physical_rs.rs_end = vd->vdev_trim_bytes_est =
 	    vdev_get_min_asize(vd);
 
 	zfs_range_tree_add(ta.trim_tree, physical_rs.rs_start,
 	    physical_rs.rs_end - physical_rs.rs_start);
 
 	mutex_enter(&vd->vdev_trim_lock);
 	vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0);
 	mutex_exit(&vd->vdev_trim_lock);
 
 	/* The result is deliberately ignored; state is evaluated below. */
 	(void) vdev_trim_ranges(&ta);
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	/* Wait for every outstanding manual TRIM I/O to complete. */
 	mutex_enter(&vd->vdev_trim_io_lock);
 	while (vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] > 0) {
 		cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
 	}
 	mutex_exit(&vd->vdev_trim_io_lock);
 
 	zfs_range_tree_vacate(ta.trim_tree, NULL, NULL);
 	zfs_range_tree_destroy(ta.trim_tree);
 
 	mutex_enter(&vd->vdev_trim_lock);
 	/* Only report completion when not asked to exit and still writeable. */
 	if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) {
 		vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE,
 		    vd->vdev_trim_rate, vd->vdev_trim_partial,
 		    vd->vdev_trim_secure);
 	}
 	ASSERT(vd->vdev_trim_thread != NULL ||
 	    vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] == 0);
 
 	/*
 	 * Drop the vdev_trim_lock while we sync out the txg since it's
 	 * possible that a device might be trying to come online and
 	 * must check to see if it needs to restart a trim. That thread
 	 * will be holding the spa_config_lock which would prevent the
 	 * txg_wait_synced from completing. Same strategy as in
 	 * vdev_trim_thread().
 	 */
 	mutex_exit(&vd->vdev_trim_lock);
 	txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
 	mutex_enter(&vd->vdev_trim_lock);
 
 	/*
 	 * Update the header of the cache device here, before
 	 * broadcasting vdev_trim_cv which may lead to the removal
 	 * of the device. The same applies for setting l2ad_trim_all to
 	 * false.
 	 */
 	spa_config_enter(vd->vdev_spa, SCL_L2ARC, vd,
 	    RW_READER);
 	memset(dev->l2ad_dev_hdr, 0, dev->l2ad_dev_hdr_asize);
 	l2arc_dev_hdr_update(dev);
 	spa_config_exit(vd->vdev_spa, SCL_L2ARC, vd);
 
 	vd->vdev_trim_thread = NULL;
 	/* A completed whole-device TRIM need not be repeated. */
 	if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE)
 		dev->l2ad_trim_all = B_FALSE;
 
 	cv_broadcast(&vd->vdev_trim_cv);
 	mutex_exit(&vd->vdev_trim_lock);
 
 	thread_exit();
 }
 
 /*
  * Create a whole-device TRIM thread for each of the pool's L2ARC devices
  * that is marked l2ad_trim_all, and record it in vd->vdev_trim_thread.
  * The TRIM is performed as TRIM_TYPE_MANUAL by vdev_trim_l2arc_thread();
  * this is used when a cache device is added, when the pool is created,
  * or when the device header is invalid.
  */
 void
 vdev_trim_l2arc(spa_t *spa)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	/* Visit each cache device and start a thread where one is needed. */
 	for (int idx = 0; idx < spa->spa_l2cache.sav_count; idx++) {
 		vdev_t *cache_vd = spa->spa_l2cache.sav_vdevs[idx];
 		l2arc_dev_t *l2dev = l2arc_vdev_get(cache_vd);
 
 		/*
 		 * TRIM is only attempted when the device was found and is
 		 * marked for whole device TRIM.  It is not attempted if
 		 * the vdev is UNAVAIL, nor if the mark is absent (ie
 		 * l2arc_trim_ahead = 0, or the L2ARC device header is
 		 * valid with trim_state = VDEV_TRIM_COMPLETE and
 		 * l2ad_log_entries > 0).
 		 */
 		if (l2dev != NULL && l2dev->l2ad_trim_all) {
 			mutex_enter(&cache_vd->vdev_trim_lock);
 
 			ASSERT(cache_vd->vdev_ops->vdev_op_leaf);
 			ASSERT(vdev_is_concrete(cache_vd));
 			ASSERT3P(cache_vd->vdev_trim_thread, ==, NULL);
 			ASSERT(!cache_vd->vdev_detached);
 			ASSERT(!cache_vd->vdev_trim_exit_wanted);
 			ASSERT(!cache_vd->vdev_top->vdev_removing);
 
 			vdev_trim_change_state(cache_vd, VDEV_TRIM_ACTIVE,
 			    0, 0, 0);
 			cache_vd->vdev_trim_thread = thread_create(NULL, 0,
 			    vdev_trim_l2arc_thread, cache_vd, 0, &p0, TS_RUN,
 			    maxclsyspri);
 
 			mutex_exit(&cache_vd->vdev_trim_lock);
 		}
 	}
 }
 
 /*
  * A wrapper which calls vdev_trim_ranges(). It is intended to be called
  * on leaf vdevs.
  *
  * TRIMs the single range [start, start + size) on vd as TRIM_TYPE_SIMPLE
  * and does not return until all in-flight TRIM_TYPE_SIMPLE I/Os for the
  * vdev have completed.  A size of zero results in an empty range tree
  * being passed to vdev_trim_ranges().
  *
  * Returns the value returned by vdev_trim_ranges().
  */
 int
 vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size)
 {
 	trim_args_t ta = {0};
-	range_seg64_t physical_rs;
+	zfs_range_seg64_t physical_rs;
 	int error;
 	physical_rs.rs_start = start;
 	physical_rs.rs_end = start + size;
 
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 	ASSERT(!vd->vdev_detached);
 	ASSERT(!vd->vdev_top->vdev_removing);
 	ASSERT(!vd->vdev_top->vdev_rz_expanding);
 
 	ta.trim_vdev = vd;
 	ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
 	ta.trim_type = TRIM_TYPE_SIMPLE;
 	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
 	ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
 	ta.trim_flags = 0;
 
 	/* Catches start + size wrapping around in debug builds. */
 	ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
 
 	/* Only a non-empty range is added to the tree. */
 	if (physical_rs.rs_end > physical_rs.rs_start) {
 		zfs_range_tree_add(ta.trim_tree, physical_rs.rs_start,
 		    physical_rs.rs_end - physical_rs.rs_start);
 	} else {
 		ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
 	}
 
 	error = vdev_trim_ranges(&ta);
 
 	/* Drain outstanding simple TRIM I/Os before tearing down the tree. */
 	mutex_enter(&vd->vdev_trim_io_lock);
 	while (vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE] > 0) {
 		cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
 	}
 	mutex_exit(&vd->vdev_trim_io_lock);
 
 	zfs_range_tree_vacate(ta.trim_tree, NULL, NULL);
 	zfs_range_tree_destroy(ta.trim_tree);
 
 	return (error);
 }
 
 /* Public TRIM entry points made available through EXPORT_SYMBOL(). */
 EXPORT_SYMBOL(vdev_trim);
 EXPORT_SYMBOL(vdev_trim_stop);
 EXPORT_SYMBOL(vdev_trim_stop_all);
 EXPORT_SYMBOL(vdev_trim_stop_wait);
 EXPORT_SYMBOL(vdev_trim_restart);
 EXPORT_SYMBOL(vdev_autotrim);
 EXPORT_SYMBOL(vdev_autotrim_stop_all);
 EXPORT_SYMBOL(vdev_autotrim_stop_wait);
 EXPORT_SYMBOL(vdev_autotrim_restart);
 EXPORT_SYMBOL(vdev_trim_l2arc);
 EXPORT_SYMBOL(vdev_trim_simple);
 
 /*
  * TRIM tunables, each registered with the zfs_trim_ prefix and a one line
  * description.  NOTE(review): ZMOD_RW presumably marks a parameter as
  * writable at runtime -- confirm against the ZFS_MODULE_PARAM definition.
  */
 ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_max, UINT, ZMOD_RW,
 	"Max size of TRIM commands, larger will be split");
 
 ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_min, UINT, ZMOD_RW,
 	"Min size of TRIM commands, smaller will be skipped");
 
 ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, metaslab_skip, UINT, ZMOD_RW,
 	"Skip metaslabs which have never been initialized");
 
 ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, txg_batch, UINT, ZMOD_RW,
 	"Min number of txgs to aggregate frees before issuing TRIM");
 
 ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, queue_limit, UINT, ZMOD_RW,
 	"Max queued TRIMs outstanding per leaf vdev");