diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 4880c8048726..449b6bf2ccb3 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -1,9473 +1,9474 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2016 Nexenta Systems, Inc.
  * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC.
  * Copyright (c) 2015, 2017, Intel Corporation.
  * Copyright (c) 2020 Datto Inc.
  * Copyright (c) 2020, The FreeBSD Foundation [1]
  *
  * [1] Portions of this software were developed by Allan Jude
  *     under sponsorship from the FreeBSD Foundation.
  * Copyright (c) 2021 Allan Jude
  * Copyright (c) 2021 Toomas Soome <tsoome@me.com>
  * Copyright (c) 2023, Klara Inc.
  * Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
  */
 
 #include <stdio.h>
 #include <unistd.h>
 #include <stdlib.h>
 #include <ctype.h>
 #include <getopt.h>
 #include <openssl/evp.h>
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_sa.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab_impl.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_bookmark.h>
 #include <sys/dbuf.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
 #include <sys/stat.h>
 #include <sys/resource.h>
 #include <sys/dmu_send.h>
 #include <sys/dmu_traverse.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/zfs_fuid.h>
 #include <sys/arc.h>
 #include <sys/arc_impl.h>
 #include <sys/ddt.h>
 #include <sys/ddt_impl.h>
 #include <sys/zfeature.h>
 #include <sys/abd.h>
 #include <sys/blkptr.h>
 #include <sys/dsl_crypt.h>
 #include <sys/dsl_scan.h>
 #include <sys/btree.h>
 #include <sys/brt.h>
 #include <sys/brt_impl.h>
 #include <zfs_comutil.h>
 #include <sys/zstd/zstd.h>
 
 #include <libnvpair.h>
 #include <libzutil.h>
 
 #include <libzdb.h>
 
 #include "zdb.h"
 
 
 extern int reference_tracking_enable;
 extern int zfs_recover;
 extern uint_t zfs_vdev_async_read_max_active;
 extern boolean_t spa_load_verify_dryrun;
 extern boolean_t spa_mode_readable_spacemaps;
 extern uint_t zfs_reconstruct_indirect_combinations_max;
 extern uint_t zfs_btree_verify_intensity;
 
 static const char cmdname[] = "zdb";
 uint8_t dump_opt[256];
 
 typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
 
 static uint64_t *zopt_metaslab = NULL;
 static unsigned zopt_metaslab_args = 0;
 
 
 static zopt_object_range_t *zopt_object_ranges = NULL;
 static unsigned zopt_object_args = 0;
 
 static int flagbits[256];
 
 
 static uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */
 static int leaked_objects = 0;
 static range_tree_t *mos_refd_objs;
 
 static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,
     boolean_t);
 static void mos_obj_refd(uint64_t);
 static void mos_obj_refd_multiple(uint64_t);
 static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free,
     dmu_tx_t *tx);
 
 
 
 static void zdb_print_blkptr(const blkptr_t *bp, int flags);
 
 typedef struct sublivelist_verify_block_refcnt {
 	/* block pointer entry in livelist being verified */
 	blkptr_t svbr_blk;
 
 	/*
 	 * Refcount gets incremented to 1 when we encounter the first
 	 * FREE entry for the svfbr block pointer and a node for it
 	 * is created in our ZDB verification/tracking metadata.
 	 *
 	 * As we encounter more FREE entries we increment this counter
 	 * and similarly decrement it whenever we find the respective
 	 * ALLOC entries for this block.
 	 *
 	 * When the refcount gets to 0 it means that all the FREE and
 	 * ALLOC entries of this block have paired up and we no longer
 	 * need to track it in our verification logic (e.g. the node
 	 * containing this struct in our verification data structure
 	 * should be freed).
 	 *
 	 * [refer to sublivelist_verify_blkptr() for the actual code]
 	 */
 	uint32_t svbr_refcnt;
 } sublivelist_verify_block_refcnt_t;
 
 static int
 sublivelist_block_refcnt_compare(const void *larg, const void *rarg)
 {
 	const sublivelist_verify_block_refcnt_t *l = larg;
 	const sublivelist_verify_block_refcnt_t *r = rarg;
 	return (livelist_compare(&l->svbr_blk, &r->svbr_blk));
 }
 
 static int
 sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
     dmu_tx_t *tx)
 {
 	ASSERT3P(tx, ==, NULL);
 	struct sublivelist_verify *sv = arg;
 	sublivelist_verify_block_refcnt_t current = {
 			.svbr_blk = *bp,
 
 			/*
 			 * Start with 1 in case this is the first free entry.
 			 * This field is not used for our B-Tree comparisons
 			 * anyway.
 			 */
 			.svbr_refcnt = 1,
 	};
 
 	zfs_btree_index_t where;
 	sublivelist_verify_block_refcnt_t *pair =
 	    zfs_btree_find(&sv->sv_pair, &current, &where);
 	if (free) {
 		if (pair == NULL) {
 			/* first free entry for this block pointer */
 			zfs_btree_add(&sv->sv_pair, &current);
 		} else {
 			pair->svbr_refcnt++;
 		}
 	} else {
 		if (pair == NULL) {
 			/* block that is currently marked as allocated */
 			for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
 				if (DVA_IS_EMPTY(&bp->blk_dva[i]))
 					break;
 				sublivelist_verify_block_t svb = {
 				    .svb_dva = bp->blk_dva[i],
-				    .svb_allocated_txg = bp->blk_birth
+				    .svb_allocated_txg =
+				    BP_GET_LOGICAL_BIRTH(bp)
 				};
 
 				if (zfs_btree_find(&sv->sv_leftover, &svb,
 				    &where) == NULL) {
 					zfs_btree_add_idx(&sv->sv_leftover,
 					    &svb, &where);
 				}
 			}
 		} else {
 			/* alloc matches a free entry */
 			pair->svbr_refcnt--;
 			if (pair->svbr_refcnt == 0) {
 				/* all allocs and frees have been matched */
 				zfs_btree_remove_idx(&sv->sv_pair, &where);
 			}
 		}
 	}
 
 	return (0);
 }
 
 static int
 sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
 {
 	int err;
 	struct sublivelist_verify *sv = args;
 
 	zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, NULL,
 	    sizeof (sublivelist_verify_block_refcnt_t));
 
 	err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
 	    sv, NULL);
 
 	sublivelist_verify_block_refcnt_t *e;
 	zfs_btree_index_t *cookie = NULL;
 	while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) {
 		char blkbuf[BP_SPRINTF_LEN];
 		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),
 		    &e->svbr_blk, B_TRUE);
 		(void) printf("\tERROR: %d unmatched FREE(s): %s\n",
 		    e->svbr_refcnt, blkbuf);
 	}
 	zfs_btree_destroy(&sv->sv_pair);
 
 	return (err);
 }
 
 static int
 livelist_block_compare(const void *larg, const void *rarg)
 {
 	const sublivelist_verify_block_t *l = larg;
 	const sublivelist_verify_block_t *r = rarg;
 
 	if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva))
 		return (-1);
 	else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva))
 		return (+1);
 
 	if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva))
 		return (-1);
 	else if (DVA_GET_OFFSET(&l->svb_dva) > DVA_GET_OFFSET(&r->svb_dva))
 		return (+1);
 
 	if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva))
 		return (-1);
 	else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva))
 		return (+1);
 
 	return (0);
 }
 
 /*
  * Check for errors in a livelist while tracking all unfreed ALLOCs in the
  * sublivelist_verify_t: sv->sv_leftover
  */
 static void
 livelist_verify(dsl_deadlist_t *dl, void *arg)
 {
 	sublivelist_verify_t *sv = arg;
 	dsl_deadlist_iterate(dl, sublivelist_verify_func, sv);
 }
 
 /*
  * Check for errors in the livelist entry and discard the intermediary
  * data structures
  */
 static int
 sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle)
 {
 	(void) args;
 	sublivelist_verify_t sv;
 	zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
 	    sizeof (sublivelist_verify_block_t));
 	int err = sublivelist_verify_func(&sv, dle);
 	zfs_btree_clear(&sv.sv_leftover);
 	zfs_btree_destroy(&sv.sv_leftover);
 	return (err);
 }
 
 typedef struct metaslab_verify {
 	/*
 	 * Tree containing all the leftover ALLOCs from the livelists
 	 * that are part of this metaslab.
 	 */
 	zfs_btree_t mv_livelist_allocs;
 
 	/*
 	 * Metaslab information.
 	 */
 	uint64_t mv_vdid;
 	uint64_t mv_msid;
 	uint64_t mv_start;
 	uint64_t mv_end;
 
 	/*
 	 * What's currently allocated for this metaslab.
 	 */
 	range_tree_t *mv_allocated;
 } metaslab_verify_t;
 
 typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg);
 
 typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg,
     void *arg);
 
 typedef struct unflushed_iter_cb_arg {
 	spa_t *uic_spa;
 	uint64_t uic_txg;
 	void *uic_arg;
 	zdb_log_sm_cb_t uic_cb;
 } unflushed_iter_cb_arg_t;
 
 static int
 iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg)
 {
 	unflushed_iter_cb_arg_t *uic = arg;
 	return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg));
 }
 
 static void
 iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg)
 {
 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
 		return;
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
 	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
 		space_map_t *sm = NULL;
 		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
 		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
 
 		unflushed_iter_cb_arg_t uic = {
 			.uic_spa = spa,
 			.uic_txg = sls->sls_txg,
 			.uic_arg = arg,
 			.uic_cb = cb
 		};
 		VERIFY0(space_map_iterate(sm, space_map_length(sm),
 		    iterate_through_spacemap_logs_cb, &uic));
 		space_map_close(sm);
 	}
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
 static void
 verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg,
     uint64_t offset, uint64_t size)
 {
 	sublivelist_verify_block_t svb = {{{0}}};
 	DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid);
 	DVA_SET_OFFSET(&svb.svb_dva, offset);
 	DVA_SET_ASIZE(&svb.svb_dva, size);
 	zfs_btree_index_t where;
 	uint64_t end_offset = offset + size;
 
 	/*
 	 *  Look for an exact match for spacemap entry in the livelist entries.
 	 *  Then, look for other livelist entries that fall within the range
 	 *  of the spacemap entry as it may have been condensed
 	 */
 	sublivelist_verify_block_t *found =
 	    zfs_btree_find(&mv->mv_livelist_allocs, &svb, &where);
 	if (found == NULL) {
 		found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where);
 	}
 	for (; found != NULL && DVA_GET_VDEV(&found->svb_dva) == mv->mv_vdid &&
 	    DVA_GET_OFFSET(&found->svb_dva) < end_offset;
 	    found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
 		if (found->svb_allocated_txg <= txg) {
 			(void) printf("ERROR: Livelist ALLOC [%llx:%llx] "
 			    "from TXG %llx FREED at TXG %llx\n",
 			    (u_longlong_t)DVA_GET_OFFSET(&found->svb_dva),
 			    (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva),
 			    (u_longlong_t)found->svb_allocated_txg,
 			    (u_longlong_t)txg);
 		}
 	}
 }
 
 static int
 metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg)
 {
 	metaslab_verify_t *mv = arg;
 	uint64_t offset = sme->sme_offset;
 	uint64_t size = sme->sme_run;
 	uint64_t txg = sme->sme_txg;
 
 	if (sme->sme_type == SM_ALLOC) {
 		if (range_tree_contains(mv->mv_allocated,
 		    offset, size)) {
 			(void) printf("ERROR: DOUBLE ALLOC: "
 			    "%llu [%llx:%llx] "
 			    "%llu:%llu LOG_SM\n",
 			    (u_longlong_t)txg, (u_longlong_t)offset,
 			    (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
 			    (u_longlong_t)mv->mv_msid);
 		} else {
 			range_tree_add(mv->mv_allocated,
 			    offset, size);
 		}
 	} else {
 		if (!range_tree_contains(mv->mv_allocated,
 		    offset, size)) {
 			(void) printf("ERROR: DOUBLE FREE: "
 			    "%llu [%llx:%llx] "
 			    "%llu:%llu LOG_SM\n",
 			    (u_longlong_t)txg, (u_longlong_t)offset,
 			    (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
 			    (u_longlong_t)mv->mv_msid);
 		} else {
 			range_tree_remove(mv->mv_allocated,
 			    offset, size);
 		}
 	}
 
 	if (sme->sme_type != SM_ALLOC) {
 		/*
 		 * If something is freed in the spacemap, verify that
 		 * it is not listed as allocated in the livelist.
 		 */
 		verify_livelist_allocs(mv, txg, offset, size);
 	}
 	return (0);
 }
 
 static int
 spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme,
     uint64_t txg, void *arg)
 {
 	metaslab_verify_t *mv = arg;
 	uint64_t offset = sme->sme_offset;
 	uint64_t vdev_id = sme->sme_vdev;
 
 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
 
 	/* skip indirect vdevs */
 	if (!vdev_is_concrete(vd))
 		return (0);
 
 	if (vdev_id != mv->mv_vdid)
 		return (0);
 
 	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 	if (ms->ms_id != mv->mv_msid)
 		return (0);
 
 	if (txg < metaslab_unflushed_txg(ms))
 		return (0);
 
 
 	ASSERT3U(txg, ==, sme->sme_txg);
 	return (metaslab_spacemap_validation_cb(sme, mv));
 }
 
 static void
 spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv)
 {
 	iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv);
 }
 
 static void
 spacemap_check_ms_sm(space_map_t  *sm, metaslab_verify_t *mv)
 {
 	if (sm == NULL)
 		return;
 
 	VERIFY0(space_map_iterate(sm, space_map_length(sm),
 	    metaslab_spacemap_validation_cb, mv));
 }
 
 static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg);
 
 /*
  * Transfer blocks from sv_leftover tree to the mv_livelist_allocs if
  * they are part of that metaslab (mv_msid).
  */
 static void
 mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)
 {
 	zfs_btree_index_t where;
 	sublivelist_verify_block_t *svb;
 	ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0);
 	for (svb = zfs_btree_first(&sv->sv_leftover, &where);
 	    svb != NULL;
 	    svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) {
 		if (DVA_GET_VDEV(&svb->svb_dva) != mv->mv_vdid)
 			continue;
 
 		if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start &&
 		    (DVA_GET_OFFSET(&svb->svb_dva) +
 		    DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_start) {
 			(void) printf("ERROR: Found block that crosses "
 			    "metaslab boundary: <%llu:%llx:%llx>\n",
 			    (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
 			    (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
 			    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
 			continue;
 		}
 
 		if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start)
 			continue;
 
 		if (DVA_GET_OFFSET(&svb->svb_dva) >= mv->mv_end)
 			continue;
 
 		if ((DVA_GET_OFFSET(&svb->svb_dva) +
 		    DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_end) {
 			(void) printf("ERROR: Found block that crosses "
 			    "metaslab boundary: <%llu:%llx:%llx>\n",
 			    (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
 			    (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
 			    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
 			continue;
 		}
 
 		zfs_btree_add(&mv->mv_livelist_allocs, svb);
 	}
 
 	for (svb = zfs_btree_first(&mv->mv_livelist_allocs, &where);
 	    svb != NULL;
 	    svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
 		zfs_btree_remove(&sv->sv_leftover, svb);
 	}
 }
 
 /*
  * [Livelist Check]
  * Iterate through all the sublivelists and:
  * - report leftover frees (**)
  * - record leftover ALLOCs together with their TXG [see Cross Check]
  *
  * (**) Note: Double ALLOCs are valid in datasets that have dedup
  *      enabled. Similarly double FREEs are allowed as well but
  *      only if they pair up with a corresponding ALLOC entry once
  *      we our done with our sublivelist iteration.
  *
  * [Spacemap Check]
  * for each metaslab:
  * - iterate over spacemap and then the metaslab's entries in the
  *   spacemap log, then report any double FREEs and ALLOCs (do not
  *   blow up).
  *
  * [Cross Check]
  * After finishing the Livelist Check phase and while being in the
  * Spacemap Check phase, we find all the recorded leftover ALLOCs
  * of the livelist check that are part of the metaslab that we are
  * currently looking at in the Spacemap Check. We report any entries
  * that are marked as ALLOCs in the livelists but have been actually
  * freed (and potentially allocated again) after their TXG stamp in
  * the spacemaps. Also report any ALLOCs from the livelists that
  * belong to indirect vdevs (e.g. their vdev completed removal).
  *
  * Note that this will miss Log Spacemap entries that cancelled each other
  * out before being flushed to the metaslab, so we are not guaranteed
  * to match all erroneous ALLOCs.
  */
 static void
 livelist_metaslab_validate(spa_t *spa)
 {
 	(void) printf("Verifying deleted livelist entries\n");
 
 	sublivelist_verify_t sv;
 	zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
 	    sizeof (sublivelist_verify_block_t));
 	iterate_deleted_livelists(spa, livelist_verify, &sv);
 
 	(void) printf("Verifying metaslab entries\n");
 	vdev_t *rvd = spa->spa_root_vdev;
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
 
 		if (!vdev_is_concrete(vd))
 			continue;
 
 		for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) {
 			metaslab_t *m = vd->vdev_ms[mid];
 
 			(void) fprintf(stderr,
 			    "\rverifying concrete vdev %llu, "
 			    "metaslab %llu of %llu ...",
 			    (longlong_t)vd->vdev_id,
 			    (longlong_t)mid,
 			    (longlong_t)vd->vdev_ms_count);
 
 			uint64_t shift, start;
 			range_seg_type_t type =
 			    metaslab_calculate_range_tree_type(vd, m,
 			    &start, &shift);
 			metaslab_verify_t mv;
 			mv.mv_allocated = range_tree_create(NULL,
 			    type, NULL, start, shift);
 			mv.mv_vdid = vd->vdev_id;
 			mv.mv_msid = m->ms_id;
 			mv.mv_start = m->ms_start;
 			mv.mv_end = m->ms_start + m->ms_size;
 			zfs_btree_create(&mv.mv_livelist_allocs,
 			    livelist_block_compare, NULL,
 			    sizeof (sublivelist_verify_block_t));
 
 			mv_populate_livelist_allocs(&mv, &sv);
 
 			spacemap_check_ms_sm(m->ms_sm, &mv);
 			spacemap_check_sm_log(spa, &mv);
 
 			range_tree_vacate(mv.mv_allocated, NULL, NULL);
 			range_tree_destroy(mv.mv_allocated);
 			zfs_btree_clear(&mv.mv_livelist_allocs);
 			zfs_btree_destroy(&mv.mv_livelist_allocs);
 		}
 	}
 	(void) fprintf(stderr, "\n");
 
 	/*
 	 * If there are any segments in the leftover tree after we walked
 	 * through all the metaslabs in the concrete vdevs then this means
 	 * that we have segments in the livelists that belong to indirect
 	 * vdevs and are marked as allocated.
 	 */
 	if (zfs_btree_numnodes(&sv.sv_leftover) == 0) {
 		zfs_btree_destroy(&sv.sv_leftover);
 		return;
 	}
 	(void) printf("ERROR: Found livelist blocks marked as allocated "
 	    "for indirect vdevs:\n");
 
 	zfs_btree_index_t *where = NULL;
 	sublivelist_verify_block_t *svb;
 	while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) !=
 	    NULL) {
 		int vdev_id = DVA_GET_VDEV(&svb->svb_dva);
 		ASSERT3U(vdev_id, <, rvd->vdev_children);
 		vdev_t *vd = rvd->vdev_child[vdev_id];
 		ASSERT(!vdev_is_concrete(vd));
 		(void) printf("<%d:%llx:%llx> TXG %llx\n",
 		    vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
 		    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva),
 		    (u_longlong_t)svb->svb_allocated_txg);
 	}
 	(void) printf("\n");
 	zfs_btree_destroy(&sv.sv_leftover);
 }
 
 /*
  * These libumem hooks provide a reasonable set of defaults for the allocator's
  * debugging facilities.
  */
 const char *
 _umem_debug_init(void)
 {
 	return ("default,verbose"); /* $UMEM_DEBUG setting */
 }
 
 const char *
 _umem_logging_init(void)
 {
 	return ("fail,contents"); /* $UMEM_LOGGING setting */
 }
 
 static void
 usage(void)
 {
 	(void) fprintf(stderr,
 	    "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p <path> ...]] "
 	    "[-I <inflight I/Os>]\n"
 	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
 	    "\t\t[-K <key>]\n"
 	    "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
 	    "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>] [-K <key>]\n"
 	    "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]\n"
 	    "\t%s -B [-e [-V] [-p <path> ...]] [-I <inflight I/Os>]\n"
 	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
 	    "\t\t[-K <key>] <poolname>/<objset id> [<backupflags>]\n"
 	    "\t%s [-v] <bookmark>\n"
 	    "\t%s -C [-A] [-U <cache>] [<poolname>]\n"
 	    "\t%s -l [-Aqu] <device>\n"
 	    "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
 	    "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
 	    "\t%s -O [-K <key>] <dataset> <path>\n"
 	    "\t%s -r [-K <key>] <dataset> <path> <destination>\n"
 	    "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
 	    "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
 	    "\t%s -E [-A] word0:word1:...:word15\n"
 	    "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
 	    "<poolname>\n\n",
 	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
 	    cmdname, cmdname, cmdname, cmdname, cmdname);
 
 	(void) fprintf(stderr, "    Dataset name must include at least one "
 	    "separator character '/' or '@'\n");
 	(void) fprintf(stderr, "    If dataset name is specified, only that "
 	    "dataset is dumped\n");
 	(void) fprintf(stderr,  "    If object numbers or object number "
 	    "ranges are specified, only those\n"
 	    "    objects or ranges are dumped.\n\n");
 	(void) fprintf(stderr,
 	    "    Object ranges take the form <start>:<end>[:<flags>]\n"
 	    "        start    Starting object number\n"
 	    "        end      Ending object number, or -1 for no upper bound\n"
 	    "        flags    Optional flags to select object types:\n"
 	    "            A     All objects (this is the default)\n"
 	    "            d     ZFS directories\n"
 	    "            f     ZFS files \n"
 	    "            m     SPA space maps\n"
 	    "            z     ZAPs\n"
 	    "            -     Negate effect of next flag\n\n");
 	(void) fprintf(stderr, "    Options to control amount of output:\n");
 	(void) fprintf(stderr, "        -b --block-stats             "
 	    "block statistics\n");
 	(void) fprintf(stderr, "        -B --backup                  "
 	    "backup stream\n");
 	(void) fprintf(stderr, "        -c --checksum                "
 	    "checksum all metadata (twice for all data) blocks\n");
 	(void) fprintf(stderr, "        -C --config                  "
 	    "config (or cachefile if alone)\n");
 	(void) fprintf(stderr, "        -d --datasets                "
 	    "dataset(s)\n");
 	(void) fprintf(stderr, "        -D --dedup-stats             "
 	    "dedup statistics\n");
 	(void) fprintf(stderr, "        -E --embedded-block-pointer=INTEGER\n"
 	    "                                     decode and display block "
 	    "from an embedded block pointer\n");
 	(void) fprintf(stderr, "        -h --history                 "
 	    "pool history\n");
 	(void) fprintf(stderr, "        -i --intent-logs             "
 	    "intent logs\n");
 	(void) fprintf(stderr, "        -l --label                   "
 	    "read label contents\n");
 	(void) fprintf(stderr, "        -k --checkpointed-state      "
 	    "examine the checkpointed state of the pool\n");
 	(void) fprintf(stderr, "        -L --disable-leak-tracking   "
 	    "disable leak tracking (do not load spacemaps)\n");
 	(void) fprintf(stderr, "        -m --metaslabs               "
 	    "metaslabs\n");
 	(void) fprintf(stderr, "        -M --metaslab-groups         "
 	    "metaslab groups\n");
 	(void) fprintf(stderr, "        -O --object-lookups          "
 	    "perform object lookups by path\n");
 	(void) fprintf(stderr, "        -r --copy-object             "
 	    "copy an object by path to file\n");
 	(void) fprintf(stderr, "        -R --read-block              "
 	    "read and display block from a device\n");
 	(void) fprintf(stderr, "        -s --io-stats                "
 	    "report stats on zdb's I/O\n");
 	(void) fprintf(stderr, "        -S --simulate-dedup          "
 	    "simulate dedup to measure effect\n");
 	(void) fprintf(stderr, "        -v --verbose                 "
 	    "verbose (applies to all others)\n");
 	(void) fprintf(stderr, "        -y --livelist                "
 	    "perform livelist and metaslab validation on any livelists being "
 	    "deleted\n\n");
 	(void) fprintf(stderr, "    Below options are intended for use "
 	    "with other options:\n");
 	(void) fprintf(stderr, "        -A --ignore-assertions       "
 	    "ignore assertions (-A), enable panic recovery (-AA) or both "
 	    "(-AAA)\n");
 	(void) fprintf(stderr, "        -e --exported                "
 	    "pool is exported/destroyed/has altroot/not in a cachefile\n");
 	(void) fprintf(stderr, "        -F --automatic-rewind        "
 	    "attempt automatic rewind within safe range of transaction "
 	    "groups\n");
 	(void) fprintf(stderr, "        -G --dump-debug-msg          "
 	    "dump zfs_dbgmsg buffer before exiting\n");
 	(void) fprintf(stderr, "        -I --inflight=INTEGER        "
 	    "specify the maximum number of checksumming I/Os "
 	    "[default is 200]\n");
 	(void) fprintf(stderr, "        -K --key=KEY                 "
 	    "decryption key for encrypted dataset\n");
 	(void) fprintf(stderr, "        -o --option=\"OPTION=INTEGER\" "
 	    "set global variable to an unsigned 32-bit integer\n");
 	(void) fprintf(stderr, "        -p --path==PATH              "
 	    "use one or more with -e to specify path to vdev dir\n");
 	(void) fprintf(stderr, "        -P --parseable               "
 	    "print numbers in parseable form\n");
 	(void) fprintf(stderr, "        -q --skip-label              "
 	    "don't print label contents\n");
 	(void) fprintf(stderr, "        -t --txg=INTEGER             "
 	    "highest txg to use when searching for uberblocks\n");
 	(void) fprintf(stderr, "        -T --brt-stats               "
 	    "BRT statistics\n");
 	(void) fprintf(stderr, "        -u --uberblock               "
 	    "uberblock\n");
 	(void) fprintf(stderr, "        -U --cachefile=PATH          "
 	    "use alternate cachefile\n");
 	(void) fprintf(stderr, "        -V --verbatim                "
 	    "do verbatim import\n");
 	(void) fprintf(stderr, "        -x --dump-blocks=PATH        "
 	    "dump all read blocks into specified directory\n");
 	(void) fprintf(stderr, "        -X --extreme-rewind          "
 	    "attempt extreme rewind (does not work with dataset)\n");
 	(void) fprintf(stderr, "        -Y --all-reconstruction      "
 	    "attempt all reconstruction combinations for split blocks\n");
 	(void) fprintf(stderr, "        -Z --zstd-headers            "
 	    "show ZSTD headers \n");
 	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
 	    "to make only that option verbose\n");
 	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
 	exit(1);
 }
 
 static void
 dump_debug_buffer(void)
 {
 	if (dump_opt['G']) {
 		(void) printf("\n");
 		(void) fflush(stdout);
 		zfs_dbgmsg_print("zdb");
 	}
 }
 
 /*
  * Called for usage errors that are discovered after a call to spa_open(),
  * dmu_bonus_hold(), or pool_match().  abort() is called for other errors.
  */
 
 static void
 fatal(const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
 	(void) fprintf(stderr, "%s: ", cmdname);
 	(void) vfprintf(stderr, fmt, ap);
 	va_end(ap);
 	(void) fprintf(stderr, "\n");
 
 	dump_debug_buffer();
 
 	exit(1);
 }
 
 static void
 dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) size;
 	nvlist_t *nv;
 	size_t nvsize = *(uint64_t *)data;
 	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
 
 	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));
 
 	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
 
 	umem_free(packed, nvsize);
 
 	dump_nvlist(nv, 8);
 
 	nvlist_free(nv);
 }
 
 static void
 dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os, (void) object, (void) size;
 	spa_history_phys_t *shp = data;
 
 	if (shp == NULL)
 		return;
 
 	(void) printf("\t\tpool_create_len = %llu\n",
 	    (u_longlong_t)shp->sh_pool_create_len);
 	(void) printf("\t\tphys_max_off = %llu\n",
 	    (u_longlong_t)shp->sh_phys_max_off);
 	(void) printf("\t\tbof = %llu\n",
 	    (u_longlong_t)shp->sh_bof);
 	(void) printf("\t\teof = %llu\n",
 	    (u_longlong_t)shp->sh_eof);
 	(void) printf("\t\trecords_lost = %llu\n",
 	    (u_longlong_t)shp->sh_records_lost);
 }
 
 static void
 zdb_nicenum(uint64_t num, char *buf, size_t buflen)
 {
 	if (dump_opt['P'])
 		(void) snprintf(buf, buflen, "%llu", (longlong_t)num);
 	else
 		nicenum(num, buf, buflen);
 }
 
 static void
 zdb_nicebytes(uint64_t bytes, char *buf, size_t buflen)
 {
 	if (dump_opt['P'])
 		(void) snprintf(buf, buflen, "%llu", (longlong_t)bytes);
 	else
 		zfs_nicebytes(bytes, buf, buflen);
 }
 
 static const char histo_stars[] = "****************************************";
 static const uint64_t histo_width = sizeof (histo_stars) - 1;
 
 static void
 dump_histogram(const uint64_t *histo, int size, int offset)
 {
 	int i;
 	int minidx = size - 1;
 	int maxidx = 0;
 	uint64_t max = 0;
 
 	for (i = 0; i < size; i++) {
 		if (histo[i] == 0)
 			continue;
 		if (histo[i] > max)
 			max = histo[i];
 		if (i > maxidx)
 			maxidx = i;
 		if (i < minidx)
 			minidx = i;
 	}
 
 	if (max < histo_width)
 		max = histo_width;
 
 	for (i = minidx; i <= maxidx; i++) {
 		(void) printf("\t\t\t%3u: %6llu %s\n",
 		    i + offset, (u_longlong_t)histo[i],
 		    &histo_stars[(max - histo[i]) * histo_width / max]);
 	}
 }
 
 static void
 dump_zap_stats(objset_t *os, uint64_t object)
 {
 	int error;
 	zap_stats_t zs;
 
 	error = zap_get_stats(os, object, &zs);
 	if (error)
 		return;
 
 	if (zs.zs_ptrtbl_len == 0) {
 		ASSERT(zs.zs_num_blocks == 1);
 		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
 		    (u_longlong_t)zs.zs_blocksize,
 		    (u_longlong_t)zs.zs_num_entries);
 		return;
 	}
 
 	(void) printf("\tFat ZAP stats:\n");
 
 	(void) printf("\t\tPointer table:\n");
 	(void) printf("\t\t\t%llu elements\n",
 	    (u_longlong_t)zs.zs_ptrtbl_len);
 	(void) printf("\t\t\tzt_blk: %llu\n",
 	    (u_longlong_t)zs.zs_ptrtbl_zt_blk);
 	(void) printf("\t\t\tzt_numblks: %llu\n",
 	    (u_longlong_t)zs.zs_ptrtbl_zt_numblks);
 	(void) printf("\t\t\tzt_shift: %llu\n",
 	    (u_longlong_t)zs.zs_ptrtbl_zt_shift);
 	(void) printf("\t\t\tzt_blks_copied: %llu\n",
 	    (u_longlong_t)zs.zs_ptrtbl_blks_copied);
 	(void) printf("\t\t\tzt_nextblk: %llu\n",
 	    (u_longlong_t)zs.zs_ptrtbl_nextblk);
 
 	(void) printf("\t\tZAP entries: %llu\n",
 	    (u_longlong_t)zs.zs_num_entries);
 	(void) printf("\t\tLeaf blocks: %llu\n",
 	    (u_longlong_t)zs.zs_num_leafs);
 	(void) printf("\t\tTotal blocks: %llu\n",
 	    (u_longlong_t)zs.zs_num_blocks);
 	(void) printf("\t\tzap_block_type: 0x%llx\n",
 	    (u_longlong_t)zs.zs_block_type);
 	(void) printf("\t\tzap_magic: 0x%llx\n",
 	    (u_longlong_t)zs.zs_magic);
 	(void) printf("\t\tzap_salt: 0x%llx\n",
 	    (u_longlong_t)zs.zs_salt);
 
 	(void) printf("\t\tLeafs with 2^n pointers:\n");
 	dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);
 
 	(void) printf("\t\tBlocks with n*5 entries:\n");
 	dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);
 
 	(void) printf("\t\tBlocks n/10 full:\n");
 	dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);
 
 	(void) printf("\t\tEntries with n chunks:\n");
 	dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);
 
 	(void) printf("\t\tBuckets with n entries:\n");
 	dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
 }
 
 static void
 dump_none(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os, (void) object, (void) data, (void) size;
 }
 
 static void
 dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os, (void) object, (void) data, (void) size;
 	(void) printf("\tUNKNOWN OBJECT TYPE\n");
 }
 
 static void
 dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os, (void) object, (void) data, (void) size;
 }
 
 static void
 dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	uint64_t *arr;
 	uint64_t oursize;
 	if (dump_opt['d'] < 6)
 		return;
 
 	if (data == NULL) {
 		dmu_object_info_t doi;
 
 		VERIFY0(dmu_object_info(os, object, &doi));
 		size = doi.doi_max_offset;
 		/*
 		 * We cap the size at 1 mebibyte here to prevent
 		 * allocation failures and nigh-infinite printing if the
 		 * object is extremely large.
 		 */
 		oursize = MIN(size, 1 << 20);
 		arr = kmem_alloc(oursize, KM_SLEEP);
 
 		int err = dmu_read(os, object, 0, oursize, arr, 0);
 		if (err != 0) {
 			(void) printf("got error %u from dmu_read\n", err);
 			kmem_free(arr, oursize);
 			return;
 		}
 	} else {
 		/*
 		 * Even though the allocation is already done in this code path,
 		 * we still cap the size to prevent excessive printing.
 		 */
 		oursize = MIN(size, 1 << 20);
 		arr = data;
 	}
 
 	if (size == 0) {
 		if (data == NULL)
 			kmem_free(arr, oursize);
 		(void) printf("\t\t[]\n");
 		return;
 	}
 
 	(void) printf("\t\t[%0llx", (u_longlong_t)arr[0]);
 	for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) {
 		if (i % 4 != 0)
 			(void) printf(", %0llx", (u_longlong_t)arr[i]);
 		else
 			(void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]);
 	}
 	if (oursize != size)
 		(void) printf(", ... ");
 	(void) printf("]\n");
 
 	if (data == NULL)
 		kmem_free(arr, oursize);
 }
 
 static void
 dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) data, (void) size;
 	zap_cursor_t zc;
 	zap_attribute_t attr;
 	void *prop;
 	unsigned i;
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	for (zap_cursor_init(&zc, os, object);
 	    zap_cursor_retrieve(&zc, &attr) == 0;
 	    zap_cursor_advance(&zc)) {
 		(void) printf("\t\t%s = ", attr.za_name);
 		if (attr.za_num_integers == 0) {
 			(void) printf("\n");
 			continue;
 		}
 		prop = umem_zalloc(attr.za_num_integers *
 		    attr.za_integer_length, UMEM_NOFAIL);
 		(void) zap_lookup(os, object, attr.za_name,
 		    attr.za_integer_length, attr.za_num_integers, prop);
 		if (attr.za_integer_length == 1) {
 			if (strcmp(attr.za_name,
 			    DSL_CRYPTO_KEY_MASTER_KEY) == 0 ||
 			    strcmp(attr.za_name,
 			    DSL_CRYPTO_KEY_HMAC_KEY) == 0 ||
 			    strcmp(attr.za_name, DSL_CRYPTO_KEY_IV) == 0 ||
 			    strcmp(attr.za_name, DSL_CRYPTO_KEY_MAC) == 0 ||
 			    strcmp(attr.za_name, DMU_POOL_CHECKSUM_SALT) == 0) {
 				uint8_t *u8 = prop;
 
 				for (i = 0; i < attr.za_num_integers; i++) {
 					(void) printf("%02x", u8[i]);
 				}
 			} else {
 				(void) printf("%s", (char *)prop);
 			}
 		} else {
 			for (i = 0; i < attr.za_num_integers; i++) {
 				switch (attr.za_integer_length) {
 				case 2:
 					(void) printf("%u ",
 					    ((uint16_t *)prop)[i]);
 					break;
 				case 4:
 					(void) printf("%u ",
 					    ((uint32_t *)prop)[i]);
 					break;
 				case 8:
 					(void) printf("%lld ",
 					    (u_longlong_t)((int64_t *)prop)[i]);
 					break;
 				}
 			}
 		}
 		(void) printf("\n");
 		umem_free(prop, attr.za_num_integers * attr.za_integer_length);
 	}
 	zap_cursor_fini(&zc);
 }
 
 static void
 dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	bpobj_phys_t *bpop = data;
 	uint64_t i;
 	char bytes[32], comp[32], uncomp[32];
 
 	/* make sure the output won't get truncated */
 	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
 	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
 	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");
 
 	if (bpop == NULL)
 		return;
 
 	zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes));
 	zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp));
 	zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp));
 
 	(void) printf("\t\tnum_blkptrs = %llu\n",
 	    (u_longlong_t)bpop->bpo_num_blkptrs);
 	(void) printf("\t\tbytes = %s\n", bytes);
 	if (size >= BPOBJ_SIZE_V1) {
 		(void) printf("\t\tcomp = %s\n", comp);
 		(void) printf("\t\tuncomp = %s\n", uncomp);
 	}
 	if (size >= BPOBJ_SIZE_V2) {
 		(void) printf("\t\tsubobjs = %llu\n",
 		    (u_longlong_t)bpop->bpo_subobjs);
 		(void) printf("\t\tnum_subobjs = %llu\n",
 		    (u_longlong_t)bpop->bpo_num_subobjs);
 	}
 	if (size >= sizeof (*bpop)) {
 		(void) printf("\t\tnum_freed = %llu\n",
 		    (u_longlong_t)bpop->bpo_num_freed);
 	}
 
 	if (dump_opt['d'] < 5)
 		return;
 
 	for (i = 0; i < bpop->bpo_num_blkptrs; i++) {
 		char blkbuf[BP_SPRINTF_LEN];
 		blkptr_t bp;
 
 		int err = dmu_read(os, object,
 		    i * sizeof (bp), sizeof (bp), &bp, 0);
 		if (err != 0) {
 			(void) printf("got error %u from dmu_read\n", err);
 			break;
 		}
 		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp,
 		    BP_GET_FREE(&bp));
 		(void) printf("\t%s\n", blkbuf);
 	}
 }
 
 static void
 dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) data, (void) size;
 	dmu_object_info_t doi;
 	int64_t i;
 
 	VERIFY0(dmu_object_info(os, object, &doi));
 	uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);
 
 	int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
 	if (err != 0) {
 		(void) printf("got error %u from dmu_read\n", err);
 		kmem_free(subobjs, doi.doi_max_offset);
 		return;
 	}
 
 	int64_t last_nonzero = -1;
 	for (i = 0; i < doi.doi_max_offset / 8; i++) {
 		if (subobjs[i] != 0)
 			last_nonzero = i;
 	}
 
 	for (i = 0; i <= last_nonzero; i++) {
 		(void) printf("\t%llu\n", (u_longlong_t)subobjs[i]);
 	}
 	kmem_free(subobjs, doi.doi_max_offset);
 }
 
 static void
 dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) data, (void) size;
 	dump_zap_stats(os, object);
 	/* contents are printed elsewhere, properly decoded */
 }
 
 static void
 dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) data, (void) size;
 	zap_cursor_t zc;
 	zap_attribute_t attr;
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	for (zap_cursor_init(&zc, os, object);
 	    zap_cursor_retrieve(&zc, &attr) == 0;
 	    zap_cursor_advance(&zc)) {
 		(void) printf("\t\t%s = ", attr.za_name);
 		if (attr.za_num_integers == 0) {
 			(void) printf("\n");
 			continue;
 		}
 		(void) printf(" %llx : [%d:%d:%d]\n",
 		    (u_longlong_t)attr.za_first_integer,
 		    (int)ATTR_LENGTH(attr.za_first_integer),
 		    (int)ATTR_BSWAP(attr.za_first_integer),
 		    (int)ATTR_NUM(attr.za_first_integer));
 	}
 	zap_cursor_fini(&zc);
 }
 
 static void
 dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) data, (void) size;
 	zap_cursor_t zc;
 	zap_attribute_t attr;
 	uint16_t *layout_attrs;
 	unsigned i;
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	for (zap_cursor_init(&zc, os, object);
 	    zap_cursor_retrieve(&zc, &attr) == 0;
 	    zap_cursor_advance(&zc)) {
 		(void) printf("\t\t%s = [", attr.za_name);
 		if (attr.za_num_integers == 0) {
 			(void) printf("\n");
 			continue;
 		}
 
 		VERIFY(attr.za_integer_length == 2);
 		layout_attrs = umem_zalloc(attr.za_num_integers *
 		    attr.za_integer_length, UMEM_NOFAIL);
 
 		VERIFY(zap_lookup(os, object, attr.za_name,
 		    attr.za_integer_length,
 		    attr.za_num_integers, layout_attrs) == 0);
 
 		for (i = 0; i != attr.za_num_integers; i++)
 			(void) printf(" %d ", (int)layout_attrs[i]);
 		(void) printf("]\n");
 		umem_free(layout_attrs,
 		    attr.za_num_integers * attr.za_integer_length);
 	}
 	zap_cursor_fini(&zc);
 }
 
 static void
 dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) data, (void) size;
 	zap_cursor_t zc;
 	zap_attribute_t attr;
 	const char *typenames[] = {
 		/* 0 */ "not specified",
 		/* 1 */ "FIFO",
 		/* 2 */ "Character Device",
 		/* 3 */ "3 (invalid)",
 		/* 4 */ "Directory",
 		/* 5 */ "5 (invalid)",
 		/* 6 */ "Block Device",
 		/* 7 */ "7 (invalid)",
 		/* 8 */ "Regular File",
 		/* 9 */ "9 (invalid)",
 		/* 10 */ "Symbolic Link",
 		/* 11 */ "11 (invalid)",
 		/* 12 */ "Socket",
 		/* 13 */ "Door",
 		/* 14 */ "Event Port",
 		/* 15 */ "15 (invalid)",
 	};
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	for (zap_cursor_init(&zc, os, object);
 	    zap_cursor_retrieve(&zc, &attr) == 0;
 	    zap_cursor_advance(&zc)) {
 		(void) printf("\t\t%s = %lld (type: %s)\n",
 		    attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer),
 		    typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
 	}
 	zap_cursor_fini(&zc);
 }
 
 static int
 get_dtl_refcount(vdev_t *vd)
 {
 	int refcount = 0;
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		space_map_t *sm = vd->vdev_dtl_sm;
 
 		if (sm != NULL &&
 		    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
 			return (1);
 		return (0);
 	}
 
 	for (unsigned c = 0; c < vd->vdev_children; c++)
 		refcount += get_dtl_refcount(vd->vdev_child[c]);
 	return (refcount);
 }
 
 static int
 get_metaslab_refcount(vdev_t *vd)
 {
 	int refcount = 0;
 
 	if (vd->vdev_top == vd) {
 		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 			space_map_t *sm = vd->vdev_ms[m]->ms_sm;
 
 			if (sm != NULL &&
 			    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
 				refcount++;
 		}
 	}
 	for (unsigned c = 0; c < vd->vdev_children; c++)
 		refcount += get_metaslab_refcount(vd->vdev_child[c]);
 
 	return (refcount);
 }
 
 static int
 get_obsolete_refcount(vdev_t *vd)
 {
 	uint64_t obsolete_sm_object;
 	int refcount = 0;
 
 	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
 	if (vd->vdev_top == vd && obsolete_sm_object != 0) {
 		dmu_object_info_t doi;
 		VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset,
 		    obsolete_sm_object, &doi));
 		if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
 			refcount++;
 		}
 	} else {
 		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
 		ASSERT3U(obsolete_sm_object, ==, 0);
 	}
 	for (unsigned c = 0; c < vd->vdev_children; c++) {
 		refcount += get_obsolete_refcount(vd->vdev_child[c]);
 	}
 
 	return (refcount);
 }
 
 static int
 get_prev_obsolete_spacemap_refcount(spa_t *spa)
 {
 	uint64_t prev_obj =
 	    spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object;
 	if (prev_obj != 0) {
 		dmu_object_info_t doi;
 		VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi));
 		if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
 			return (1);
 		}
 	}
 	return (0);
 }
 
 static int
 get_checkpoint_refcount(vdev_t *vd)
 {
 	int refcount = 0;
 
 	if (vd->vdev_top == vd && vd->vdev_top_zap != 0 &&
 	    zap_contains(spa_meta_objset(vd->vdev_spa),
 	    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0)
 		refcount++;
 
 	for (uint64_t c = 0; c < vd->vdev_children; c++)
 		refcount += get_checkpoint_refcount(vd->vdev_child[c]);
 
 	return (refcount);
 }
 
 static int
 get_log_spacemap_refcount(spa_t *spa)
 {
 	return (avl_numnodes(&spa->spa_sm_logs_by_txg));
 }
 
 static int
 verify_spacemap_refcounts(spa_t *spa)
 {
 	uint64_t expected_refcount = 0;
 	uint64_t actual_refcount;
 
 	(void) feature_get_refcount(spa,
 	    &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
 	    &expected_refcount);
 	actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
 	actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
 	actual_refcount += get_obsolete_refcount(spa->spa_root_vdev);
 	actual_refcount += get_prev_obsolete_spacemap_refcount(spa);
 	actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev);
 	actual_refcount += get_log_spacemap_refcount(spa);
 
 	if (expected_refcount != actual_refcount) {
 		(void) printf("space map refcount mismatch: expected %lld != "
 		    "actual %lld\n",
 		    (longlong_t)expected_refcount,
 		    (longlong_t)actual_refcount);
 		return (2);
 	}
 	return (0);
 }
 
 static void
 dump_spacemap(objset_t *os, space_map_t *sm)
 {
 	const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
 	    "INVALID", "INVALID", "INVALID", "INVALID" };
 
 	if (sm == NULL)
 		return;
 
 	(void) printf("space map object %llu:\n",
 	    (longlong_t)sm->sm_object);
 	(void) printf("  smp_length = 0x%llx\n",
 	    (longlong_t)sm->sm_phys->smp_length);
 	(void) printf("  smp_alloc = 0x%llx\n",
 	    (longlong_t)sm->sm_phys->smp_alloc);
 
 	if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
 		return;
 
 	/*
 	 * Print out the freelist entries in both encoded and decoded form.
 	 */
 	uint8_t mapshift = sm->sm_shift;
 	int64_t alloc = 0;
 	uint64_t word, entry_id = 0;
 	for (uint64_t offset = 0; offset < space_map_length(sm);
 	    offset += sizeof (word)) {
 
 		VERIFY0(dmu_read(os, space_map_object(sm), offset,
 		    sizeof (word), &word, DMU_READ_PREFETCH));
 
 		if (sm_entry_is_debug(word)) {
 			uint64_t de_txg = SM_DEBUG_TXG_DECODE(word);
 			uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word);
 			if (de_txg == 0) {
 				(void) printf(
 				    "\t    [%6llu] PADDING\n",
 				    (u_longlong_t)entry_id);
 			} else {
 				(void) printf(
 				    "\t    [%6llu] %s: txg %llu pass %llu\n",
 				    (u_longlong_t)entry_id,
 				    ddata[SM_DEBUG_ACTION_DECODE(word)],
 				    (u_longlong_t)de_txg,
 				    (u_longlong_t)de_sync_pass);
 			}
 			entry_id++;
 			continue;
 		}
 
 		uint8_t words;
 		char entry_type;
 		uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;
 
 		if (sm_entry_is_single_word(word)) {
 			entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
 			    'A' : 'F';
 			entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
 			    sm->sm_start;
 			entry_run = SM_RUN_DECODE(word) << mapshift;
 			words = 1;
 		} else {
 			/* it is a two-word entry so we read another word */
 			ASSERT(sm_entry_is_double_word(word));
 
 			uint64_t extra_word;
 			offset += sizeof (extra_word);
 			VERIFY0(dmu_read(os, space_map_object(sm), offset,
 			    sizeof (extra_word), &extra_word,
 			    DMU_READ_PREFETCH));
 
 			ASSERT3U(offset, <=, space_map_length(sm));
 
 			entry_run = SM2_RUN_DECODE(word) << mapshift;
 			entry_vdev = SM2_VDEV_DECODE(word);
 			entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
 			    'A' : 'F';
 			entry_off = (SM2_OFFSET_DECODE(extra_word) <<
 			    mapshift) + sm->sm_start;
 			words = 2;
 		}
 
 		(void) printf("\t    [%6llu]    %c  range:"
 		    " %010llx-%010llx  size: %06llx vdev: %06llu words: %u\n",
 		    (u_longlong_t)entry_id,
 		    entry_type, (u_longlong_t)entry_off,
 		    (u_longlong_t)(entry_off + entry_run),
 		    (u_longlong_t)entry_run,
 		    (u_longlong_t)entry_vdev, words);
 
 		if (entry_type == 'A')
 			alloc += entry_run;
 		else
 			alloc -= entry_run;
 		entry_id++;
 	}
 	if (alloc != space_map_allocated(sm)) {
 		(void) printf("space_map_object alloc (%lld) INCONSISTENT "
 		    "with space map summary (%lld)\n",
 		    (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
 	}
 }
 
 static void
 dump_metaslab_stats(metaslab_t *msp)
 {
 	char maxbuf[32];
 	range_tree_t *rt = msp->ms_allocatable;
 	zfs_btree_t *t = &msp->ms_allocatable_by_size;
 	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 
 	/* max sure nicenum has enough space */
 	_Static_assert(sizeof (maxbuf) >= NN_NUMBUF_SZ, "maxbuf truncated");
 
 	zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf));
 
 	(void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
 	    "segments", zfs_btree_numnodes(t), "maxsize", maxbuf,
 	    "freepct", free_pct);
 	(void) printf("\tIn-memory histogram:\n");
 	dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
 }
 
 static void
 dump_metaslab(metaslab_t *msp)
 {
 	vdev_t *vd = msp->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	space_map_t *sm = msp->ms_sm;
 	char freebuf[32];
 
 	zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf,
 	    sizeof (freebuf));
 
 	(void) printf(
 	    "\tmetaslab %6llu   offset %12llx   spacemap %6llu   free    %5s\n",
 	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
 	    (u_longlong_t)space_map_object(sm), freebuf);
 
 	if (dump_opt['m'] > 2 && !dump_opt['L']) {
 		mutex_enter(&msp->ms_lock);
 		VERIFY0(metaslab_load(msp));
 		range_tree_stat_verify(msp->ms_allocatable);
 		dump_metaslab_stats(msp);
 		metaslab_unload(msp);
 		mutex_exit(&msp->ms_lock);
 	}
 
 	if (dump_opt['m'] > 1 && sm != NULL &&
 	    spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
 		/*
 		 * The space map histogram represents free space in chunks
 		 * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
 		 */
 		(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
 		    (u_longlong_t)msp->ms_fragmentation);
 		dump_histogram(sm->sm_phys->smp_histogram,
 		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
 	}
 
 	if (vd->vdev_ops == &vdev_draid_ops)
 		ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift);
 	else
 		ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift);
 
 	dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
 		(void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n",
 		    (u_longlong_t)metaslab_unflushed_txg(msp));
 	}
 }
 
 static void
 print_vdev_metaslab_header(vdev_t *vd)
 {
 	vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
 	const char *bias_str = "";
 	if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) {
 		bias_str = VDEV_ALLOC_BIAS_LOG;
 	} else if (alloc_bias == VDEV_BIAS_SPECIAL) {
 		bias_str = VDEV_ALLOC_BIAS_SPECIAL;
 	} else if (alloc_bias == VDEV_BIAS_DEDUP) {
 		bias_str = VDEV_ALLOC_BIAS_DEDUP;
 	}
 
 	uint64_t ms_flush_data_obj = 0;
 	if (vd->vdev_top_zap != 0) {
 		int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
 		    sizeof (uint64_t), 1, &ms_flush_data_obj);
 		if (error != ENOENT) {
 			ASSERT0(error);
 		}
 	}
 
 	(void) printf("\tvdev %10llu   %s",
 	    (u_longlong_t)vd->vdev_id, bias_str);
 
 	if (ms_flush_data_obj != 0) {
 		(void) printf("   ms_unflushed_phys object %llu",
 		    (u_longlong_t)ms_flush_data_obj);
 	}
 
 	(void) printf("\n\t%-10s%5llu   %-19s   %-15s   %-12s\n",
 	    "metaslabs", (u_longlong_t)vd->vdev_ms_count,
 	    "offset", "spacemap", "free");
 	(void) printf("\t%15s   %19s   %15s   %12s\n",
 	    "---------------", "-------------------",
 	    "---------------", "------------");
 }
 
 static void
 dump_metaslab_groups(spa_t *spa, boolean_t show_special)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	metaslab_class_t *mc = spa_normal_class(spa);
 	metaslab_class_t *smc = spa_special_class(spa);
 	uint64_t fragmentation;
 
 	metaslab_class_histogram_verify(mc);
 
 	for (unsigned c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		metaslab_group_t *mg = tvd->vdev_mg;
 
 		if (mg == NULL || (mg->mg_class != mc &&
 		    (!show_special || mg->mg_class != smc)))
 			continue;
 
 		metaslab_group_histogram_verify(mg);
 		mg->mg_fragmentation = metaslab_group_fragmentation(mg);
 
 		(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
 		    "fragmentation",
 		    (u_longlong_t)tvd->vdev_id,
 		    (u_longlong_t)tvd->vdev_ms_count);
 		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
 			(void) printf("%3s\n", "-");
 		} else {
 			(void) printf("%3llu%%\n",
 			    (u_longlong_t)mg->mg_fragmentation);
 		}
 		dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
 	}
 
 	(void) printf("\tpool %s\tfragmentation", spa_name(spa));
 	fragmentation = metaslab_class_fragmentation(mc);
 	if (fragmentation == ZFS_FRAG_INVALID)
 		(void) printf("\t%3s\n", "-");
 	else
 		(void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation);
 	dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
 }
 
 static void
 print_vdev_indirect(vdev_t *vd)
 {
 	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 	vdev_indirect_births_t *vib = vd->vdev_indirect_births;
 
 	if (vim == NULL) {
 		ASSERT3P(vib, ==, NULL);
 		return;
 	}
 
 	ASSERT3U(vdev_indirect_mapping_object(vim), ==,
 	    vic->vic_mapping_object);
 	ASSERT3U(vdev_indirect_births_object(vib), ==,
 	    vic->vic_births_object);
 
 	(void) printf("indirect births obj %llu:\n",
 	    (longlong_t)vic->vic_births_object);
 	(void) printf("    vib_count = %llu\n",
 	    (longlong_t)vdev_indirect_births_count(vib));
 	for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) {
 		vdev_indirect_birth_entry_phys_t *cur_vibe =
 		    &vib->vib_entries[i];
 		(void) printf("\toffset %llx -> txg %llu\n",
 		    (longlong_t)cur_vibe->vibe_offset,
 		    (longlong_t)cur_vibe->vibe_phys_birth_txg);
 	}
 	(void) printf("\n");
 
 	(void) printf("indirect mapping obj %llu:\n",
 	    (longlong_t)vic->vic_mapping_object);
 	(void) printf("    vim_max_offset = 0x%llx\n",
 	    (longlong_t)vdev_indirect_mapping_max_offset(vim));
 	(void) printf("    vim_bytes_mapped = 0x%llx\n",
 	    (longlong_t)vdev_indirect_mapping_bytes_mapped(vim));
 	(void) printf("    vim_count = %llu\n",
 	    (longlong_t)vdev_indirect_mapping_num_entries(vim));
 
 	if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3)
 		return;
 
 	uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim);
 
 	for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
 		vdev_indirect_mapping_entry_phys_t *vimep =
 		    &vim->vim_entries[i];
 		(void) printf("\t<%llx:%llx:%llx> -> "
 		    "<%llx:%llx:%llx> (%x obsolete)\n",
 		    (longlong_t)vd->vdev_id,
 		    (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
 		    (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
 		    (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst),
 		    (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst),
 		    (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
 		    counts[i]);
 	}
 	(void) printf("\n");
 
 	uint64_t obsolete_sm_object;
 	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
 	if (obsolete_sm_object != 0) {
 		objset_t *mos = vd->vdev_spa->spa_meta_objset;
 		(void) printf("obsolete space map object %llu:\n",
 		    (u_longlong_t)obsolete_sm_object);
 		ASSERT(vd->vdev_obsolete_sm != NULL);
 		ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==,
 		    obsolete_sm_object);
 		dump_spacemap(mos, vd->vdev_obsolete_sm);
 		(void) printf("\n");
 	}
 }
 
 static void
 dump_metaslabs(spa_t *spa)
 {
 	vdev_t *vd, *rvd = spa->spa_root_vdev;
 	uint64_t m, c = 0, children = rvd->vdev_children;
 
 	(void) printf("\nMetaslabs:\n");
 
 	if (!dump_opt['d'] && zopt_metaslab_args > 0) {
 		c = zopt_metaslab[0];
 
 		if (c >= children)
 			(void) fatal("bad vdev id: %llu", (u_longlong_t)c);
 
 		if (zopt_metaslab_args > 1) {
 			vd = rvd->vdev_child[c];
 			print_vdev_metaslab_header(vd);
 
 			for (m = 1; m < zopt_metaslab_args; m++) {
 				if (zopt_metaslab[m] < vd->vdev_ms_count)
 					dump_metaslab(
 					    vd->vdev_ms[zopt_metaslab[m]]);
 				else
 					(void) fprintf(stderr, "bad metaslab "
 					    "number %llu\n",
 					    (u_longlong_t)zopt_metaslab[m]);
 			}
 			(void) printf("\n");
 			return;
 		}
 		children = c + 1;
 	}
 	for (; c < children; c++) {
 		vd = rvd->vdev_child[c];
 		print_vdev_metaslab_header(vd);
 
 		print_vdev_indirect(vd);
 
 		for (m = 0; m < vd->vdev_ms_count; m++)
 			dump_metaslab(vd->vdev_ms[m]);
 		(void) printf("\n");
 	}
 }
 
 static void
 dump_log_spacemaps(spa_t *spa)
 {
 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
 		return;
 
 	(void) printf("\nLog Space Maps in Pool:\n");
 	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
 	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
 		space_map_t *sm = NULL;
 		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
 		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
 
 		(void) printf("Log Spacemap object %llu txg %llu\n",
 		    (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg);
 		dump_spacemap(spa->spa_meta_objset, sm);
 		space_map_close(sm);
 	}
 	(void) printf("\n");
 }
 
 static void
 dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
 {
 	const ddt_phys_t *ddp = dde->dde_phys;
 	const ddt_key_t *ddk = &dde->dde_key;
 	const char *types[4] = { "ditto", "single", "double", "triple" };
 	char blkbuf[BP_SPRINTF_LEN];
 	blkptr_t blk;
 	int p;
 
 	for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 		if (ddp->ddp_phys_birth == 0)
 			continue;
 		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
 		(void) printf("index %llx refcnt %llu %s %s\n",
 		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
 		    types[p], blkbuf);
 	}
 }
 
 static void
 dump_dedup_ratio(const ddt_stat_t *dds)
 {
 	double rL, rP, rD, D, dedup, compress, copies;
 
 	if (dds->dds_blocks == 0)
 		return;
 
 	rL = (double)dds->dds_ref_lsize;
 	rP = (double)dds->dds_ref_psize;
 	rD = (double)dds->dds_ref_dsize;
 	D = (double)dds->dds_dsize;
 
 	dedup = rD / D;
 	compress = rL / rP;
 	copies = rD / rP;
 
 	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
 	    "dedup * compress / copies = %.2f\n\n",
 	    dedup, compress, copies, dedup * compress / copies);
 }
 
 static void
 dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
 {
 	char name[DDT_NAMELEN];
 	ddt_entry_t dde;
 	uint64_t walk = 0;
 	dmu_object_info_t doi;
 	uint64_t count, dspace, mspace;
 	int error;
 
 	error = ddt_object_info(ddt, type, class, &doi);
 
 	if (error == ENOENT)
 		return;
 	ASSERT(error == 0);
 
 	error = ddt_object_count(ddt, type, class, &count);
 	ASSERT(error == 0);
 	if (count == 0)
 		return;
 
 	dspace = doi.doi_physical_blocks_512 << 9;
 	mspace = doi.doi_fill_count * doi.doi_data_block_size;
 
 	ddt_object_name(ddt, type, class, name);
 
 	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
 	    name,
 	    (u_longlong_t)count,
 	    (u_longlong_t)(dspace / count),
 	    (u_longlong_t)(mspace / count));
 
 	if (dump_opt['D'] < 3)
 		return;
 
 	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
 
 	if (dump_opt['D'] < 4)
 		return;
 
 	if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
 		return;
 
 	(void) printf("%s contents:\n\n", name);
 
 	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
 		dump_dde(ddt, &dde, walk);
 
 	ASSERT3U(error, ==, ENOENT);
 
 	(void) printf("\n");
 }
 
 static void
 dump_all_ddts(spa_t *spa)
 {
 	ddt_histogram_t ddh_total = {{{0}}};
 	ddt_stat_t dds_total = {0};
 
 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 		ddt_t *ddt = spa->spa_ddt[c];
 		if (!ddt)
 			continue;
 		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
 			for (ddt_class_t class = 0; class < DDT_CLASSES;
 			    class++) {
 				dump_ddt(ddt, type, class);
 			}
 		}
 	}
 
 	ddt_get_dedup_stats(spa, &dds_total);
 
 	if (dds_total.dds_blocks == 0) {
 		(void) printf("All DDTs are empty\n");
 		return;
 	}
 
 	(void) printf("\n");
 
 	if (dump_opt['D'] > 1) {
 		(void) printf("DDT histogram (aggregated over all DDTs):\n");
 		ddt_get_dedup_histogram(spa, &ddh_total);
 		zpool_dump_ddt(&dds_total, &ddh_total);
 	}
 
 	dump_dedup_ratio(&dds_total);
 }
 
 static void
 dump_brt(spa_t *spa)
 {
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING)) {
 		printf("BRT: unsupported on this pool\n");
 		return;
 	}
 
 	if (!spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
 		printf("BRT: empty\n");
 		return;
 	}
 
 	brt_t *brt = spa->spa_brt;
 	VERIFY(brt);
 
 	char count[32], used[32], saved[32];
 	zdb_nicebytes(brt_get_used(spa), used, sizeof (used));
 	zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved));
 	uint64_t ratio = brt_get_ratio(spa);
 	printf("BRT: used %s; saved %s; ratio %llu.%02llux\n", used, saved,
 	    (u_longlong_t)(ratio / 100), (u_longlong_t)(ratio % 100));
 
 	if (dump_opt['T'] < 2)
 		return;
 
 	for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
 		brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
 		if (brtvd == NULL)
 			continue;
 
 		if (!brtvd->bv_initiated) {
 			printf("BRT: vdev %" PRIu64 ": empty\n", vdevid);
 			continue;
 		}
 
 		zdb_nicenum(brtvd->bv_totalcount, count, sizeof (count));
 		zdb_nicebytes(brtvd->bv_usedspace, used, sizeof (used));
 		zdb_nicebytes(brtvd->bv_savedspace, saved, sizeof (saved));
 		printf("BRT: vdev %" PRIu64 ": refcnt %s; used %s; saved %s\n",
 		    vdevid, count, used, saved);
 	}
 
 	if (dump_opt['T'] < 3)
 		return;
 
 	char dva[64];
 	printf("\n%-16s %-10s\n", "DVA", "REFCNT");
 
 	for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
 		brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
 		if (brtvd == NULL || !brtvd->bv_initiated)
 			continue;
 
 		zap_cursor_t zc;
 		zap_attribute_t za;
 		for (zap_cursor_init(&zc, brt->brt_mos, brtvd->bv_mos_entries);
 		    zap_cursor_retrieve(&zc, &za) == 0;
 		    zap_cursor_advance(&zc)) {
 			uint64_t offset = *(uint64_t *)za.za_name;
 			uint64_t refcnt = za.za_first_integer;
 
 			snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx", vdevid,
 			    (u_longlong_t)offset);
 			printf("%-16s %-10llu\n", dva, (u_longlong_t)refcnt);
 		}
 		zap_cursor_fini(&zc);
 	}
 }
 
 static void
 dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
 {
 	char *prefix = arg;
 
 	(void) printf("%s [%llu,%llu) length %llu\n",
 	    prefix,
 	    (u_longlong_t)start,
 	    (u_longlong_t)(start + size),
 	    (u_longlong_t)(size));
 }
 
 static void
 dump_dtl(vdev_t *vd, int indent)
 {
 	spa_t *spa = vd->vdev_spa;
 	boolean_t required;
 	const char *name[DTL_TYPES] = { "missing", "partial", "scrub",
 		"outage" };
 	char prefix[256];
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 	required = vdev_dtl_required(vd);
 	(void) spa_vdev_state_exit(spa, NULL, 0);
 
 	if (indent == 0)
 		(void) printf("\nDirty time logs:\n\n");
 
 	(void) printf("\t%*s%s [%s]\n", indent, "",
 	    vd->vdev_path ? vd->vdev_path :
 	    vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
 	    required ? "DTL-required" : "DTL-expendable");
 
 	for (int t = 0; t < DTL_TYPES; t++) {
 		range_tree_t *rt = vd->vdev_dtl[t];
 		if (range_tree_space(rt) == 0)
 			continue;
 		(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
 		    indent + 2, "", name[t]);
 		range_tree_walk(rt, dump_dtl_seg, prefix);
 		if (dump_opt['d'] > 5 && vd->vdev_children == 0)
 			dump_spacemap(spa->spa_meta_objset,
 			    vd->vdev_dtl_sm);
 	}
 
 	for (unsigned c = 0; c < vd->vdev_children; c++)
 		dump_dtl(vd->vdev_child[c], indent + 4);
 }
 
 static void
 dump_history(spa_t *spa)
 {
 	nvlist_t **events = NULL;
 	char *buf;
 	uint64_t resid, len, off = 0;
 	uint_t num = 0;
 	int error;
 	char tbuf[30];
 
 	if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) {
 		(void) fprintf(stderr, "%s: unable to allocate I/O buffer\n",
 		    __func__);
 		return;
 	}
 
 	do {
 		len = SPA_OLD_MAXBLOCKSIZE;
 
 		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
 			(void) fprintf(stderr, "Unable to read history: "
 			    "error %d\n", error);
 			free(buf);
 			return;
 		}
 
 		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
 			break;
 
 		off -= resid;
 	} while (len != 0);
 
 	(void) printf("\nHistory:\n");
 	for (unsigned i = 0; i < num; i++) {
 		boolean_t printed = B_FALSE;
 
 		if (nvlist_exists(events[i], ZPOOL_HIST_TIME)) {
 			time_t tsec;
 			struct tm t;
 
 			tsec = fnvlist_lookup_uint64(events[i],
 			    ZPOOL_HIST_TIME);
 			(void) localtime_r(&tsec, &t);
 			(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
 		} else {
 			tbuf[0] = '\0';
 		}
 
 		if (nvlist_exists(events[i], ZPOOL_HIST_CMD)) {
 			(void) printf("%s %s\n", tbuf,
 			    fnvlist_lookup_string(events[i], ZPOOL_HIST_CMD));
 		} else if (nvlist_exists(events[i], ZPOOL_HIST_INT_EVENT)) {
 			uint64_t ievent;
 
 			ievent = fnvlist_lookup_uint64(events[i],
 			    ZPOOL_HIST_INT_EVENT);
 			if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
 				goto next;
 
 			(void) printf(" %s [internal %s txg:%ju] %s\n",
 			    tbuf,
 			    zfs_history_event_names[ievent],
 			    fnvlist_lookup_uint64(events[i],
 			    ZPOOL_HIST_TXG),
 			    fnvlist_lookup_string(events[i],
 			    ZPOOL_HIST_INT_STR));
 		} else if (nvlist_exists(events[i], ZPOOL_HIST_INT_NAME)) {
 			(void) printf("%s [txg:%ju] %s", tbuf,
 			    fnvlist_lookup_uint64(events[i],
 			    ZPOOL_HIST_TXG),
 			    fnvlist_lookup_string(events[i],
 			    ZPOOL_HIST_INT_NAME));
 
 			if (nvlist_exists(events[i], ZPOOL_HIST_DSNAME)) {
 				(void) printf(" %s (%llu)",
 				    fnvlist_lookup_string(events[i],
 				    ZPOOL_HIST_DSNAME),
 				    (u_longlong_t)fnvlist_lookup_uint64(
 				    events[i],
 				    ZPOOL_HIST_DSID));
 			}
 
 			(void) printf(" %s\n", fnvlist_lookup_string(events[i],
 			    ZPOOL_HIST_INT_STR));
 		} else if (nvlist_exists(events[i], ZPOOL_HIST_IOCTL)) {
 			(void) printf("%s ioctl %s\n", tbuf,
 			    fnvlist_lookup_string(events[i],
 			    ZPOOL_HIST_IOCTL));
 
 			if (nvlist_exists(events[i], ZPOOL_HIST_INPUT_NVL)) {
 				(void) printf("    input:\n");
 				dump_nvlist(fnvlist_lookup_nvlist(events[i],
 				    ZPOOL_HIST_INPUT_NVL), 8);
 			}
 			if (nvlist_exists(events[i], ZPOOL_HIST_OUTPUT_NVL)) {
 				(void) printf("    output:\n");
 				dump_nvlist(fnvlist_lookup_nvlist(events[i],
 				    ZPOOL_HIST_OUTPUT_NVL), 8);
 			}
 			if (nvlist_exists(events[i], ZPOOL_HIST_ERRNO)) {
 				(void) printf("    errno: %lld\n",
 				    (longlong_t)fnvlist_lookup_int64(events[i],
 				    ZPOOL_HIST_ERRNO));
 			}
 		} else {
 			goto next;
 		}
 
 		printed = B_TRUE;
 next:
 		if (dump_opt['h'] > 1) {
 			if (!printed)
 				(void) printf("unrecognized record:\n");
 			dump_nvlist(events[i], 2);
 		}
 	}
 	free(buf);
 }
 
 static void
 dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os, (void) object, (void) data, (void) size;
 }
 
 static uint64_t
 blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
     const zbookmark_phys_t *zb)
 {
 	if (dnp == NULL) {
 		ASSERT(zb->zb_level < 0);
 		if (zb->zb_object == 0)
 			return (zb->zb_blkid);
 		return (zb->zb_blkid * BP_GET_LSIZE(bp));
 	}
 
 	ASSERT(zb->zb_level >= 0);
 
 	return ((zb->zb_blkid <<
 	    (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
 	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 }
 
 static void
 snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen,
     const blkptr_t *bp)
 {
 	static abd_t *pabd = NULL;
 	void *buf;
 	zio_t *zio;
 	zfs_zstdhdr_t zstd_hdr;
 	int error;
 
 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_ZSTD)
 		return;
 
 	if (BP_IS_HOLE(bp))
 		return;
 
 	if (BP_IS_EMBEDDED(bp)) {
 		buf = malloc(SPA_MAXBLOCKSIZE);
 		if (buf == NULL) {
 			(void) fprintf(stderr, "out of memory\n");
 			exit(1);
 		}
 		decode_embedded_bp_compressed(bp, buf);
 		memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
 		free(buf);
 		zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
 		zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf),
 		    " ZSTD:size=%u:version=%u:level=%u:EMBEDDED",
 		    zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr),
 		    zfs_get_hdrlevel(&zstd_hdr));
 		return;
 	}
 
 	if (!pabd)
 		pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
 	zio = zio_root(spa, NULL, NULL, 0);
 
 	/* Decrypt but don't decompress so we can read the compression header */
 	zio_nowait(zio_read(zio, spa, bp, pabd, BP_GET_PSIZE(bp), NULL, NULL,
 	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW_COMPRESS,
 	    NULL));
 	error = zio_wait(zio);
 	if (error) {
 		(void) fprintf(stderr, "read failed: %d\n", error);
 		return;
 	}
 	buf = abd_borrow_buf_copy(pabd, BP_GET_LSIZE(bp));
 	memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
 	zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
 	zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);
 
 	(void) snprintf(blkbuf + strlen(blkbuf),
 	    buflen - strlen(blkbuf),
 	    " ZSTD:size=%u:version=%u:level=%u:NORMAL",
 	    zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr),
 	    zfs_get_hdrlevel(&zstd_hdr));
 
 	abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp));
 }
 
 static void
 snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
     boolean_t bp_freed)
 {
 	const dva_t *dva = bp->blk_dva;
 	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
 	int i;
 
 	if (dump_opt['b'] >= 6) {
 		snprintf_blkptr(blkbuf, buflen, bp);
 		if (bp_freed) {
 			(void) snprintf(blkbuf + strlen(blkbuf),
 			    buflen - strlen(blkbuf), " %s", "FREE");
 		}
 		return;
 	}
 
 	if (BP_IS_EMBEDDED(bp)) {
 		(void) sprintf(blkbuf,
 		    "EMBEDDED et=%u %llxL/%llxP B=%llu",
 		    (int)BPE_GET_ETYPE(bp),
 		    (u_longlong_t)BPE_GET_LSIZE(bp),
 		    (u_longlong_t)BPE_GET_PSIZE(bp),
-		    (u_longlong_t)bp->blk_birth);
+		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));
 		return;
 	}
 
 	blkbuf[0] = '\0';
 
 	for (i = 0; i < ndvas; i++)
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf), "%llu:%llx:%llx ",
 		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
 		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
 		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
 
 	if (BP_IS_HOLE(bp)) {
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf),
 		    "%llxL B=%llu",
 		    (u_longlong_t)BP_GET_LSIZE(bp),
-		    (u_longlong_t)bp->blk_birth);
+		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));
 	} else {
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf),
 		    "%llxL/%llxP F=%llu B=%llu/%llu",
 		    (u_longlong_t)BP_GET_LSIZE(bp),
 		    (u_longlong_t)BP_GET_PSIZE(bp),
 		    (u_longlong_t)BP_GET_FILL(bp),
-		    (u_longlong_t)bp->blk_birth,
-		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
+		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp),
+		    (u_longlong_t)BP_GET_BIRTH(bp));
 		if (bp_freed)
 			(void) snprintf(blkbuf + strlen(blkbuf),
 			    buflen - strlen(blkbuf), " %s", "FREE");
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf),
 		    " cksum=%016llx:%016llx:%016llx:%016llx",
 		    (u_longlong_t)bp->blk_cksum.zc_word[0],
 		    (u_longlong_t)bp->blk_cksum.zc_word[1],
 		    (u_longlong_t)bp->blk_cksum.zc_word[2],
 		    (u_longlong_t)bp->blk_cksum.zc_word[3]);
 	}
 }
 
 static void
 print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb,
     const dnode_phys_t *dnp)
 {
 	char blkbuf[BP_SPRINTF_LEN];
 	int l;
 
 	if (!BP_IS_EMBEDDED(bp)) {
 		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
 		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
 	}
 
 	(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
 
 	ASSERT(zb->zb_level >= 0);
 
 	for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
 		if (l == zb->zb_level) {
 			(void) printf("L%llx", (u_longlong_t)zb->zb_level);
 		} else {
 			(void) printf(" ");
 		}
 	}
 
 	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE);
 	if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD)
 		snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp);
 	(void) printf("%s\n", blkbuf);
 }
 
 static int
 visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
     blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	int err = 0;
 
-	if (bp->blk_birth == 0)
+	if (BP_GET_LOGICAL_BIRTH(bp) == 0)
 		return (0);
 
 	print_indirect(spa, bp, zb, dnp);
 
 	if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
 		arc_flags_t flags = ARC_FLAG_WAIT;
 		int i;
 		blkptr_t *cbp;
 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 		arc_buf_t *buf;
 		uint64_t fill = 0;
 		ASSERT(!BP_IS_REDACTED(bp));
 
 		err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err)
 			return (err);
 		ASSERT(buf->b_data);
 
 		/* recursively visit blocks below this */
 		cbp = buf->b_data;
 		for (i = 0; i < epb; i++, cbp++) {
 			zbookmark_phys_t czb;
 
 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 			    zb->zb_level - 1,
 			    zb->zb_blkid * epb + i);
 			err = visit_indirect(spa, dnp, cbp, &czb);
 			if (err)
 				break;
 			fill += BP_GET_FILL(cbp);
 		}
 		if (!err)
 			ASSERT3U(fill, ==, BP_GET_FILL(bp));
 		arc_buf_destroy(buf, &buf);
 	}
 
 	return (err);
 }
 
 static void
 dump_indirect(dnode_t *dn)
 {
 	dnode_phys_t *dnp = dn->dn_phys;
 	zbookmark_phys_t czb;
 
 	(void) printf("Indirect blocks:\n");
 
 	SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),
 	    dn->dn_object, dnp->dn_nlevels - 1, 0);
 	for (int j = 0; j < dnp->dn_nblkptr; j++) {
 		czb.zb_blkid = j;
 		(void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
 		    &dnp->dn_blkptr[j], &czb);
 	}
 
 	(void) printf("\n");
 }
 
 static void
 dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os, (void) object;
 	dsl_dir_phys_t *dd = data;
 	time_t crtime;
 	char nice[32];
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (nice) >= NN_NUMBUF_SZ, "nice truncated");
 
 	if (dd == NULL)
 		return;
 
 	ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));
 
 	crtime = dd->dd_creation_time;
 	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
 	(void) printf("\t\thead_dataset_obj = %llu\n",
 	    (u_longlong_t)dd->dd_head_dataset_obj);
 	(void) printf("\t\tparent_dir_obj = %llu\n",
 	    (u_longlong_t)dd->dd_parent_obj);
 	(void) printf("\t\torigin_obj = %llu\n",
 	    (u_longlong_t)dd->dd_origin_obj);
 	(void) printf("\t\tchild_dir_zapobj = %llu\n",
 	    (u_longlong_t)dd->dd_child_dir_zapobj);
 	zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice));
 	(void) printf("\t\tused_bytes = %s\n", nice);
 	zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice));
 	(void) printf("\t\tcompressed_bytes = %s\n", nice);
 	zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice));
 	(void) printf("\t\tuncompressed_bytes = %s\n", nice);
 	zdb_nicenum(dd->dd_quota, nice, sizeof (nice));
 	(void) printf("\t\tquota = %s\n", nice);
 	zdb_nicenum(dd->dd_reserved, nice, sizeof (nice));
 	(void) printf("\t\treserved = %s\n", nice);
 	(void) printf("\t\tprops_zapobj = %llu\n",
 	    (u_longlong_t)dd->dd_props_zapobj);
 	(void) printf("\t\tdeleg_zapobj = %llu\n",
 	    (u_longlong_t)dd->dd_deleg_zapobj);
 	(void) printf("\t\tflags = %llx\n",
 	    (u_longlong_t)dd->dd_flags);
 
 #define	DO(which) \
 	zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \
 	    sizeof (nice)); \
 	(void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
 	DO(HEAD);
 	DO(SNAP);
 	DO(CHILD);
 	DO(CHILD_RSRV);
 	DO(REFRSRV);
 #undef DO
 	(void) printf("\t\tclones = %llu\n",
 	    (u_longlong_t)dd->dd_clones);
 }
 
 static void
 dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os, (void) object;
 	dsl_dataset_phys_t *ds = data;
 	time_t crtime;
 	char used[32], compressed[32], uncompressed[32], unique[32];
 	char blkbuf[BP_SPRINTF_LEN];
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (used) >= NN_NUMBUF_SZ, "used truncated");
 	_Static_assert(sizeof (compressed) >= NN_NUMBUF_SZ,
 	    "compressed truncated");
 	_Static_assert(sizeof (uncompressed) >= NN_NUMBUF_SZ,
 	    "uncompressed truncated");
 	_Static_assert(sizeof (unique) >= NN_NUMBUF_SZ, "unique truncated");
 
 	if (ds == NULL)
 		return;
 
 	ASSERT(size == sizeof (*ds));
 	crtime = ds->ds_creation_time;
 	zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used));
 	zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed));
 	zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed,
 	    sizeof (uncompressed));
 	zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique));
 	snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);
 
 	(void) printf("\t\tdir_obj = %llu\n",
 	    (u_longlong_t)ds->ds_dir_obj);
 	(void) printf("\t\tprev_snap_obj = %llu\n",
 	    (u_longlong_t)ds->ds_prev_snap_obj);
 	(void) printf("\t\tprev_snap_txg = %llu\n",
 	    (u_longlong_t)ds->ds_prev_snap_txg);
 	(void) printf("\t\tnext_snap_obj = %llu\n",
 	    (u_longlong_t)ds->ds_next_snap_obj);
 	(void) printf("\t\tsnapnames_zapobj = %llu\n",
 	    (u_longlong_t)ds->ds_snapnames_zapobj);
 	(void) printf("\t\tnum_children = %llu\n",
 	    (u_longlong_t)ds->ds_num_children);
 	(void) printf("\t\tuserrefs_obj = %llu\n",
 	    (u_longlong_t)ds->ds_userrefs_obj);
 	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
 	(void) printf("\t\tcreation_txg = %llu\n",
 	    (u_longlong_t)ds->ds_creation_txg);
 	(void) printf("\t\tdeadlist_obj = %llu\n",
 	    (u_longlong_t)ds->ds_deadlist_obj);
 	(void) printf("\t\tused_bytes = %s\n", used);
 	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
 	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
 	(void) printf("\t\tunique = %s\n", unique);
 	(void) printf("\t\tfsid_guid = %llu\n",
 	    (u_longlong_t)ds->ds_fsid_guid);
 	(void) printf("\t\tguid = %llu\n",
 	    (u_longlong_t)ds->ds_guid);
 	(void) printf("\t\tflags = %llx\n",
 	    (u_longlong_t)ds->ds_flags);
 	(void) printf("\t\tnext_clones_obj = %llu\n",
 	    (u_longlong_t)ds->ds_next_clones_obj);
 	(void) printf("\t\tprops_obj = %llu\n",
 	    (u_longlong_t)ds->ds_props_obj);
 	(void) printf("\t\tbp = %s\n", blkbuf);
 }
 
 static int
 dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	(void) arg, (void) tx;
 	char blkbuf[BP_SPRINTF_LEN];
 
-	if (bp->blk_birth != 0) {
+	if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 		(void) printf("\t%s\n", blkbuf);
 	}
 	return (0);
 }
 
 static void
 dump_bptree(objset_t *os, uint64_t obj, const char *name)
 {
 	char bytes[32];
 	bptree_phys_t *bt;
 	dmu_buf_t *db;
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
 
 	if (dump_opt['d'] < 3)
 		return;
 
 	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
 	bt = db->db_data;
 	zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes));
 	(void) printf("\n    %s: %llu datasets, %s\n",
 	    name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes);
 	dmu_buf_rele(db, FTAG);
 
 	if (dump_opt['d'] < 5)
 		return;
 
 	(void) printf("\n");
 
 	(void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL);
 }
 
 static int
 dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
 {
 	(void) arg, (void) tx;
 	char blkbuf[BP_SPRINTF_LEN];
 
-	ASSERT(bp->blk_birth != 0);
+	ASSERT(BP_GET_LOGICAL_BIRTH(bp) != 0);
 	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed);
 	(void) printf("\t%s\n", blkbuf);
 	return (0);
 }
 
 static void
 dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
 {
 	char bytes[32];
 	char comp[32];
 	char uncomp[32];
 	uint64_t i;
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
 	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
 	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");
 
 	if (dump_opt['d'] < 3)
 		return;
 
 	zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes));
 	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
 		zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp));
 		zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp));
 		if (bpo->bpo_havefreed) {
 			(void) printf("    %*s: object %llu, %llu local "
 			    "blkptrs, %llu freed, %llu subobjs in object %llu, "
 			    "%s (%s/%s comp)\n",
 			    indent * 8, name,
 			    (u_longlong_t)bpo->bpo_object,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
 			    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
 			    bytes, comp, uncomp);
 		} else {
 			(void) printf("    %*s: object %llu, %llu local "
 			    "blkptrs, %llu subobjs in object %llu, "
 			    "%s (%s/%s comp)\n",
 			    indent * 8, name,
 			    (u_longlong_t)bpo->bpo_object,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
 			    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
 			    bytes, comp, uncomp);
 		}
 
 		for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
 			uint64_t subobj;
 			bpobj_t subbpo;
 			int error;
 			VERIFY0(dmu_read(bpo->bpo_os,
 			    bpo->bpo_phys->bpo_subobjs,
 			    i * sizeof (subobj), sizeof (subobj), &subobj, 0));
 			error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
 			if (error != 0) {
 				(void) printf("ERROR %u while trying to open "
 				    "subobj id %llu\n",
 				    error, (u_longlong_t)subobj);
 				continue;
 			}
 			dump_full_bpobj(&subbpo, "subobj", indent + 1);
 			bpobj_close(&subbpo);
 		}
 	} else {
 		if (bpo->bpo_havefreed) {
 			(void) printf("    %*s: object %llu, %llu blkptrs, "
 			    "%llu freed, %s\n",
 			    indent * 8, name,
 			    (u_longlong_t)bpo->bpo_object,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
 			    bytes);
 		} else {
 			(void) printf("    %*s: object %llu, %llu blkptrs, "
 			    "%s\n",
 			    indent * 8, name,
 			    (u_longlong_t)bpo->bpo_object,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
 			    bytes);
 		}
 	}
 
 	if (dump_opt['d'] < 5)
 		return;
 
 
 	if (indent == 0) {
 		(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
 		(void) printf("\n");
 	}
 }
 
 static int
 dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact,
     boolean_t print_list)
 {
 	int err = 0;
 	zfs_bookmark_phys_t prop;
 	objset_t *mos = dp->dp_spa->spa_meta_objset;
 	err = dsl_bookmark_lookup(dp, name, NULL, &prop);
 
 	if (err != 0) {
 		return (err);
 	}
 
 	(void) printf("\t#%s: ", strchr(name, '#') + 1);
 	(void) printf("{guid: %llx creation_txg: %llu creation_time: "
 	    "%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid,
 	    (u_longlong_t)prop.zbm_creation_txg,
 	    (u_longlong_t)prop.zbm_creation_time,
 	    (u_longlong_t)prop.zbm_redaction_obj);
 
 	IMPLY(print_list, print_redact);
 	if (!print_redact || prop.zbm_redaction_obj == 0)
 		return (0);
 
 	redaction_list_t *rl;
 	VERIFY0(dsl_redaction_list_hold_obj(dp,
 	    prop.zbm_redaction_obj, FTAG, &rl));
 
 	redaction_list_phys_t *rlp = rl->rl_phys;
 	(void) printf("\tRedacted:\n\t\tProgress: ");
 	if (rlp->rlp_last_object != UINT64_MAX ||
 	    rlp->rlp_last_blkid != UINT64_MAX) {
 		(void) printf("%llu %llu (incomplete)\n",
 		    (u_longlong_t)rlp->rlp_last_object,
 		    (u_longlong_t)rlp->rlp_last_blkid);
 	} else {
 		(void) printf("complete\n");
 	}
 	(void) printf("\t\tSnapshots: [");
 	for (unsigned int i = 0; i < rlp->rlp_num_snaps; i++) {
 		if (i > 0)
 			(void) printf(", ");
 		(void) printf("%0llu",
 		    (u_longlong_t)rlp->rlp_snaps[i]);
 	}
 	(void) printf("]\n\t\tLength: %llu\n",
 	    (u_longlong_t)rlp->rlp_num_entries);
 
 	if (!print_list) {
 		dsl_redaction_list_rele(rl, FTAG);
 		return (0);
 	}
 
 	if (rlp->rlp_num_entries == 0) {
 		dsl_redaction_list_rele(rl, FTAG);
 		(void) printf("\t\tRedaction List: []\n\n");
 		return (0);
 	}
 
 	redact_block_phys_t *rbp_buf;
 	uint64_t size;
 	dmu_object_info_t doi;
 
 	VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi));
 	size = doi.doi_max_offset;
 	rbp_buf = kmem_alloc(size, KM_SLEEP);
 
 	err = dmu_read(mos, prop.zbm_redaction_obj, 0, size,
 	    rbp_buf, 0);
 	if (err != 0) {
 		dsl_redaction_list_rele(rl, FTAG);
 		kmem_free(rbp_buf, size);
 		return (err);
 	}
 
 	(void) printf("\t\tRedaction List: [{object: %llx, offset: "
 	    "%llx, blksz: %x, count: %llx}",
 	    (u_longlong_t)rbp_buf[0].rbp_object,
 	    (u_longlong_t)rbp_buf[0].rbp_blkid,
 	    (uint_t)(redact_block_get_size(&rbp_buf[0])),
 	    (u_longlong_t)redact_block_get_count(&rbp_buf[0]));
 
 	for (size_t i = 1; i < rlp->rlp_num_entries; i++) {
 		(void) printf(",\n\t\t{object: %llx, offset: %llx, "
 		    "blksz: %x, count: %llx}",
 		    (u_longlong_t)rbp_buf[i].rbp_object,
 		    (u_longlong_t)rbp_buf[i].rbp_blkid,
 		    (uint_t)(redact_block_get_size(&rbp_buf[i])),
 		    (u_longlong_t)redact_block_get_count(&rbp_buf[i]));
 	}
 	dsl_redaction_list_rele(rl, FTAG);
 	kmem_free(rbp_buf, size);
 	(void) printf("]\n\n");
 	return (0);
 }
 
 static void
 dump_bookmarks(objset_t *os, int verbosity)
 {
 	zap_cursor_t zc;
 	zap_attribute_t attr;
 	dsl_dataset_t *ds = dmu_objset_ds(os);
 	dsl_pool_t *dp = spa_get_dsl(os->os_spa);
 	objset_t *mos = os->os_spa->spa_meta_objset;
 	if (verbosity < 4)
 		return;
 	dsl_pool_config_enter(dp, FTAG);
 
 	for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj);
 	    zap_cursor_retrieve(&zc, &attr) == 0;
 	    zap_cursor_advance(&zc)) {
 		char osname[ZFS_MAX_DATASET_NAME_LEN];
 		char buf[ZFS_MAX_DATASET_NAME_LEN];
 		int len;
 		dmu_objset_name(os, osname);
 		len = snprintf(buf, sizeof (buf), "%s#%s", osname,
 		    attr.za_name);
 		VERIFY3S(len, <, ZFS_MAX_DATASET_NAME_LEN);
 		(void) dump_bookmark(dp, buf, verbosity >= 5, verbosity >= 6);
 	}
 	zap_cursor_fini(&zc);
 	dsl_pool_config_exit(dp, FTAG);
 }
 
 static void
 bpobj_count_refd(bpobj_t *bpo)
 {
 	mos_obj_refd(bpo->bpo_object);
 
 	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
 		mos_obj_refd(bpo->bpo_phys->bpo_subobjs);
 		for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
 			uint64_t subobj;
 			bpobj_t subbpo;
 			int error;
 			VERIFY0(dmu_read(bpo->bpo_os,
 			    bpo->bpo_phys->bpo_subobjs,
 			    i * sizeof (subobj), sizeof (subobj), &subobj, 0));
 			error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
 			if (error != 0) {
 				(void) printf("ERROR %u while trying to open "
 				    "subobj id %llu\n",
 				    error, (u_longlong_t)subobj);
 				continue;
 			}
 			bpobj_count_refd(&subbpo);
 			bpobj_close(&subbpo);
 		}
 	}
 }
 
 static int
 dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle)
 {
 	spa_t *spa = arg;
 	uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
 	if (dle->dle_bpobj.bpo_object != empty_bpobj)
 		bpobj_count_refd(&dle->dle_bpobj);
 	return (0);
 }
 
 static int
 dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle)
 {
 	ASSERT(arg == NULL);
 	if (dump_opt['d'] >= 5) {
 		char buf[128];
 		(void) snprintf(buf, sizeof (buf),
 		    "mintxg %llu -> obj %llu",
 		    (longlong_t)dle->dle_mintxg,
 		    (longlong_t)dle->dle_bpobj.bpo_object);
 
 		dump_full_bpobj(&dle->dle_bpobj, buf, 0);
 	} else {
 		(void) printf("mintxg %llu -> obj %llu\n",
 		    (longlong_t)dle->dle_mintxg,
 		    (longlong_t)dle->dle_bpobj.bpo_object);
 	}
 	return (0);
 }
 
 static void
 dump_blkptr_list(dsl_deadlist_t *dl, const char *name)
 {
 	char bytes[32];
 	char comp[32];
 	char uncomp[32];
 	char entries[32];
 	spa_t *spa = dmu_objset_spa(dl->dl_os);
 	uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
 
 	if (dl->dl_oldfmt) {
 		if (dl->dl_bpobj.bpo_object != empty_bpobj)
 			bpobj_count_refd(&dl->dl_bpobj);
 	} else {
 		mos_obj_refd(dl->dl_object);
 		dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa);
 	}
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
 	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
 	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");
 	_Static_assert(sizeof (entries) >= NN_NUMBUF_SZ, "entries truncated");
 
 	if (dump_opt['d'] < 3)
 		return;
 
 	if (dl->dl_oldfmt) {
 		dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
 		return;
 	}
 
 	zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
 	zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
 	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
 	zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries));
 	(void) printf("\n    %s: %s (%s/%s comp), %s entries\n",
 	    name, bytes, comp, uncomp, entries);
 
 	if (dump_opt['d'] < 4)
 		return;
 
 	(void) putchar('\n');
 
 	dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL);
 }
 
 static int
 verify_dd_livelist(objset_t *os)
 {
 	uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp;
 	dsl_pool_t *dp = spa_get_dsl(os->os_spa);
 	dsl_dir_t  *dd = os->os_dsl_dataset->ds_dir;
 
 	ASSERT(!dmu_objset_is_snapshot(os));
 	if (!dsl_deadlist_is_open(&dd->dd_livelist))
 		return (0);
 
 	/* Iterate through the livelist to check for duplicates */
 	dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight,
 	    NULL);
 
 	dsl_pool_config_enter(dp, FTAG);
 	dsl_deadlist_space(&dd->dd_livelist, &ll_used,
 	    &ll_comp, &ll_uncomp);
 
 	dsl_dataset_t *origin_ds;
 	ASSERT(dsl_pool_config_held(dp));
 	VERIFY0(dsl_dataset_hold_obj(dp,
 	    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds));
 	VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset,
 	    &used, &comp, &uncomp));
 	dsl_dataset_rele(origin_ds, FTAG);
 	dsl_pool_config_exit(dp, FTAG);
 	/*
 	 *  It's possible that the dataset's uncomp space is larger than the
 	 *  livelist's because livelists do not track embedded block pointers
 	 */
 	if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) {
 		char nice_used[32], nice_comp[32], nice_uncomp[32];
 		(void) printf("Discrepancy in space accounting:\n");
 		zdb_nicenum(used, nice_used, sizeof (nice_used));
 		zdb_nicenum(comp, nice_comp, sizeof (nice_comp));
 		zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp));
 		(void) printf("dir: used %s, comp %s, uncomp %s\n",
 		    nice_used, nice_comp, nice_uncomp);
 		zdb_nicenum(ll_used, nice_used, sizeof (nice_used));
 		zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp));
 		zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp));
 		(void) printf("livelist: used %s, comp %s, uncomp %s\n",
 		    nice_used, nice_comp, nice_uncomp);
 		return (1);
 	}
 	return (0);
 }
 
 static char *key_material = NULL;
 
 static boolean_t
 zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out)
 {
 	uint64_t keyformat, salt, iters;
 	int i;
 	unsigned char c;
 
 	VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
 	    zfs_prop_to_name(ZFS_PROP_KEYFORMAT), sizeof (uint64_t),
 	    1, &keyformat));
 
 	switch (keyformat) {
 	case ZFS_KEYFORMAT_HEX:
 		for (i = 0; i < WRAPPING_KEY_LEN * 2; i += 2) {
 			if (!isxdigit(key_material[i]) ||
 			    !isxdigit(key_material[i+1]))
 				return (B_FALSE);
 			if (sscanf(&key_material[i], "%02hhx", &c) != 1)
 				return (B_FALSE);
 			key_out[i / 2] = c;
 		}
 		break;
 
 	case ZFS_KEYFORMAT_PASSPHRASE:
 		VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset,
 		    dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT),
 		    sizeof (uint64_t), 1, &salt));
 		VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset,
 		    dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS),
 		    sizeof (uint64_t), 1, &iters));
 
 		if (PKCS5_PBKDF2_HMAC_SHA1(key_material, strlen(key_material),
 		    ((uint8_t *)&salt), sizeof (uint64_t), iters,
 		    WRAPPING_KEY_LEN, key_out) != 1)
 			return (B_FALSE);
 
 		break;
 
 	default:
 		fatal("no support for key format %u\n",
 		    (unsigned int) keyformat);
 	}
 
 	return (B_TRUE);
 }
 
 static char encroot[ZFS_MAX_DATASET_NAME_LEN];
 static boolean_t key_loaded = B_FALSE;
 
 static void
 zdb_load_key(objset_t *os)
 {
 	dsl_pool_t *dp;
 	dsl_dir_t *dd, *rdd;
 	uint8_t key[WRAPPING_KEY_LEN];
 	uint64_t rddobj;
 	int err;
 
 	dp = spa_get_dsl(os->os_spa);
 	dd = os->os_dsl_dataset->ds_dir;
 
 	dsl_pool_config_enter(dp, FTAG);
 	VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
 	    DSL_CRYPTO_KEY_ROOT_DDOBJ, sizeof (uint64_t), 1, &rddobj));
 	VERIFY0(dsl_dir_hold_obj(dd->dd_pool, rddobj, NULL, FTAG, &rdd));
 	dsl_dir_name(rdd, encroot);
 	dsl_dir_rele(rdd, FTAG);
 
 	if (!zdb_derive_key(dd, key))
 		fatal("couldn't derive encryption key");
 
 	dsl_pool_config_exit(dp, FTAG);
 
 	ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_UNAVAILABLE);
 
 	dsl_crypto_params_t *dcp;
 	nvlist_t *crypto_args;
 
 	crypto_args = fnvlist_alloc();
 	fnvlist_add_uint8_array(crypto_args, "wkeydata",
 	    (uint8_t *)key, WRAPPING_KEY_LEN);
 	VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE,
 	    NULL, crypto_args, &dcp));
 	err = spa_keystore_load_wkey(encroot, dcp, B_FALSE);
 
 	dsl_crypto_params_free(dcp, (err != 0));
 	fnvlist_free(crypto_args);
 
 	if (err != 0)
 		fatal(
 		    "couldn't load encryption key for %s: %s",
 		    encroot, err == ZFS_ERR_CRYPTO_NOTSUP ?
 		    "crypto params not supported" : strerror(err));
 
 	ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_AVAILABLE);
 
 	printf("Unlocked encryption root: %s\n", encroot);
 	key_loaded = B_TRUE;
 }
 
 static void
 zdb_unload_key(void)
 {
 	if (!key_loaded)
 		return;
 
 	VERIFY0(spa_keystore_unload_wkey(encroot));
 	key_loaded = B_FALSE;
 }
 
 static avl_tree_t idx_tree;
 static avl_tree_t domain_tree;
 static boolean_t fuid_table_loaded;
 static objset_t *sa_os = NULL;
 static sa_attr_type_t *sa_attr_table = NULL;
 
 static int
 open_objset(const char *path, const void *tag, objset_t **osp)
 {
 	int err;
 	uint64_t sa_attrs = 0;
 	uint64_t version = 0;
 
 	VERIFY3P(sa_os, ==, NULL);
 
 	/*
 	 * We can't own an objset if it's redacted.  Therefore, we do this
 	 * dance: hold the objset, then acquire a long hold on its dataset, then
 	 * release the pool (which is held as part of holding the objset).
 	 */
 
 	if (dump_opt['K']) {
 		/* decryption requested, try to load keys */
 		err = dmu_objset_hold(path, tag, osp);
 		if (err != 0) {
 			(void) fprintf(stderr, "failed to hold dataset "
 			    "'%s': %s\n",
 			    path, strerror(err));
 			return (err);
 		}
 		dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);
 		dsl_pool_rele(dmu_objset_pool(*osp), tag);
 
 		/* succeeds or dies */
 		zdb_load_key(*osp);
 
 		/* release it all */
 		dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);
 		dsl_dataset_rele(dmu_objset_ds(*osp), tag);
 	}
 
 	int ds_hold_flags = key_loaded ? DS_HOLD_FLAG_DECRYPT : 0;
 
 	err = dmu_objset_hold_flags(path, ds_hold_flags, tag, osp);
 	if (err != 0) {
 		(void) fprintf(stderr, "failed to hold dataset '%s': %s\n",
 		    path, strerror(err));
 		return (err);
 	}
 	dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);
 	dsl_pool_rele(dmu_objset_pool(*osp), tag);
 
 	if (dmu_objset_type(*osp) == DMU_OST_ZFS &&
 	    (key_loaded || !(*osp)->os_encrypted)) {
 		(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR,
 		    8, 1, &version);
 		if (version >= ZPL_VERSION_SA) {
 			(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
 			    8, 1, &sa_attrs);
 		}
 		err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END,
 		    &sa_attr_table);
 		if (err != 0) {
 			(void) fprintf(stderr, "sa_setup failed: %s\n",
 			    strerror(err));
 			dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);
 			dsl_dataset_rele_flags(dmu_objset_ds(*osp),
 			    ds_hold_flags, tag);
 			*osp = NULL;
 		}
 	}
 	sa_os = *osp;
 
 	return (err);
 }
 
 static void
 close_objset(objset_t *os, const void *tag)
 {
 	VERIFY3P(os, ==, sa_os);
 	if (os->os_sa != NULL)
 		sa_tear_down(os);
 	dsl_dataset_long_rele(dmu_objset_ds(os), tag);
 	dsl_dataset_rele_flags(dmu_objset_ds(os),
 	    key_loaded ? DS_HOLD_FLAG_DECRYPT : 0, tag);
 	sa_attr_table = NULL;
 	sa_os = NULL;
 
 	zdb_unload_key();
 }
 
 static void
 fuid_table_destroy(void)
 {
 	if (fuid_table_loaded) {
 		zfs_fuid_table_destroy(&idx_tree, &domain_tree);
 		fuid_table_loaded = B_FALSE;
 	}
 }
 
 /*
  * print uid or gid information.
  * For normal POSIX id just the id is printed in decimal format.
  * For CIFS files with FUID the fuid is printed in hex followed by
  * the domain-rid string.
  */
 static void
 print_idstr(uint64_t id, const char *id_type)
 {
 	if (FUID_INDEX(id)) {
 		const char *domain =
 		    zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
 		(void) printf("\t%s     %llx [%s-%d]\n", id_type,
 		    (u_longlong_t)id, domain, (int)FUID_RID(id));
 	} else {
 		(void) printf("\t%s     %llu\n", id_type, (u_longlong_t)id);
 	}
 
 }
 
 static void
 dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
 {
 	uint32_t uid_idx, gid_idx;
 
 	uid_idx = FUID_INDEX(uid);
 	gid_idx = FUID_INDEX(gid);
 
 	/* Load domain table, if not already loaded */
 	if (!fuid_table_loaded && (uid_idx || gid_idx)) {
 		uint64_t fuid_obj;
 
 		/* first find the fuid object.  It lives in the master node */
 		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
 		    8, 1, &fuid_obj) == 0);
 		zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
 		(void) zfs_fuid_table_load(os, fuid_obj,
 		    &idx_tree, &domain_tree);
 		fuid_table_loaded = B_TRUE;
 	}
 
 	print_idstr(uid, "uid");
 	print_idstr(gid, "gid");
 }
 
 static void
 dump_znode_sa_xattr(sa_handle_t *hdl)
 {
 	nvlist_t *sa_xattr;
 	nvpair_t *elem = NULL;
 	int sa_xattr_size = 0;
 	int sa_xattr_entries = 0;
 	int error;
 	char *sa_xattr_packed;
 
 	error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size);
 	if (error || sa_xattr_size == 0)
 		return;
 
 	sa_xattr_packed = malloc(sa_xattr_size);
 	if (sa_xattr_packed == NULL)
 		return;
 
 	error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR],
 	    sa_xattr_packed, sa_xattr_size);
 	if (error) {
 		free(sa_xattr_packed);
 		return;
 	}
 
 	error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0);
 	if (error) {
 		free(sa_xattr_packed);
 		return;
 	}
 
 	while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL)
 		sa_xattr_entries++;
 
 	(void) printf("\tSA xattrs: %d bytes, %d entries\n\n",
 	    sa_xattr_size, sa_xattr_entries);
 	while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) {
 		boolean_t can_print = !dump_opt['P'];
 		uchar_t *value;
 		uint_t cnt, idx;
 
 		(void) printf("\t\t%s = ", nvpair_name(elem));
 		nvpair_value_byte_array(elem, &value, &cnt);
 
 		for (idx = 0; idx < cnt; ++idx) {
 			if (!isprint(value[idx])) {
 				can_print = B_FALSE;
 				break;
 			}
 		}
 
 		for (idx = 0; idx < cnt; ++idx) {
 			if (can_print)
 				(void) putchar(value[idx]);
 			else
 				(void) printf("\\%3.3o", value[idx]);
 		}
 		(void) putchar('\n');
 	}
 
 	nvlist_free(sa_xattr);
 	free(sa_xattr_packed);
 }
 
 static void
 dump_znode_symlink(sa_handle_t *hdl)
 {
 	int sa_symlink_size = 0;
 	char linktarget[MAXPATHLEN];
 	int error;
 
 	error = sa_size(hdl, sa_attr_table[ZPL_SYMLINK], &sa_symlink_size);
 	if (error || sa_symlink_size == 0) {
 		return;
 	}
 	if (sa_symlink_size >= sizeof (linktarget)) {
 		(void) printf("symlink size %d is too large\n",
 		    sa_symlink_size);
 		return;
 	}
 	linktarget[sa_symlink_size] = '\0';
 	if (sa_lookup(hdl, sa_attr_table[ZPL_SYMLINK],
 	    &linktarget, sa_symlink_size) == 0)
 		(void) printf("\ttarget	%s\n", linktarget);
 }
 
 static void
 dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) data, (void) size;
 	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
 	sa_handle_t *hdl;
 	uint64_t xattr, rdev, gen;
 	uint64_t uid, gid, mode, fsize, parent, links;
 	uint64_t pflags;
 	uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
 	time_t z_crtime, z_atime, z_mtime, z_ctime;
 	sa_bulk_attr_t bulk[12];
 	int idx = 0;
 	int error;
 
 	VERIFY3P(os, ==, sa_os);
 	if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
 		(void) printf("Failed to get handle for SA znode\n");
 		return;
 	}
 
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
 	    &links, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
 	    &mode, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
 	    NULL, &parent, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
 	    &fsize, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
 	    acctm, 16);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
 	    modtm, 16);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
 	    crtm, 16);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
 	    chgtm, 16);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
 	    &pflags, 8);
 
 	if (sa_bulk_lookup(hdl, bulk, idx)) {
 		(void) sa_handle_destroy(hdl);
 		return;
 	}
 
 	z_crtime = (time_t)crtm[0];
 	z_atime = (time_t)acctm[0];
 	z_mtime = (time_t)modtm[0];
 	z_ctime = (time_t)chgtm[0];
 
 	if (dump_opt['d'] > 4) {
 		error = zfs_obj_to_path(os, object, path, sizeof (path));
 		if (error == ESTALE) {
 			(void) snprintf(path, sizeof (path), "on delete queue");
 		} else if (error != 0) {
 			leaked_objects++;
 			(void) snprintf(path, sizeof (path),
 			    "path not found, possibly leaked");
 		}
 		(void) printf("\tpath	%s\n", path);
 	}
 
 	if (S_ISLNK(mode))
 		dump_znode_symlink(hdl);
 	dump_uidgid(os, uid, gid);
 	(void) printf("\tatime	%s", ctime(&z_atime));
 	(void) printf("\tmtime	%s", ctime(&z_mtime));
 	(void) printf("\tctime	%s", ctime(&z_ctime));
 	(void) printf("\tcrtime	%s", ctime(&z_crtime));
 	(void) printf("\tgen	%llu\n", (u_longlong_t)gen);
 	(void) printf("\tmode	%llo\n", (u_longlong_t)mode);
 	(void) printf("\tsize	%llu\n", (u_longlong_t)fsize);
 	(void) printf("\tparent	%llu\n", (u_longlong_t)parent);
 	(void) printf("\tlinks	%llu\n", (u_longlong_t)links);
 	(void) printf("\tpflags	%llx\n", (u_longlong_t)pflags);
 	if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) {
 		uint64_t projid;
 
 		if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid,
 		    sizeof (uint64_t)) == 0)
 			(void) printf("\tprojid	%llu\n", (u_longlong_t)projid);
 	}
 	if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
 	    sizeof (uint64_t)) == 0)
 		(void) printf("\txattr	%llu\n", (u_longlong_t)xattr);
 	if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
 	    sizeof (uint64_t)) == 0)
 		(void) printf("\trdev	0x%016llx\n", (u_longlong_t)rdev);
 	dump_znode_sa_xattr(hdl);
 	sa_handle_destroy(hdl);
 }
 
 static void
 dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os, (void) object, (void) data, (void) size;
 }
 
 static void
 dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os, (void) object, (void) data, (void) size;
 }
 
 static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
 	dump_none,		/* unallocated			*/
 	dump_zap,		/* object directory		*/
 	dump_uint64,		/* object array			*/
 	dump_none,		/* packed nvlist		*/
 	dump_packed_nvlist,	/* packed nvlist size		*/
 	dump_none,		/* bpobj			*/
 	dump_bpobj,		/* bpobj header			*/
 	dump_none,		/* SPA space map header		*/
 	dump_none,		/* SPA space map		*/
 	dump_none,		/* ZIL intent log		*/
 	dump_dnode,		/* DMU dnode			*/
 	dump_dmu_objset,	/* DMU objset			*/
 	dump_dsl_dir,		/* DSL directory		*/
 	dump_zap,		/* DSL directory child map	*/
 	dump_zap,		/* DSL dataset snap map		*/
 	dump_zap,		/* DSL props			*/
 	dump_dsl_dataset,	/* DSL dataset			*/
 	dump_znode,		/* ZFS znode			*/
 	dump_acl,		/* ZFS V0 ACL			*/
 	dump_uint8,		/* ZFS plain file		*/
 	dump_zpldir,		/* ZFS directory		*/
 	dump_zap,		/* ZFS master node		*/
 	dump_zap,		/* ZFS delete queue		*/
 	dump_uint8,		/* zvol object			*/
 	dump_zap,		/* zvol prop			*/
 	dump_uint8,		/* other uint8[]		*/
 	dump_uint64,		/* other uint64[]		*/
 	dump_zap,		/* other ZAP			*/
 	dump_zap,		/* persistent error log		*/
 	dump_uint8,		/* SPA history			*/
 	dump_history_offsets,	/* SPA history offsets		*/
 	dump_zap,		/* Pool properties		*/
 	dump_zap,		/* DSL permissions		*/
 	dump_acl,		/* ZFS ACL			*/
 	dump_uint8,		/* ZFS SYSACL			*/
 	dump_none,		/* FUID nvlist			*/
 	dump_packed_nvlist,	/* FUID nvlist size		*/
 	dump_zap,		/* DSL dataset next clones	*/
 	dump_zap,		/* DSL scrub queue		*/
 	dump_zap,		/* ZFS user/group/project used	*/
 	dump_zap,		/* ZFS user/group/project quota	*/
 	dump_zap,		/* snapshot refcount tags	*/
 	dump_ddt_zap,		/* DDT ZAP object		*/
 	dump_zap,		/* DDT statistics		*/
 	dump_znode,		/* SA object			*/
 	dump_zap,		/* SA Master Node		*/
 	dump_sa_attrs,		/* SA attribute registration	*/
 	dump_sa_layouts,	/* SA attribute layouts		*/
 	dump_zap,		/* DSL scrub translations	*/
 	dump_none,		/* fake dedup BP		*/
 	dump_zap,		/* deadlist			*/
 	dump_none,		/* deadlist hdr			*/
 	dump_zap,		/* dsl clones			*/
 	dump_bpobj_subobjs,	/* bpobj subobjs		*/
 	dump_unknown,		/* Unknown type, must be last	*/
 };
 
 static boolean_t
 match_object_type(dmu_object_type_t obj_type, uint64_t flags)
 {
 	boolean_t match = B_TRUE;
 
 	switch (obj_type) {
 	case DMU_OT_DIRECTORY_CONTENTS:
 		if (!(flags & ZOR_FLAG_DIRECTORY))
 			match = B_FALSE;
 		break;
 	case DMU_OT_PLAIN_FILE_CONTENTS:
 		if (!(flags & ZOR_FLAG_PLAIN_FILE))
 			match = B_FALSE;
 		break;
 	case DMU_OT_SPACE_MAP:
 		if (!(flags & ZOR_FLAG_SPACE_MAP))
 			match = B_FALSE;
 		break;
 	default:
 		if (strcmp(zdb_ot_name(obj_type), "zap") == 0) {
 			if (!(flags & ZOR_FLAG_ZAP))
 				match = B_FALSE;
 			break;
 		}
 
 		/*
 		 * If all bits except some of the supported flags are
 		 * set, the user combined the all-types flag (A) with
 		 * a negated flag to exclude some types (e.g. A-f to
 		 * show all object types except plain files).
 		 */
 		if ((flags | ZOR_SUPPORTED_FLAGS) != ZOR_FLAG_ALL_TYPES)
 			match = B_FALSE;
 
 		break;
 	}
 
 	return (match);
 }
 
 static void
 dump_object(objset_t *os, uint64_t object, int verbosity,
     boolean_t *print_header, uint64_t *dnode_slots_used, uint64_t flags)
 {
 	dmu_buf_t *db = NULL;
 	dmu_object_info_t doi;
 	dnode_t *dn;
 	boolean_t dnode_held = B_FALSE;
 	void *bonus = NULL;
 	size_t bsize = 0;
 	char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32];
 	char bonus_size[32];
 	char aux[50];
 	int error;
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (iblk) >= NN_NUMBUF_SZ, "iblk truncated");
 	_Static_assert(sizeof (dblk) >= NN_NUMBUF_SZ, "dblk truncated");
 	_Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ, "lsize truncated");
 	_Static_assert(sizeof (asize) >= NN_NUMBUF_SZ, "asize truncated");
 	_Static_assert(sizeof (bonus_size) >= NN_NUMBUF_SZ,
 	    "bonus_size truncated");
 
 	if (*print_header) {
 		(void) printf("\n%10s  %3s  %5s  %5s  %5s  %6s  %5s  %6s  %s\n",
 		    "Object", "lvl", "iblk", "dblk", "dsize", "dnsize",
 		    "lsize", "%full", "type");
 		*print_header = 0;
 	}
 
 	if (object == 0) {
 		dn = DMU_META_DNODE(os);
 		dmu_object_info_from_dnode(dn, &doi);
 	} else {
 		/*
 		 * Encrypted datasets will have sensitive bonus buffers
 		 * encrypted. Therefore we cannot hold the bonus buffer and
 		 * must hold the dnode itself instead.
 		 */
 		error = dmu_object_info(os, object, &doi);
 		if (error)
 			fatal("dmu_object_info() failed, errno %u", error);
 
 		if (!key_loaded && os->os_encrypted &&
 		    DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) {
 			error = dnode_hold(os, object, FTAG, &dn);
 			if (error)
 				fatal("dnode_hold() failed, errno %u", error);
 			dnode_held = B_TRUE;
 		} else {
 			error = dmu_bonus_hold(os, object, FTAG, &db);
 			if (error)
 				fatal("dmu_bonus_hold(%llu) failed, errno %u",
 				    object, error);
 			bonus = db->db_data;
 			bsize = db->db_size;
 			dn = DB_DNODE((dmu_buf_impl_t *)db);
 		}
 	}
 
 	/*
 	 * Default to showing all object types if no flags were specified.
 	 */
 	if (flags != 0 && flags != ZOR_FLAG_ALL_TYPES &&
 	    !match_object_type(doi.doi_type, flags))
 		goto out;
 
 	if (dnode_slots_used)
 		*dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE;
 
 	zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk));
 	zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk));
 	zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize));
 	zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize));
 	zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size));
 	zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize));
 	(void) snprintf(fill, sizeof (fill), "%6.2f", 100.0 *
 	    doi.doi_fill_count * doi.doi_data_block_size / (object == 0 ?
 	    DNODES_PER_BLOCK : 1) / doi.doi_max_offset);
 
 	aux[0] = '\0';
 
 	if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
 		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
 		    " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum));
 	}
 
 	if (doi.doi_compress == ZIO_COMPRESS_INHERIT &&
 	    ZIO_COMPRESS_HASLEVEL(os->os_compress) && verbosity >= 6) {
 		const char *compname = NULL;
 		if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION,
 		    ZIO_COMPRESS_RAW(os->os_compress, os->os_complevel),
 		    &compname) == 0) {
 			(void) snprintf(aux + strlen(aux),
 			    sizeof (aux) - strlen(aux), " (Z=inherit=%s)",
 			    compname);
 		} else {
 			(void) snprintf(aux + strlen(aux),
 			    sizeof (aux) - strlen(aux),
 			    " (Z=inherit=%s-unknown)",
 			    ZDB_COMPRESS_NAME(os->os_compress));
 		}
 	} else if (doi.doi_compress == ZIO_COMPRESS_INHERIT && verbosity >= 6) {
 		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
 		    " (Z=inherit=%s)", ZDB_COMPRESS_NAME(os->os_compress));
 	} else if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
 		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
 		    " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress));
 	}
 
 	(void) printf("%10lld  %3u  %5s  %5s  %5s  %6s  %5s  %6s  %s%s\n",
 	    (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
 	    asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux);
 
 	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
 		(void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %5s  %6s  %s\n",
 		    "", "", "", "", "", "", bonus_size, "bonus",
 		    zdb_ot_name(doi.doi_bonus_type));
 	}
 
 	if (verbosity >= 4) {
 		(void) printf("\tdnode flags: %s%s%s%s\n",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
 		    "USED_BYTES " : "",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
 		    "USERUSED_ACCOUNTED " : "",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ?
 		    "USEROBJUSED_ACCOUNTED " : "",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
 		    "SPILL_BLKPTR" : "");
 		(void) printf("\tdnode maxblkid: %llu\n",
 		    (longlong_t)dn->dn_phys->dn_maxblkid);
 
 		if (!dnode_held) {
 			object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os,
 			    object, bonus, bsize);
 		} else {
 			(void) printf("\t\t(bonus encrypted)\n");
 		}
 
 		if (key_loaded ||
 		    (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type))) {
 			object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object,
 			    NULL, 0);
 		} else {
 			(void) printf("\t\t(object encrypted)\n");
 		}
 
 		*print_header = B_TRUE;
 	}
 
 	if (verbosity >= 5) {
 		if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 			char blkbuf[BP_SPRINTF_LEN];
 			snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),
 			    DN_SPILL_BLKPTR(dn->dn_phys), B_FALSE);
 			(void) printf("\nSpill block: %s\n", blkbuf);
 		}
 		dump_indirect(dn);
 	}
 
 	if (verbosity >= 5) {
 		/*
 		 * Report the list of segments that comprise the object.
 		 */
 		uint64_t start = 0;
 		uint64_t end;
 		uint64_t blkfill = 1;
 		int minlvl = 1;
 
 		if (dn->dn_type == DMU_OT_DNODE) {
 			minlvl = 0;
 			blkfill = DNODES_PER_BLOCK;
 		}
 
 		for (;;) {
 			char segsize[32];
 			/* make sure nicenum has enough space */
 			_Static_assert(sizeof (segsize) >= NN_NUMBUF_SZ,
 			    "segsize truncated");
 			error = dnode_next_offset(dn,
 			    0, &start, minlvl, blkfill, 0);
 			if (error)
 				break;
 			end = start;
 			error = dnode_next_offset(dn,
 			    DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
 			zdb_nicenum(end - start, segsize, sizeof (segsize));
 			(void) printf("\t\tsegment [%016llx, %016llx)"
 			    " size %5s\n", (u_longlong_t)start,
 			    (u_longlong_t)end, segsize);
 			if (error)
 				break;
 			start = end;
 		}
 	}
 
 out:
 	if (db != NULL)
 		dmu_buf_rele(db, FTAG);
 	if (dnode_held)
 		dnode_rele(dn, FTAG);
 }
 
 static void
 count_dir_mos_objects(dsl_dir_t *dd)
 {
 	mos_obj_refd(dd->dd_object);
 	mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj);
 	mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj);
 	mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj);
 	mos_obj_refd(dsl_dir_phys(dd)->dd_clones);
 
 	/*
 	 * The dd_crypto_obj can be referenced by multiple dsl_dir's.
 	 * Ignore the references after the first one.
 	 */
 	mos_obj_refd_multiple(dd->dd_crypto_obj);
 }
 
 static void
 count_ds_mos_objects(dsl_dataset_t *ds)
 {
 	mos_obj_refd(ds->ds_object);
 	mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj);
 	mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj);
 	mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj);
 	mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj);
 	mos_obj_refd(ds->ds_bookmarks_obj);
 
 	if (!dsl_dataset_is_snapshot(ds)) {
 		count_dir_mos_objects(ds->ds_dir);
 	}
 }
 
 static const char *const objset_types[DMU_OST_NUMTYPES] = {
 	"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
 
 /*
  * Parse a string denoting a range of object IDs of the form
  * <start>[:<end>[:flags]], and store the results in zor.
  * Return 0 on success. On error, return 1 and update the msg
  * pointer to point to a descriptive error message.
  */
 static int
 parse_object_range(char *range, zopt_object_range_t *zor, const char **msg)
 {
 	uint64_t flags = 0;
 	char *p, *s, *dup, *flagstr, *tmp = NULL;
 	size_t len;
 	int i;
 	int rc = 0;
 
 	if (strchr(range, ':') == NULL) {
 		zor->zor_obj_start = strtoull(range, &p, 0);
 		if (*p != '\0') {
 			*msg = "Invalid characters in object ID";
 			rc = 1;
 		}
 		zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start);
 		zor->zor_obj_end = zor->zor_obj_start;
 		return (rc);
 	}
 
 	if (strchr(range, ':') == range) {
 		*msg = "Invalid leading colon";
 		rc = 1;
 		return (rc);
 	}
 
 	len = strlen(range);
 	if (range[len - 1] == ':') {
 		*msg = "Invalid trailing colon";
 		rc = 1;
 		return (rc);
 	}
 
 	dup = strdup(range);
 	s = strtok_r(dup, ":", &tmp);
 	zor->zor_obj_start = strtoull(s, &p, 0);
 
 	if (*p != '\0') {
 		*msg = "Invalid characters in start object ID";
 		rc = 1;
 		goto out;
 	}
 
 	s = strtok_r(NULL, ":", &tmp);
 	zor->zor_obj_end = strtoull(s, &p, 0);
 
 	if (*p != '\0') {
 		*msg = "Invalid characters in end object ID";
 		rc = 1;
 		goto out;
 	}
 
 	if (zor->zor_obj_start > zor->zor_obj_end) {
 		*msg = "Start object ID may not exceed end object ID";
 		rc = 1;
 		goto out;
 	}
 
 	s = strtok_r(NULL, ":", &tmp);
 	if (s == NULL) {
 		zor->zor_flags = ZOR_FLAG_ALL_TYPES;
 		goto out;
 	} else if (strtok_r(NULL, ":", &tmp) != NULL) {
 		*msg = "Invalid colon-delimited field after flags";
 		rc = 1;
 		goto out;
 	}
 
 	flagstr = s;
 	for (i = 0; flagstr[i]; i++) {
 		int bit;
 		boolean_t negation = (flagstr[i] == '-');
 
 		if (negation) {
 			i++;
 			if (flagstr[i] == '\0') {
 				*msg = "Invalid trailing negation operator";
 				rc = 1;
 				goto out;
 			}
 		}
 		bit = flagbits[(uchar_t)flagstr[i]];
 		if (bit == 0) {
 			*msg = "Invalid flag";
 			rc = 1;
 			goto out;
 		}
 		if (negation)
 			flags &= ~bit;
 		else
 			flags |= bit;
 	}
 	zor->zor_flags = flags;
 
 	zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start);
 	zor->zor_obj_end = ZDB_MAP_OBJECT_ID(zor->zor_obj_end);
 
 out:
 	free(dup);
 	return (rc);
 }
 
 static void
 dump_objset(objset_t *os)
 {
 	dmu_objset_stats_t dds = { 0 };
 	uint64_t object, object_count;
 	uint64_t refdbytes, usedobjs, scratch;
 	char numbuf[32];
 	char blkbuf[BP_SPRINTF_LEN + 20];
 	char osname[ZFS_MAX_DATASET_NAME_LEN];
 	const char *type = "UNKNOWN";
 	int verbosity = dump_opt['d'];
 	boolean_t print_header;
 	unsigned i;
 	int error;
 	uint64_t total_slots_used = 0;
 	uint64_t max_slot_used = 0;
 	uint64_t dnode_slots;
 	uint64_t obj_start;
 	uint64_t obj_end;
 	uint64_t flags;
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (numbuf) >= NN_NUMBUF_SZ, "numbuf truncated");
 
 	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 	dmu_objset_fast_stat(os, &dds);
 	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 
 	print_header = B_TRUE;
 
 	if (dds.dds_type < DMU_OST_NUMTYPES)
 		type = objset_types[dds.dds_type];
 
 	if (dds.dds_type == DMU_OST_META) {
 		dds.dds_creation_txg = TXG_INITIAL;
 		usedobjs = BP_GET_FILL(os->os_rootbp);
 		refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
 		    dd_used_bytes;
 	} else {
 		dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
 	}
 
 	ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));
 
 	zdb_nicenum(refdbytes, numbuf, sizeof (numbuf));
 
 	if (verbosity >= 4) {
 		(void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
 		(void) snprintf_blkptr(blkbuf + strlen(blkbuf),
 		    sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
 	} else {
 		blkbuf[0] = '\0';
 	}
 
 	dmu_objset_name(os, osname);
 
 	(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
 	    "%s, %llu objects%s%s\n",
 	    osname, type, (u_longlong_t)dmu_objset_id(os),
 	    (u_longlong_t)dds.dds_creation_txg,
 	    numbuf, (u_longlong_t)usedobjs, blkbuf,
 	    (dds.dds_inconsistent) ? " (inconsistent)" : "");
 
 	for (i = 0; i < zopt_object_args; i++) {
 		obj_start = zopt_object_ranges[i].zor_obj_start;
 		obj_end = zopt_object_ranges[i].zor_obj_end;
 		flags = zopt_object_ranges[i].zor_flags;
 
 		object = obj_start;
 		if (object == 0 || obj_start == obj_end)
 			dump_object(os, object, verbosity, &print_header, NULL,
 			    flags);
 		else
 			object--;
 
 		while ((dmu_object_next(os, &object, B_FALSE, 0) == 0) &&
 		    object <= obj_end) {
 			dump_object(os, object, verbosity, &print_header, NULL,
 			    flags);
 		}
 	}
 
 	if (zopt_object_args > 0) {
 		(void) printf("\n");
 		return;
 	}
 
 	if (dump_opt['i'] != 0 || verbosity >= 2)
 		dump_intent_log(dmu_objset_zil(os));
 
 	if (dmu_objset_ds(os) != NULL) {
 		dsl_dataset_t *ds = dmu_objset_ds(os);
 		dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
 		if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
 		    !dmu_objset_is_snapshot(os)) {
 			dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist");
 			if (verify_dd_livelist(os) != 0)
 				fatal("livelist is incorrect");
 		}
 
 		if (dsl_dataset_remap_deadlist_exists(ds)) {
 			(void) printf("ds_remap_deadlist:\n");
 			dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist");
 		}
 		count_ds_mos_objects(ds);
 	}
 
 	if (dmu_objset_ds(os) != NULL)
 		dump_bookmarks(os, verbosity);
 
 	if (verbosity < 2)
 		return;
 
 	if (BP_IS_HOLE(os->os_rootbp))
 		return;
 
 	dump_object(os, 0, verbosity, &print_header, NULL, 0);
 	object_count = 0;
 	if (DMU_USERUSED_DNODE(os) != NULL &&
 	    DMU_USERUSED_DNODE(os)->dn_type != 0) {
 		dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header,
 		    NULL, 0);
 		dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header,
 		    NULL, 0);
 	}
 
 	if (DMU_PROJECTUSED_DNODE(os) != NULL &&
 	    DMU_PROJECTUSED_DNODE(os)->dn_type != 0)
 		dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity,
 		    &print_header, NULL, 0);
 
 	object = 0;
 	while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
 		dump_object(os, object, verbosity, &print_header, &dnode_slots,
 		    0);
 		object_count++;
 		total_slots_used += dnode_slots;
 		max_slot_used = object + dnode_slots - 1;
 	}
 
 	(void) printf("\n");
 
 	(void) printf("    Dnode slots:\n");
 	(void) printf("\tTotal used:    %10llu\n",
 	    (u_longlong_t)total_slots_used);
 	(void) printf("\tMax used:      %10llu\n",
 	    (u_longlong_t)max_slot_used);
 	(void) printf("\tPercent empty: %10lf\n",
 	    (double)(max_slot_used - total_slots_used)*100 /
 	    (double)max_slot_used);
 	(void) printf("\n");
 
 	if (error != ESRCH) {
 		(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
 		abort();
 	}
 
 	ASSERT3U(object_count, ==, usedobjs);
 
 	if (leaked_objects != 0) {
 		(void) printf("%d potentially leaked objects detected\n",
 		    leaked_objects);
 		leaked_objects = 0;
 	}
 }
 
 static void
 dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
 {
 	time_t timestamp = ub->ub_timestamp;
 
 	(void) printf("%s", header ? header : "");
 	(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
 	(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
 	(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
 	(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
 	(void) printf("\ttimestamp = %llu UTC = %s",
 	    (u_longlong_t)ub->ub_timestamp, ctime(&timestamp));
 
 	(void) printf("\tmmp_magic = %016llx\n",
 	    (u_longlong_t)ub->ub_mmp_magic);
 	if (MMP_VALID(ub)) {
 		(void) printf("\tmmp_delay = %0llu\n",
 		    (u_longlong_t)ub->ub_mmp_delay);
 		if (MMP_SEQ_VALID(ub))
 			(void) printf("\tmmp_seq = %u\n",
 			    (unsigned int) MMP_SEQ(ub));
 		if (MMP_FAIL_INT_VALID(ub))
 			(void) printf("\tmmp_fail = %u\n",
 			    (unsigned int) MMP_FAIL_INT(ub));
 		if (MMP_INTERVAL_VALID(ub))
 			(void) printf("\tmmp_write = %u\n",
 			    (unsigned int) MMP_INTERVAL(ub));
 		/* After MMP_* to make summarize_uberblock_mmp cleaner */
 		(void) printf("\tmmp_valid = %x\n",
 		    (unsigned int) ub->ub_mmp_config & 0xFF);
 	}
 
 	if (dump_opt['u'] >= 4) {
 		char blkbuf[BP_SPRINTF_LEN];
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
 		(void) printf("\trootbp = %s\n", blkbuf);
 	}
 	(void) printf("\tcheckpoint_txg = %llu\n",
 	    (u_longlong_t)ub->ub_checkpoint_txg);
 
 	(void) printf("\traidz_reflow state=%u off=%llu\n",
 	    (int)RRSS_GET_STATE(ub),
 	    (u_longlong_t)RRSS_GET_OFFSET(ub));
 
 	(void) printf("%s", footer ? footer : "");
 }
 
 static void
 dump_config(spa_t *spa)
 {
 	dmu_buf_t *db;
 	size_t nvsize = 0;
 	int error = 0;
 
 
 	error = dmu_bonus_hold(spa->spa_meta_objset,
 	    spa->spa_config_object, FTAG, &db);
 
 	if (error == 0) {
 		nvsize = *(uint64_t *)db->db_data;
 		dmu_buf_rele(db, FTAG);
 
 		(void) printf("\nMOS Configuration:\n");
 		dump_packed_nvlist(spa->spa_meta_objset,
 		    spa->spa_config_object, (void *)&nvsize, 1);
 	} else {
 		(void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d",
 		    (u_longlong_t)spa->spa_config_object, error);
 	}
 }
 
 static void
 dump_cachefile(const char *cachefile)
 {
 	int fd;
 	struct stat64 statbuf;
 	char *buf;
 	nvlist_t *config;
 
 	if ((fd = open64(cachefile, O_RDONLY)) < 0) {
 		(void) printf("cannot open '%s': %s\n", cachefile,
 		    strerror(errno));
 		exit(1);
 	}
 
 	if (fstat64(fd, &statbuf) != 0) {
 		(void) printf("failed to stat '%s': %s\n", cachefile,
 		    strerror(errno));
 		exit(1);
 	}
 
 	if ((buf = malloc(statbuf.st_size)) == NULL) {
 		(void) fprintf(stderr, "failed to allocate %llu bytes\n",
 		    (u_longlong_t)statbuf.st_size);
 		exit(1);
 	}
 
 	if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
 		(void) fprintf(stderr, "failed to read %llu bytes\n",
 		    (u_longlong_t)statbuf.st_size);
 		exit(1);
 	}
 
 	(void) close(fd);
 
 	if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
 		(void) fprintf(stderr, "failed to unpack nvlist\n");
 		exit(1);
 	}
 
 	free(buf);
 
 	dump_nvlist(config, 0);
 
 	nvlist_free(config);
 }
 
 /*
  * ZFS label nvlist stats
  */
 typedef struct zdb_nvl_stats {
 	int		zns_list_count;
 	int		zns_leaf_count;
 	size_t		zns_leaf_largest;
 	size_t		zns_leaf_total;
 	nvlist_t	*zns_string;
 	nvlist_t	*zns_uint64;
 	nvlist_t	*zns_boolean;
 } zdb_nvl_stats_t;
 
 static void
 collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats)
 {
 	nvlist_t *list, **array;
 	nvpair_t *nvp = NULL;
 	const char *name;
 	uint_t i, items;
 
 	stats->zns_list_count++;
 
 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
 		name = nvpair_name(nvp);
 
 		switch (nvpair_type(nvp)) {
 		case DATA_TYPE_STRING:
 			fnvlist_add_string(stats->zns_string, name,
 			    fnvpair_value_string(nvp));
 			break;
 		case DATA_TYPE_UINT64:
 			fnvlist_add_uint64(stats->zns_uint64, name,
 			    fnvpair_value_uint64(nvp));
 			break;
 		case DATA_TYPE_BOOLEAN:
 			fnvlist_add_boolean(stats->zns_boolean, name);
 			break;
 		case DATA_TYPE_NVLIST:
 			if (nvpair_value_nvlist(nvp, &list) == 0)
 				collect_nvlist_stats(list, stats);
 			break;
 		case DATA_TYPE_NVLIST_ARRAY:
 			if (nvpair_value_nvlist_array(nvp, &array, &items) != 0)
 				break;
 
 			for (i = 0; i < items; i++) {
 				collect_nvlist_stats(array[i], stats);
 
 				/* collect stats on leaf vdev */
 				if (strcmp(name, "children") == 0) {
 					size_t size;
 
 					(void) nvlist_size(array[i], &size,
 					    NV_ENCODE_XDR);
 					stats->zns_leaf_total += size;
 					if (size > stats->zns_leaf_largest)
 						stats->zns_leaf_largest = size;
 					stats->zns_leaf_count++;
 				}
 			}
 			break;
 		default:
 			(void) printf("skip type %d!\n", (int)nvpair_type(nvp));
 		}
 	}
 }
 
 static void
 dump_nvlist_stats(nvlist_t *nvl, size_t cap)
 {
 	zdb_nvl_stats_t stats = { 0 };
 	size_t size, sum = 0, total;
 	size_t noise;
 
 	/* requires nvlist with non-unique names for stat collection */
 	VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0));
 	VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0));
 	VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0));
 	VERIFY0(nvlist_size(stats.zns_boolean, &noise, NV_ENCODE_XDR));
 
 	(void) printf("\n\nZFS Label NVList Config Stats:\n");
 
 	VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR));
 	(void) printf("  %d bytes used, %d bytes free (using %4.1f%%)\n\n",
 	    (int)total, (int)(cap - total), 100.0 * total / cap);
 
 	collect_nvlist_stats(nvl, &stats);
 
 	VERIFY0(nvlist_size(stats.zns_uint64, &size, NV_ENCODE_XDR));
 	size -= noise;
 	sum += size;
 	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "integers:",
 	    (int)fnvlist_num_pairs(stats.zns_uint64),
 	    (int)size, 100.0 * size / total);
 
 	VERIFY0(nvlist_size(stats.zns_string, &size, NV_ENCODE_XDR));
 	size -= noise;
 	sum += size;
 	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "strings:",
 	    (int)fnvlist_num_pairs(stats.zns_string),
 	    (int)size, 100.0 * size / total);
 
 	VERIFY0(nvlist_size(stats.zns_boolean, &size, NV_ENCODE_XDR));
 	size -= noise;
 	sum += size;
 	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "booleans:",
 	    (int)fnvlist_num_pairs(stats.zns_boolean),
 	    (int)size, 100.0 * size / total);
 
 	size = total - sum;	/* treat remainder as nvlist overhead */
 	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:",
 	    stats.zns_list_count, (int)size, 100.0 * size / total);
 
 	if (stats.zns_leaf_count > 0) {
 		size_t average = stats.zns_leaf_total / stats.zns_leaf_count;
 
 		(void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:",
 		    stats.zns_leaf_count, (int)average);
 		(void) printf("%24d bytes largest\n",
 		    (int)stats.zns_leaf_largest);
 
 		if (dump_opt['l'] >= 3 && average > 0)
 			(void) printf("  space for %d additional leaf vdevs\n",
 			    (int)((cap - total) / average));
 	}
 	(void) printf("\n");
 
 	nvlist_free(stats.zns_string);
 	nvlist_free(stats.zns_uint64);
 	nvlist_free(stats.zns_boolean);
 }
 
 typedef struct cksum_record {
 	zio_cksum_t cksum;
 	boolean_t labels[VDEV_LABELS];
 	avl_node_t link;
 } cksum_record_t;
 
 static int
 cksum_record_compare(const void *x1, const void *x2)
 {
 	const cksum_record_t *l = (cksum_record_t *)x1;
 	const cksum_record_t *r = (cksum_record_t *)x2;
 	int arraysize = ARRAY_SIZE(l->cksum.zc_word);
 	int difference = 0;
 
 	for (int i = 0; i < arraysize; i++) {
 		difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]);
 		if (difference)
 			break;
 	}
 
 	return (difference);
 }
 
 static cksum_record_t *
 cksum_record_alloc(zio_cksum_t *cksum, int l)
 {
 	cksum_record_t *rec;
 
 	rec = umem_zalloc(sizeof (*rec), UMEM_NOFAIL);
 	rec->cksum = *cksum;
 	rec->labels[l] = B_TRUE;
 
 	return (rec);
 }
 
 static cksum_record_t *
 cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum)
 {
 	cksum_record_t lookup = { .cksum = *cksum };
 	avl_index_t where;
 
 	return (avl_find(tree, &lookup, &where));
 }
 
 static cksum_record_t *
 cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l)
 {
 	cksum_record_t *rec;
 
 	rec = cksum_record_lookup(tree, cksum);
 	if (rec) {
 		rec->labels[l] = B_TRUE;
 	} else {
 		rec = cksum_record_alloc(cksum, l);
 		avl_add(tree, rec);
 	}
 
 	return (rec);
 }
 
 static int
 first_label(cksum_record_t *rec)
 {
 	for (int i = 0; i < VDEV_LABELS; i++)
 		if (rec->labels[i])
 			return (i);
 
 	return (-1);
 }
 
 static void
 print_label_numbers(const char *prefix, const cksum_record_t *rec)
 {
 	fputs(prefix, stdout);
 	for (int i = 0; i < VDEV_LABELS; i++)
 		if (rec->labels[i] == B_TRUE)
 			printf("%d ", i);
 	putchar('\n');
 }
 
 #define	MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT)
 
 typedef struct zdb_label {
 	vdev_label_t label;
 	uint64_t label_offset;
 	nvlist_t *config_nv;
 	cksum_record_t *config;
 	cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT];
 	boolean_t header_printed;
 	boolean_t read_failed;
 	boolean_t cksum_valid;
 } zdb_label_t;
 
 static void
 print_label_header(zdb_label_t *label, int l)
 {
 
 	if (dump_opt['q'])
 		return;
 
 	if (label->header_printed == B_TRUE)
 		return;
 
 	(void) printf("------------------------------------\n");
 	(void) printf("LABEL %d %s\n", l,
 	    label->cksum_valid ? "" : "(Bad label cksum)");
 	(void) printf("------------------------------------\n");
 
 	label->header_printed = B_TRUE;
 }
 
 static void
 print_l2arc_header(void)
 {
 	(void) printf("------------------------------------\n");
 	(void) printf("L2ARC device header\n");
 	(void) printf("------------------------------------\n");
 }
 
 static void
 print_l2arc_log_blocks(void)
 {
 	(void) printf("------------------------------------\n");
 	(void) printf("L2ARC device log blocks\n");
 	(void) printf("------------------------------------\n");
 }
 
 static void
 dump_l2arc_log_entries(uint64_t log_entries,
     l2arc_log_ent_phys_t *le, uint64_t i)
 {
 	for (int j = 0; j < log_entries; j++) {
 		dva_t dva = le[j].le_dva;
 		(void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, "
 		    "vdev: %llu, offset: %llu\n",
 		    (u_longlong_t)i, j + 1,
 		    (u_longlong_t)DVA_GET_ASIZE(&dva),
 		    (u_longlong_t)DVA_GET_VDEV(&dva),
 		    (u_longlong_t)DVA_GET_OFFSET(&dva));
 		(void) printf("|\t\t\t\tbirth: %llu\n",
 		    (u_longlong_t)le[j].le_birth);
 		(void) printf("|\t\t\t\tlsize: %llu\n",
 		    (u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop));
 		(void) printf("|\t\t\t\tpsize: %llu\n",
 		    (u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop));
 		(void) printf("|\t\t\t\tcompr: %llu\n",
 		    (u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop));
 		(void) printf("|\t\t\t\tcomplevel: %llu\n",
 		    (u_longlong_t)(&le[j])->le_complevel);
 		(void) printf("|\t\t\t\ttype: %llu\n",
 		    (u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop));
 		(void) printf("|\t\t\t\tprotected: %llu\n",
 		    (u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop));
 		(void) printf("|\t\t\t\tprefetch: %llu\n",
 		    (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop));
 		(void) printf("|\t\t\t\taddress: %llu\n",
 		    (u_longlong_t)le[j].le_daddr);
 		(void) printf("|\t\t\t\tARC state: %llu\n",
 		    (u_longlong_t)L2BLK_GET_STATE((&le[j])->le_prop));
 		(void) printf("|\n");
 	}
 	(void) printf("\n");
 }
 
 static void
 dump_l2arc_log_blkptr(const l2arc_log_blkptr_t *lbps)
 {
 	(void) printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps->lbp_daddr);
 	(void) printf("|\t\tpayload_asize: %llu\n",
 	    (u_longlong_t)lbps->lbp_payload_asize);
 	(void) printf("|\t\tpayload_start: %llu\n",
 	    (u_longlong_t)lbps->lbp_payload_start);
 	(void) printf("|\t\tlsize: %llu\n",
 	    (u_longlong_t)L2BLK_GET_LSIZE(lbps->lbp_prop));
 	(void) printf("|\t\tasize: %llu\n",
 	    (u_longlong_t)L2BLK_GET_PSIZE(lbps->lbp_prop));
 	(void) printf("|\t\tcompralgo: %llu\n",
 	    (u_longlong_t)L2BLK_GET_COMPRESS(lbps->lbp_prop));
 	(void) printf("|\t\tcksumalgo: %llu\n",
 	    (u_longlong_t)L2BLK_GET_CHECKSUM(lbps->lbp_prop));
 	(void) printf("|\n\n");
 }
 
 static void
 dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr,
     l2arc_dev_hdr_phys_t *rebuild)
 {
 	l2arc_log_blk_phys_t this_lb;
 	uint64_t asize;
 	l2arc_log_blkptr_t lbps[2];
 	abd_t *abd;
 	zio_cksum_t cksum;
 	int failed = 0;
 	l2arc_dev_t dev;
 
 	if (!dump_opt['q'])
 		print_l2arc_log_blocks();
 	memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps));
 
 	dev.l2ad_evict = l2dhdr->dh_evict;
 	dev.l2ad_start = l2dhdr->dh_start;
 	dev.l2ad_end = l2dhdr->dh_end;
 
 	if (l2dhdr->dh_start_lbps[0].lbp_daddr == 0) {
 		/* no log blocks to read */
 		if (!dump_opt['q']) {
 			(void) printf("No log blocks to read\n");
 			(void) printf("\n");
 		}
 		return;
 	} else {
 		dev.l2ad_hand = lbps[0].lbp_daddr +
 		    L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
 	}
 
 	dev.l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
 
 	for (;;) {
 		if (!l2arc_log_blkptr_valid(&dev, &lbps[0]))
 			break;
 
 		/* L2BLK_GET_PSIZE returns aligned size for log blocks */
 		asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
 		if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) {
 			if (!dump_opt['q']) {
 				(void) printf("Error while reading next log "
 				    "block\n\n");
 			}
 			break;
 		}
 
 		fletcher_4_native_varsize(&this_lb, asize, &cksum);
 		if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) {
 			failed++;
 			if (!dump_opt['q']) {
 				(void) printf("Invalid cksum\n");
 				dump_l2arc_log_blkptr(&lbps[0]);
 			}
 			break;
 		}
 
 		switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) {
 		case ZIO_COMPRESS_OFF:
 			break;
 		default:
 			abd = abd_alloc_for_io(asize, B_TRUE);
 			abd_copy_from_buf_off(abd, &this_lb, 0, asize);
 			if (zio_decompress_data(L2BLK_GET_COMPRESS(
 			    (&lbps[0])->lbp_prop), abd, &this_lb,
 			    asize, sizeof (this_lb), NULL) != 0) {
 				(void) printf("L2ARC block decompression "
 				    "failed\n");
 				abd_free(abd);
 				goto out;
 			}
 			abd_free(abd);
 			break;
 		}
 
 		if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
 			byteswap_uint64_array(&this_lb, sizeof (this_lb));
 		if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) {
 			if (!dump_opt['q'])
 				(void) printf("Invalid log block magic\n\n");
 			break;
 		}
 
 		rebuild->dh_lb_count++;
 		rebuild->dh_lb_asize += asize;
 		if (dump_opt['l'] > 1 && !dump_opt['q']) {
 			(void) printf("lb[%4llu]\tmagic: %llu\n",
 			    (u_longlong_t)rebuild->dh_lb_count,
 			    (u_longlong_t)this_lb.lb_magic);
 			dump_l2arc_log_blkptr(&lbps[0]);
 		}
 
 		if (dump_opt['l'] > 2 && !dump_opt['q'])
 			dump_l2arc_log_entries(l2dhdr->dh_log_entries,
 			    this_lb.lb_entries,
 			    rebuild->dh_lb_count);
 
 		if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
 		    lbps[0].lbp_payload_start, dev.l2ad_evict) &&
 		    !dev.l2ad_first)
 			break;
 
 		lbps[0] = lbps[1];
 		lbps[1] = this_lb.lb_prev_lbp;
 	}
 out:
 	if (!dump_opt['q']) {
 		(void) printf("log_blk_count:\t %llu with valid cksum\n",
 		    (u_longlong_t)rebuild->dh_lb_count);
 		(void) printf("\t\t %d with invalid cksum\n", failed);
 		(void) printf("log_blk_asize:\t %llu\n\n",
 		    (u_longlong_t)rebuild->dh_lb_asize);
 	}
 }
 
 static int
 dump_l2arc_header(int fd)
 {
 	l2arc_dev_hdr_phys_t l2dhdr = {0}, rebuild = {0};
 	int error = B_FALSE;
 
 	if (pread64(fd, &l2dhdr, sizeof (l2dhdr),
 	    VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) {
 		error = B_TRUE;
 	} else {
 		if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
 			byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr));
 
 		if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC)
 			error = B_TRUE;
 	}
 
 	if (error) {
 		(void) printf("L2ARC device header not found\n\n");
 		/* Do not return an error here for backward compatibility */
 		return (0);
 	} else if (!dump_opt['q']) {
 		print_l2arc_header();
 
 		(void) printf("    magic: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_magic);
 		(void) printf("    version: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_version);
 		(void) printf("    pool_guid: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_spa_guid);
 		(void) printf("    flags: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_flags);
 		(void) printf("    start_lbps[0]: %llu\n",
 		    (u_longlong_t)
 		    l2dhdr.dh_start_lbps[0].lbp_daddr);
 		(void) printf("    start_lbps[1]: %llu\n",
 		    (u_longlong_t)
 		    l2dhdr.dh_start_lbps[1].lbp_daddr);
 		(void) printf("    log_blk_ent: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_log_entries);
 		(void) printf("    start: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_start);
 		(void) printf("    end: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_end);
 		(void) printf("    evict: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_evict);
 		(void) printf("    lb_asize_refcount: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_lb_asize);
 		(void) printf("    lb_count_refcount: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_lb_count);
 		(void) printf("    trim_action_time: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_trim_action_time);
 		(void) printf("    trim_state: %llu\n\n",
 		    (u_longlong_t)l2dhdr.dh_trim_state);
 	}
 
 	dump_l2arc_log_blocks(fd, &l2dhdr, &rebuild);
 	/*
 	 * The total aligned size of log blocks and the number of log blocks
 	 * reported in the header of the device may be less than what zdb
 	 * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild().
 	 * This happens because dump_l2arc_log_blocks() lacks the memory
 	 * pressure valve that l2arc_rebuild() has. Thus, if we are on a system
 	 * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize
 	 * and dh_lb_count will be lower to begin with than what exists on the
 	 * device. This is normal and zdb should not exit with an error. The
 	 * opposite case should never happen though, the values reported in the
 	 * header should never be higher than what dump_l2arc_log_blocks() and
 	 * l2arc_rebuild() report. If this happens there is a leak in the
 	 * accounting of log blocks.
 	 */
 	if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize ||
 	    l2dhdr.dh_lb_count > rebuild.dh_lb_count)
 		return (1);
 
 	return (0);
 }
 
 static void
 dump_config_from_label(zdb_label_t *label, size_t buflen, int l)
 {
 	if (dump_opt['q'])
 		return;
 
 	if ((dump_opt['l'] < 3) && (first_label(label->config) != l))
 		return;
 
 	print_label_header(label, l);
 	dump_nvlist(label->config_nv, 4);
 	print_label_numbers("    labels = ", label->config);
 
 	if (dump_opt['l'] >= 2)
 		dump_nvlist_stats(label->config_nv, buflen);
 }
 
 #define	ZDB_MAX_UB_HEADER_SIZE 32
 
 static void
 dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num)
 {
 
 	vdev_t vd;
 	char header[ZDB_MAX_UB_HEADER_SIZE];
 
 	vd.vdev_ashift = ashift;
 	vd.vdev_top = &vd;
 
 	for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) {
 		uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i);
 		uberblock_t *ub = (void *)((char *)&label->label + uoff);
 		cksum_record_t *rec = label->uberblocks[i];
 
 		if (rec == NULL) {
 			if (dump_opt['u'] >= 2) {
 				print_label_header(label, label_num);
 				(void) printf("    Uberblock[%d] invalid\n", i);
 			}
 			continue;
 		}
 
 		if ((dump_opt['u'] < 3) && (first_label(rec) != label_num))
 			continue;
 
 		if ((dump_opt['u'] < 4) &&
 		    (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay &&
 		    (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL))
 			continue;
 
 		print_label_header(label, label_num);
 		(void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,
 		    "    Uberblock[%d]\n", i);
 		dump_uberblock(ub, header, "");
 		print_label_numbers("        labels = ", rec);
 	}
 }
 
 static char curpath[PATH_MAX];
 
 /*
  * Iterate through the path components, recursively passing
  * current one's obj and remaining path until we find the obj
  * for the last one.
  */
 static int
 dump_path_impl(objset_t *os, uint64_t obj, char *name, uint64_t *retobj)
 {
 	int err;
 	boolean_t header = B_TRUE;
 	uint64_t child_obj;
 	char *s;
 	dmu_buf_t *db;
 	dmu_object_info_t doi;
 
 	if ((s = strchr(name, '/')) != NULL)
 		*s = '\0';
 	err = zap_lookup(os, obj, name, 8, 1, &child_obj);
 
 	(void) strlcat(curpath, name, sizeof (curpath));
 
 	if (err != 0) {
 		(void) fprintf(stderr, "failed to lookup %s: %s\n",
 		    curpath, strerror(err));
 		return (err);
 	}
 
 	child_obj = ZFS_DIRENT_OBJ(child_obj);
 	err = sa_buf_hold(os, child_obj, FTAG, &db);
 	if (err != 0) {
 		(void) fprintf(stderr,
 		    "failed to get SA dbuf for obj %llu: %s\n",
 		    (u_longlong_t)child_obj, strerror(err));
 		return (EINVAL);
 	}
 	dmu_object_info_from_db(db, &doi);
 	sa_buf_rele(db, FTAG);
 
 	if (doi.doi_bonus_type != DMU_OT_SA &&
 	    doi.doi_bonus_type != DMU_OT_ZNODE) {
 		(void) fprintf(stderr, "invalid bonus type %d for obj %llu\n",
 		    doi.doi_bonus_type, (u_longlong_t)child_obj);
 		return (EINVAL);
 	}
 
 	if (dump_opt['v'] > 6) {
 		(void) printf("obj=%llu %s type=%d bonustype=%d\n",
 		    (u_longlong_t)child_obj, curpath, doi.doi_type,
 		    doi.doi_bonus_type);
 	}
 
 	(void) strlcat(curpath, "/", sizeof (curpath));
 
 	switch (doi.doi_type) {
 	case DMU_OT_DIRECTORY_CONTENTS:
 		if (s != NULL && *(s + 1) != '\0')
 			return (dump_path_impl(os, child_obj, s + 1, retobj));
 		zfs_fallthrough;
 	case DMU_OT_PLAIN_FILE_CONTENTS:
 		if (retobj != NULL) {
 			*retobj = child_obj;
 		} else {
 			dump_object(os, child_obj, dump_opt['v'], &header,
 			    NULL, 0);
 		}
 		return (0);
 	default:
 		(void) fprintf(stderr, "object %llu has non-file/directory "
 		    "type %d\n", (u_longlong_t)obj, doi.doi_type);
 		break;
 	}
 
 	return (EINVAL);
 }
 
 /*
  * Dump the blocks for the object specified by path inside the dataset.
  */
 static int
 dump_path(char *ds, char *path, uint64_t *retobj)
 {
 	int err;
 	objset_t *os;
 	uint64_t root_obj;
 
 	err = open_objset(ds, FTAG, &os);
 	if (err != 0)
 		return (err);
 
 	err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj);
 	if (err != 0) {
 		(void) fprintf(stderr, "can't lookup root znode: %s\n",
 		    strerror(err));
 		close_objset(os, FTAG);
 		return (EINVAL);
 	}
 
 	(void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds);
 
 	err = dump_path_impl(os, root_obj, path, retobj);
 
 	close_objset(os, FTAG);
 	return (err);
 }
 
 static int
 dump_backup_bytes(objset_t *os, void *buf, int len, void *arg)
 {
 	const char *p = (const char *)buf;
 	ssize_t nwritten;
 
 	(void) os;
 	(void) arg;
 
 	/* Write the data out, handling short writes and signals. */
 	while ((nwritten = write(STDOUT_FILENO, p, len)) < len) {
 		if (nwritten < 0) {
 			if (errno == EINTR)
 				continue;
 			return (errno);
 		}
 		p += nwritten;
 		len -= nwritten;
 	}
 
 	return (0);
 }
 
 static void
 dump_backup(const char *pool, uint64_t objset_id, const char *flagstr)
 {
 	boolean_t embed = B_FALSE;
 	boolean_t large_block = B_FALSE;
 	boolean_t compress = B_FALSE;
 	boolean_t raw = B_FALSE;
 
 	const char *c;
 	for (c = flagstr; c != NULL && *c != '\0'; c++) {
 		switch (*c) {
 			case 'e':
 				embed = B_TRUE;
 				break;
 			case 'L':
 				large_block = B_TRUE;
 				break;
 			case 'c':
 				compress = B_TRUE;
 				break;
 			case 'w':
 				raw = B_TRUE;
 				break;
 			default:
 				fprintf(stderr, "dump_backup: invalid flag "
 				    "'%c'\n", *c);
 				return;
 		}
 	}
 
 	if (isatty(STDOUT_FILENO)) {
 		fprintf(stderr, "dump_backup: stream cannot be written "
 		    "to a terminal\n");
 		return;
 	}
 
 	offset_t off = 0;
 	dmu_send_outparams_t out = {
 	    .dso_outfunc = dump_backup_bytes,
 	    .dso_dryrun  = B_FALSE,
 	};
 
 	int err = dmu_send_obj(pool, objset_id, /* fromsnap */0, embed,
 	    large_block, compress, raw, /* saved */ B_FALSE, STDOUT_FILENO,
 	    &off, &out);
 	if (err != 0) {
 		fprintf(stderr, "dump_backup: dmu_send_obj: %s\n",
 		    strerror(err));
 		return;
 	}
 }
 
 static int
 zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile)
 {
 	int err = 0;
 	uint64_t size, readsize, oursize, offset;
 	ssize_t writesize;
 	sa_handle_t *hdl;
 
 	(void) printf("Copying object %" PRIu64 " to file %s\n", srcobj,
 	    destfile);
 
 	VERIFY3P(os, ==, sa_os);
 	if ((err = sa_handle_get(os, srcobj, NULL, SA_HDL_PRIVATE, &hdl))) {
 		(void) printf("Failed to get handle for SA znode\n");
 		return (err);
 	}
 	if ((err = sa_lookup(hdl, sa_attr_table[ZPL_SIZE], &size, 8))) {
 		(void) sa_handle_destroy(hdl);
 		return (err);
 	}
 	(void) sa_handle_destroy(hdl);
 
 	(void) printf("Object %" PRIu64 " is %" PRIu64 " bytes\n", srcobj,
 	    size);
 	if (size == 0) {
 		return (EINVAL);
 	}
 
 	int fd = open(destfile, O_WRONLY | O_CREAT | O_TRUNC, 0644);
 	if (fd == -1)
 		return (errno);
 	/*
 	 * We cap the size at 1 mebibyte here to prevent
 	 * allocation failures and nigh-infinite printing if the
 	 * object is extremely large.
 	 */
 	oursize = MIN(size, 1 << 20);
 	offset = 0;
 	char *buf = kmem_alloc(oursize, KM_NOSLEEP);
 	if (buf == NULL) {
 		(void) close(fd);
 		return (ENOMEM);
 	}
 
 	while (offset < size) {
 		readsize = MIN(size - offset, 1 << 20);
 		err = dmu_read(os, srcobj, offset, readsize, buf, 0);
 		if (err != 0) {
 			(void) printf("got error %u from dmu_read\n", err);
 			kmem_free(buf, oursize);
 			(void) close(fd);
 			return (err);
 		}
 		if (dump_opt['v'] > 3) {
 			(void) printf("Read offset=%" PRIu64 " size=%" PRIu64
 			    " error=%d\n", offset, readsize, err);
 		}
 
 		writesize = write(fd, buf, readsize);
 		if (writesize < 0) {
 			err = errno;
 			break;
 		} else if (writesize != readsize) {
 			/* Incomplete write */
 			(void) fprintf(stderr, "Short write, only wrote %llu of"
 			    " %" PRIu64 " bytes, exiting...\n",
 			    (u_longlong_t)writesize, readsize);
 			break;
 		}
 
 		offset += readsize;
 	}
 
 	(void) close(fd);
 
 	if (buf != NULL)
 		kmem_free(buf, oursize);
 
 	return (err);
 }
 
 static boolean_t
 label_cksum_valid(vdev_label_t *label, uint64_t offset)
 {
 	zio_checksum_info_t *ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL];
 	zio_cksum_t expected_cksum;
 	zio_cksum_t actual_cksum;
 	zio_cksum_t verifier;
 	zio_eck_t *eck;
 	int byteswap;
 
 	void *data = (char *)label + offsetof(vdev_label_t, vl_vdev_phys);
 	eck = (zio_eck_t *)((char *)(data) + VDEV_PHYS_SIZE) - 1;
 
 	offset += offsetof(vdev_label_t, vl_vdev_phys);
 	ZIO_SET_CHECKSUM(&verifier, offset, 0, 0, 0);
 
 	byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
 	if (byteswap)
 		byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
 
 	expected_cksum = eck->zec_cksum;
 	eck->zec_cksum = verifier;
 
 	abd_t *abd = abd_get_from_buf(data, VDEV_PHYS_SIZE);
 	ci->ci_func[byteswap](abd, VDEV_PHYS_SIZE, NULL, &actual_cksum);
 	abd_free(abd);
 
 	if (byteswap)
 		byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t));
 
 	if (ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 static int
 dump_label(const char *dev)
 {
 	char path[MAXPATHLEN];
 	zdb_label_t labels[VDEV_LABELS] = {{{{0}}}};
 	uint64_t psize, ashift, l2cache;
 	struct stat64 statbuf;
 	boolean_t config_found = B_FALSE;
 	boolean_t error = B_FALSE;
 	boolean_t read_l2arc_header = B_FALSE;
 	avl_tree_t config_tree;
 	avl_tree_t uberblock_tree;
 	void *node, *cookie;
 	int fd;
 
 	/*
 	 * Check if we were given absolute path and use it as is.
 	 * Otherwise if the provided vdev name doesn't point to a file,
 	 * try prepending expected disk paths and partition numbers.
 	 */
 	(void) strlcpy(path, dev, sizeof (path));
 	if (dev[0] != '/' && stat64(path, &statbuf) != 0) {
 		int error;
 
 		error = zfs_resolve_shortname(dev, path, MAXPATHLEN);
 		if (error == 0 && zfs_dev_is_whole_disk(path)) {
 			if (zfs_append_partition(path, MAXPATHLEN) == -1)
 				error = ENOENT;
 		}
 
 		if (error || (stat64(path, &statbuf) != 0)) {
 			(void) printf("failed to find device %s, try "
 			    "specifying absolute path instead\n", dev);
 			return (1);
 		}
 	}
 
 	if ((fd = open64(path, O_RDONLY)) < 0) {
 		(void) printf("cannot open '%s': %s\n", path, strerror(errno));
 		exit(1);
 	}
 
 	if (fstat64_blk(fd, &statbuf) != 0) {
 		(void) printf("failed to stat '%s': %s\n", path,
 		    strerror(errno));
 		(void) close(fd);
 		exit(1);
 	}
 
 	if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0)
 		(void) printf("failed to invalidate cache '%s' : %s\n", path,
 		    strerror(errno));
 
 	avl_create(&config_tree, cksum_record_compare,
 	    sizeof (cksum_record_t), offsetof(cksum_record_t, link));
 	avl_create(&uberblock_tree, cksum_record_compare,
 	    sizeof (cksum_record_t), offsetof(cksum_record_t, link));
 
 	psize = statbuf.st_size;
 	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
 	ashift = SPA_MINBLOCKSHIFT;
 
 	/*
 	 * 1. Read the label from disk
 	 * 2. Verify label cksum
 	 * 3. Unpack the configuration and insert in config tree.
 	 * 4. Traverse all uberblocks and insert in uberblock tree.
 	 */
 	for (int l = 0; l < VDEV_LABELS; l++) {
 		zdb_label_t *label = &labels[l];
 		char *buf = label->label.vl_vdev_phys.vp_nvlist;
 		size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);
 		nvlist_t *config;
 		cksum_record_t *rec;
 		zio_cksum_t cksum;
 		vdev_t vd;
 
 		label->label_offset = vdev_label_offset(psize, l, 0);
 
 		if (pread64(fd, &label->label, sizeof (label->label),
 		    label->label_offset) != sizeof (label->label)) {
 			if (!dump_opt['q'])
 				(void) printf("failed to read label %d\n", l);
 			label->read_failed = B_TRUE;
 			error = B_TRUE;
 			continue;
 		}
 
 		label->read_failed = B_FALSE;
 		label->cksum_valid = label_cksum_valid(&label->label,
 		    label->label_offset);
 
 		if (nvlist_unpack(buf, buflen, &config, 0) == 0) {
 			nvlist_t *vdev_tree = NULL;
 			size_t size;
 
 			if ((nvlist_lookup_nvlist(config,
 			    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
 			    (nvlist_lookup_uint64(vdev_tree,
 			    ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
 				ashift = SPA_MINBLOCKSHIFT;
 
 			if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0)
 				size = buflen;
 
 			/* If the device is a cache device read the header. */
 			if (!read_l2arc_header) {
 				if (nvlist_lookup_uint64(config,
 				    ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 &&
 				    l2cache == POOL_STATE_L2CACHE) {
 					read_l2arc_header = B_TRUE;
 				}
 			}
 
 			fletcher_4_native_varsize(buf, size, &cksum);
 			rec = cksum_record_insert(&config_tree, &cksum, l);
 
 			label->config = rec;
 			label->config_nv = config;
 			config_found = B_TRUE;
 		} else {
 			error = B_TRUE;
 		}
 
 		vd.vdev_ashift = ashift;
 		vd.vdev_top = &vd;
 
 		for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) {
 			uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i);
 			uberblock_t *ub = (void *)((char *)label + uoff);
 
 			if (uberblock_verify(ub))
 				continue;
 
 			fletcher_4_native_varsize(ub, sizeof (*ub), &cksum);
 			rec = cksum_record_insert(&uberblock_tree, &cksum, l);
 
 			label->uberblocks[i] = rec;
 		}
 	}
 
 	/*
 	 * Dump the label and uberblocks.
 	 */
 	for (int l = 0; l < VDEV_LABELS; l++) {
 		zdb_label_t *label = &labels[l];
 		size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);
 
 		if (label->read_failed == B_TRUE)
 			continue;
 
 		if (label->config_nv) {
 			dump_config_from_label(label, buflen, l);
 		} else {
 			if (!dump_opt['q'])
 				(void) printf("failed to unpack label %d\n", l);
 		}
 
 		if (dump_opt['u'])
 			dump_label_uberblocks(label, ashift, l);
 
 		nvlist_free(label->config_nv);
 	}
 
 	/*
 	 * Dump the L2ARC header, if existent.
 	 */
 	if (read_l2arc_header)
 		error |= dump_l2arc_header(fd);
 
 	cookie = NULL;
 	while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL)
 		umem_free(node, sizeof (cksum_record_t));
 
 	cookie = NULL;
 	while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL)
 		umem_free(node, sizeof (cksum_record_t));
 
 	avl_destroy(&config_tree);
 	avl_destroy(&uberblock_tree);
 
 	(void) close(fd);
 
 	return (config_found == B_FALSE ? 2 :
 	    (error == B_TRUE ? 1 : 0));
 }
 
 static uint64_t dataset_feature_count[SPA_FEATURES];
 static uint64_t global_feature_count[SPA_FEATURES];
 static uint64_t remap_deadlist_count = 0;
 
 static int
 dump_one_objset(const char *dsname, void *arg)
 {
 	(void) arg;
 	int error;
 	objset_t *os;
 	spa_feature_t f;
 
 	error = open_objset(dsname, FTAG, &os);
 	if (error != 0)
 		return (0);
 
 	for (f = 0; f < SPA_FEATURES; f++) {
 		if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f))
 			continue;
 		ASSERT(spa_feature_table[f].fi_flags &
 		    ZFEATURE_FLAG_PER_DATASET);
 		dataset_feature_count[f]++;
 	}
 
 	if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) {
 		remap_deadlist_count++;
 	}
 
 	for (dsl_bookmark_node_t *dbn =
 	    avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL;
 	    dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) {
 		mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj);
 		if (dbn->dbn_phys.zbm_redaction_obj != 0) {
 			global_feature_count[
 			    SPA_FEATURE_REDACTION_BOOKMARKS]++;
 			objset_t *mos = os->os_spa->spa_meta_objset;
 			dnode_t *rl;
 			VERIFY0(dnode_hold(mos,
 			    dbn->dbn_phys.zbm_redaction_obj, FTAG, &rl));
 			if (rl->dn_have_spill) {
 				global_feature_count[
 				    SPA_FEATURE_REDACTION_LIST_SPILL]++;
 			}
 		}
 		if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)
 			global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++;
 	}
 
 	if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) &&
 	    !dmu_objset_is_snapshot(os)) {
 		global_feature_count[SPA_FEATURE_LIVELIST]++;
 	}
 
 	dump_objset(os);
 	close_objset(os, FTAG);
 	fuid_table_destroy();
 	return (0);
 }
 
 /*
  * Block statistics.
  */
 #define	PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
 typedef struct zdb_blkstats {
 	uint64_t zb_asize;
 	uint64_t zb_lsize;
 	uint64_t zb_psize;
 	uint64_t zb_count;
 	uint64_t zb_gangs;
 	uint64_t zb_ditto_samevdev;
 	uint64_t zb_ditto_same_ms;
 	uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
 } zdb_blkstats_t;
 
 /*
  * Extended object types to report deferred frees and dedup auto-ditto blocks.
  */
 #define	ZDB_OT_DEFERRED	(DMU_OT_NUMTYPES + 0)
 #define	ZDB_OT_DITTO	(DMU_OT_NUMTYPES + 1)
 #define	ZDB_OT_OTHER	(DMU_OT_NUMTYPES + 2)
 #define	ZDB_OT_TOTAL	(DMU_OT_NUMTYPES + 3)
 
 static const char *zdb_ot_extname[] = {
 	"deferred free",
 	"dedup ditto",
 	"other",
 	"Total",
 };
 
 #define	ZB_TOTAL	DN_MAX_LEVELS
 #define	SPA_MAX_FOR_16M	(SPA_MAXBLOCKSHIFT+1)
 
 typedef struct zdb_brt_entry {
 	dva_t		zbre_dva;
 	uint64_t	zbre_refcount;
 	avl_node_t	zbre_node;
 } zdb_brt_entry_t;
 
 typedef struct zdb_cb {
 	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
 	uint64_t	zcb_removing_size;
 	uint64_t	zcb_checkpoint_size;
 	uint64_t	zcb_dedup_asize;
 	uint64_t	zcb_dedup_blocks;
 	uint64_t	zcb_clone_asize;
 	uint64_t	zcb_clone_blocks;
 	uint64_t	zcb_psize_count[SPA_MAX_FOR_16M];
 	uint64_t	zcb_lsize_count[SPA_MAX_FOR_16M];
 	uint64_t	zcb_asize_count[SPA_MAX_FOR_16M];
 	uint64_t	zcb_psize_len[SPA_MAX_FOR_16M];
 	uint64_t	zcb_lsize_len[SPA_MAX_FOR_16M];
 	uint64_t	zcb_asize_len[SPA_MAX_FOR_16M];
 	uint64_t	zcb_psize_total;
 	uint64_t	zcb_lsize_total;
 	uint64_t	zcb_asize_total;
 	uint64_t	zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
 	uint64_t	zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
 	    [BPE_PAYLOAD_SIZE + 1];
 	uint64_t	zcb_start;
 	hrtime_t	zcb_lastprint;
 	uint64_t	zcb_totalasize;
 	uint64_t	zcb_errors[256];
 	int		zcb_readfails;
 	int		zcb_haderrors;
 	spa_t		*zcb_spa;
 	uint32_t	**zcb_vd_obsolete_counts;
 	avl_tree_t	zcb_brt;
 	boolean_t	zcb_brt_is_active;
 } zdb_cb_t;
 
 /* test if two DVA offsets from same vdev are within the same metaslab */
 static boolean_t
 same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2)
 {
 	vdev_t *vd = vdev_lookup_top(spa, vdev);
 	uint64_t ms_shift = vd->vdev_ms_shift;
 
 	return ((off1 >> ms_shift) == (off2 >> ms_shift));
 }
 
 /*
  * Used to simplify reporting of the histogram data.
  */
 typedef struct one_histo {
 	const char *name;
 	uint64_t *count;
 	uint64_t *len;
 	uint64_t cumulative;
 } one_histo_t;
 
 /*
  * The number of separate histograms processed for psize, lsize and asize.
  */
 #define	NUM_HISTO 3
 
 /*
  * This routine will create a fixed column size output of three different
  * histograms showing by blocksize of 512 - 2^ SPA_MAX_FOR_16M
  * the count, length and cumulative length of the psize, lsize and
  * asize blocks.
  *
  * All three types of blocks are listed on a single line
  *
  * By default the table is printed in nicenumber format (e.g. 123K) but
  * if the '-P' parameter is specified then the full raw number (parseable)
  * is printed out.
  */
 static void
 dump_size_histograms(zdb_cb_t *zcb)
 {
 	/*
 	 * A temporary buffer that allows us to convert a number into
 	 * a string using zdb_nicenumber to allow either raw or human
 	 * readable numbers to be output.
 	 */
 	char numbuf[32];
 
 	/*
 	 * Define titles which are used in the headers of the tables
 	 * printed by this routine.
 	 */
 	const char blocksize_title1[] = "block";
 	const char blocksize_title2[] = "size";
 	const char count_title[] = "Count";
 	const char length_title[] = "Size";
 	const char cumulative_title[] = "Cum.";
 
 	/*
 	 * Setup the histogram arrays (psize, lsize, and asize).
 	 */
 	one_histo_t parm_histo[NUM_HISTO];
 
 	parm_histo[0].name = "psize";
 	parm_histo[0].count = zcb->zcb_psize_count;
 	parm_histo[0].len = zcb->zcb_psize_len;
 	parm_histo[0].cumulative = 0;
 
 	parm_histo[1].name = "lsize";
 	parm_histo[1].count = zcb->zcb_lsize_count;
 	parm_histo[1].len = zcb->zcb_lsize_len;
 	parm_histo[1].cumulative = 0;
 
 	parm_histo[2].name = "asize";
 	parm_histo[2].count = zcb->zcb_asize_count;
 	parm_histo[2].len = zcb->zcb_asize_len;
 	parm_histo[2].cumulative = 0;
 
 
 	(void) printf("\nBlock Size Histogram\n");
 	/*
 	 * Print the first line titles
 	 */
 	if (dump_opt['P'])
 		(void) printf("\n%s\t", blocksize_title1);
 	else
 		(void) printf("\n%7s   ", blocksize_title1);
 
 	for (int j = 0; j < NUM_HISTO; j++) {
 		if (dump_opt['P']) {
 			if (j < NUM_HISTO - 1) {
 				(void) printf("%s\t\t\t", parm_histo[j].name);
 			} else {
 				/* Don't print trailing spaces */
 				(void) printf("  %s", parm_histo[j].name);
 			}
 		} else {
 			if (j < NUM_HISTO - 1) {
 				/* Left aligned strings in the output */
 				(void) printf("%-7s              ",
 				    parm_histo[j].name);
 			} else {
 				/* Don't print trailing spaces */
 				(void) printf("%s", parm_histo[j].name);
 			}
 		}
 	}
 	(void) printf("\n");
 
 	/*
 	 * Print the second line titles
 	 */
 	if (dump_opt['P']) {
 		(void) printf("%s\t", blocksize_title2);
 	} else {
 		(void) printf("%7s ", blocksize_title2);
 	}
 
 	for (int i = 0; i < NUM_HISTO; i++) {
 		if (dump_opt['P']) {
 			(void) printf("%s\t%s\t%s\t",
 			    count_title, length_title, cumulative_title);
 		} else {
 			(void) printf("%7s%7s%7s",
 			    count_title, length_title, cumulative_title);
 		}
 	}
 	(void) printf("\n");
 
 	/*
 	 * Print the rows
 	 */
 	for (int i = SPA_MINBLOCKSHIFT; i < SPA_MAX_FOR_16M; i++) {
 
 		/*
 		 * Print the first column showing the blocksize
 		 */
 		zdb_nicenum((1ULL << i), numbuf, sizeof (numbuf));
 
 		if (dump_opt['P']) {
 			printf("%s", numbuf);
 		} else {
 			printf("%7s:", numbuf);
 		}
 
 		/*
 		 * Print the remaining set of 3 columns per size:
 		 * for psize, lsize and asize
 		 */
 		for (int j = 0; j < NUM_HISTO; j++) {
 			parm_histo[j].cumulative += parm_histo[j].len[i];
 
 			zdb_nicenum(parm_histo[j].count[i],
 			    numbuf, sizeof (numbuf));
 			if (dump_opt['P'])
 				(void) printf("\t%s", numbuf);
 			else
 				(void) printf("%7s", numbuf);
 
 			zdb_nicenum(parm_histo[j].len[i],
 			    numbuf, sizeof (numbuf));
 			if (dump_opt['P'])
 				(void) printf("\t%s", numbuf);
 			else
 				(void) printf("%7s", numbuf);
 
 			zdb_nicenum(parm_histo[j].cumulative,
 			    numbuf, sizeof (numbuf));
 			if (dump_opt['P'])
 				(void) printf("\t%s", numbuf);
 			else
 				(void) printf("%7s", numbuf);
 		}
 		(void) printf("\n");
 	}
 }
 
 static void
 zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
     dmu_object_type_t type)
 {
 	uint64_t refcnt = 0;
 	int i;
 
 	ASSERT(type < ZDB_OT_TOTAL);
 
 	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
 		return;
 
 	spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
 
 	for (i = 0; i < 4; i++) {
 		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
 		int t = (i & 1) ? type : ZDB_OT_TOTAL;
 		int equal;
 		zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
 
 		zb->zb_asize += BP_GET_ASIZE(bp);
 		zb->zb_lsize += BP_GET_LSIZE(bp);
 		zb->zb_psize += BP_GET_PSIZE(bp);
 		zb->zb_count++;
 
 		/*
 		 * The histogram is only big enough to record blocks up to
 		 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
 		 * "other", bucket.
 		 */
 		unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
 		idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
 		zb->zb_psize_histogram[idx]++;
 
 		zb->zb_gangs += BP_COUNT_GANG(bp);
 
 		switch (BP_GET_NDVAS(bp)) {
 		case 2:
 			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[1])) {
 				zb->zb_ditto_samevdev++;
 
 				if (same_metaslab(zcb->zcb_spa,
 				    DVA_GET_VDEV(&bp->blk_dva[0]),
 				    DVA_GET_OFFSET(&bp->blk_dva[0]),
 				    DVA_GET_OFFSET(&bp->blk_dva[1])))
 					zb->zb_ditto_same_ms++;
 			}
 			break;
 		case 3:
 			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[1])) +
 			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[2])) +
 			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[2]));
 			if (equal != 0) {
 				zb->zb_ditto_samevdev++;
 
 				if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 				    DVA_GET_VDEV(&bp->blk_dva[1]) &&
 				    same_metaslab(zcb->zcb_spa,
 				    DVA_GET_VDEV(&bp->blk_dva[0]),
 				    DVA_GET_OFFSET(&bp->blk_dva[0]),
 				    DVA_GET_OFFSET(&bp->blk_dva[1])))
 					zb->zb_ditto_same_ms++;
 				else if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 				    DVA_GET_VDEV(&bp->blk_dva[2]) &&
 				    same_metaslab(zcb->zcb_spa,
 				    DVA_GET_VDEV(&bp->blk_dva[0]),
 				    DVA_GET_OFFSET(&bp->blk_dva[0]),
 				    DVA_GET_OFFSET(&bp->blk_dva[2])))
 					zb->zb_ditto_same_ms++;
 				else if (DVA_GET_VDEV(&bp->blk_dva[1]) ==
 				    DVA_GET_VDEV(&bp->blk_dva[2]) &&
 				    same_metaslab(zcb->zcb_spa,
 				    DVA_GET_VDEV(&bp->blk_dva[1]),
 				    DVA_GET_OFFSET(&bp->blk_dva[1]),
 				    DVA_GET_OFFSET(&bp->blk_dva[2])))
 					zb->zb_ditto_same_ms++;
 			}
 			break;
 		}
 	}
 
 	spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG);
 
 	if (BP_IS_EMBEDDED(bp)) {
 		zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
 		zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
 		    [BPE_GET_PSIZE(bp)]++;
 		return;
 	}
 	/*
 	 * The binning histogram bins by powers of two up to
 	 * SPA_MAXBLOCKSIZE rather than creating bins for
 	 * every possible blocksize found in the pool.
 	 */
 	int bin = highbit64(BP_GET_PSIZE(bp)) - 1;
 
 	zcb->zcb_psize_count[bin]++;
 	zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp);
 	zcb->zcb_psize_total += BP_GET_PSIZE(bp);
 
 	bin = highbit64(BP_GET_LSIZE(bp)) - 1;
 
 	zcb->zcb_lsize_count[bin]++;
 	zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp);
 	zcb->zcb_lsize_total += BP_GET_LSIZE(bp);
 
 	bin = highbit64(BP_GET_ASIZE(bp)) - 1;
 
 	zcb->zcb_asize_count[bin]++;
 	zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
 	zcb->zcb_asize_total += BP_GET_ASIZE(bp);
 
 	if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) {
 		/*
 		 * Cloned blocks are special. We need to count them, so we can
 		 * later uncount them when reporting leaked space, and we must
 		 * only claim them them once.
 		 *
 		 * To do this, we keep our own in-memory BRT. For each block
 		 * we haven't seen before, we look it up in the real BRT and
 		 * if its there, we note it and its refcount then proceed as
 		 * normal. If we see the block again, we count it as a clone
 		 * and then give it no further consideration.
 		 */
 		zdb_brt_entry_t zbre_search, *zbre;
 		avl_index_t where;
 
 		zbre_search.zbre_dva = bp->blk_dva[0];
 		zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
 		if (zbre != NULL) {
 			zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
 			zcb->zcb_clone_blocks++;
 
 			zbre->zbre_refcount--;
 			if (zbre->zbre_refcount == 0) {
 				avl_remove(&zcb->zcb_brt, zbre);
 				umem_free(zbre, sizeof (zdb_brt_entry_t));
 			}
 			return;
 		}
 
 		uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp);
 		if (crefcnt > 0) {
 			zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
 			    UMEM_NOFAIL);
 			zbre->zbre_dva = bp->blk_dva[0];
 			zbre->zbre_refcount = crefcnt;
 			avl_insert(&zcb->zcb_brt, zbre, where);
 		}
 	}
 
 	if (dump_opt['L'])
 		return;
 
 	if (BP_GET_DEDUP(bp)) {
 		ddt_t *ddt;
 		ddt_entry_t *dde;
 
 		ddt = ddt_select(zcb->zcb_spa, bp);
 		ddt_enter(ddt);
 		dde = ddt_lookup(ddt, bp, B_FALSE);
 
 		if (dde == NULL) {
 			refcnt = 0;
 		} else {
 			ddt_phys_t *ddp = ddt_phys_select(dde, bp);
 			ddt_phys_decref(ddp);
 			refcnt = ddp->ddp_refcnt;
 			if (ddt_phys_total_refcnt(dde) == 0)
 				ddt_remove(ddt, dde);
 		}
 		ddt_exit(ddt);
 	}
 
 	VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
 	    refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
 	    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
 }
 
 static void
 zdb_blkptr_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	int ioerr = zio->io_error;
 	zdb_cb_t *zcb = zio->io_private;
 	zbookmark_phys_t *zb = &zio->io_bookmark;
 
 	mutex_enter(&spa->spa_scrub_lock);
 	spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
 	cv_broadcast(&spa->spa_scrub_io_cv);
 
 	if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
 		char blkbuf[BP_SPRINTF_LEN];
 
 		zcb->zcb_haderrors = 1;
 		zcb->zcb_errors[ioerr]++;
 
 		if (dump_opt['b'] >= 2)
 			snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 		else
 			blkbuf[0] = '\0';
 
 		(void) printf("zdb_blkptr_cb: "
 		    "Got error %d reading "
 		    "<%llu, %llu, %lld, %llx> %s -- skipping\n",
 		    ioerr,
 		    (u_longlong_t)zb->zb_objset,
 		    (u_longlong_t)zb->zb_object,
 		    (u_longlong_t)zb->zb_level,
 		    (u_longlong_t)zb->zb_blkid,
 		    blkbuf);
 	}
 	mutex_exit(&spa->spa_scrub_lock);
 
 	abd_free(zio->io_abd);
 }
 
 static int
 zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	zdb_cb_t *zcb = arg;
 	dmu_object_type_t type;
 	boolean_t is_metadata;
 
 	if (zb->zb_level == ZB_DNODE_LEVEL)
 		return (0);
 
-	if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
+	if (dump_opt['b'] >= 5 && BP_GET_LOGICAL_BIRTH(bp) > 0) {
 		char blkbuf[BP_SPRINTF_LEN];
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 		(void) printf("objset %llu object %llu "
 		    "level %lld offset 0x%llx %s\n",
 		    (u_longlong_t)zb->zb_objset,
 		    (u_longlong_t)zb->zb_object,
 		    (longlong_t)zb->zb_level,
 		    (u_longlong_t)blkid2offset(dnp, bp, zb),
 		    blkbuf);
 	}
 
 	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))
 		return (0);
 
 	type = BP_GET_TYPE(bp);
 
 	zdb_count_block(zcb, zilog, bp,
 	    (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);
 
 	is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
 
 	if (!BP_IS_EMBEDDED(bp) &&
 	    (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
 		size_t size = BP_GET_PSIZE(bp);
 		abd_t *abd = abd_alloc(size, B_FALSE);
 		int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
 
 		/* If it's an intent log block, failure is expected. */
 		if (zb->zb_level == ZB_ZIL_LEVEL)
 			flags |= ZIO_FLAG_SPECULATIVE;
 
 		mutex_enter(&spa->spa_scrub_lock);
 		while (spa->spa_load_verify_bytes > max_inflight_bytes)
 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 		spa->spa_load_verify_bytes += size;
 		mutex_exit(&spa->spa_scrub_lock);
 
 		zio_nowait(zio_read(NULL, spa, bp, abd, size,
 		    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
 	}
 
 	zcb->zcb_readfails = 0;
 
 	/* only call gethrtime() every 100 blocks */
 	static int iters;
 	if (++iters > 100)
 		iters = 0;
 	else
 		return (0);
 
 	if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
 		uint64_t now = gethrtime();
 		char buf[10];
 		uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
 		uint64_t kb_per_sec =
 		    1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
 		uint64_t sec_remaining =
 		    (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;
 
 		/* make sure nicenum has enough space */
 		_Static_assert(sizeof (buf) >= NN_NUMBUF_SZ, "buf truncated");
 
 		zfs_nicebytes(bytes, buf, sizeof (buf));
 		(void) fprintf(stderr,
 		    "\r%5s completed (%4"PRIu64"MB/s) "
 		    "estimated time remaining: "
 		    "%"PRIu64"hr %02"PRIu64"min %02"PRIu64"sec        ",
 		    buf, kb_per_sec / 1024,
 		    sec_remaining / 60 / 60,
 		    sec_remaining / 60 % 60,
 		    sec_remaining % 60);
 
 		zcb->zcb_lastprint = now;
 	}
 
 	return (0);
 }
 
 static void
 zdb_leak(void *arg, uint64_t start, uint64_t size)
 {
 	vdev_t *vd = arg;
 
 	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
 }
 
 static metaslab_ops_t zdb_metaslab_ops = {
 	NULL	/* alloc */
 };
 
 static int
 load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme,
     uint64_t txg, void *arg)
 {
 	spa_vdev_removal_t *svr = arg;
 
 	uint64_t offset = sme->sme_offset;
 	uint64_t size = sme->sme_run;
 
 	/* skip vdevs we don't care about */
 	if (sme->sme_vdev != svr->svr_vdev_id)
 		return (0);
 
 	vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev);
 	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
 
 	if (txg < metaslab_unflushed_txg(ms))
 		return (0);
 
 	if (sme->sme_type == SM_ALLOC)
 		range_tree_add(svr->svr_allocd_segs, offset, size);
 	else
 		range_tree_remove(svr->svr_allocd_segs, offset, size);
 
 	return (0);
 }
 
 static void
 claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	(void) inner_offset, (void) arg;
 
 	/*
 	 * This callback was called through a remap from
 	 * a device being removed. Therefore, the vdev that
 	 * this callback is applied to is a concrete
 	 * vdev.
 	 */
 	ASSERT(vdev_is_concrete(vd));
 
 	VERIFY0(metaslab_claim_impl(vd, offset, size,
 	    spa_min_claim_txg(vd->vdev_spa)));
 }
 
 static void
 claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
 {
 	vdev_t *vd = arg;
 
 	vdev_indirect_ops.vdev_op_remap(vd, offset, size,
 	    claim_segment_impl_cb, NULL);
 }
 
 /*
  * After accounting for all allocated blocks that are directly referenced,
  * we might have missed a reference to a block from a partially complete
  * (and thus unused) indirect mapping object. We perform a secondary pass
  * through the metaslabs we have already mapped and claim the destination
  * blocks.
  */
 static void
 zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
 {
 	if (dump_opt['L'])
 		return;
 
 	if (spa->spa_vdev_removal == NULL)
 		return;
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
 	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 
 	ASSERT0(range_tree_space(svr->svr_allocd_segs));
 
 	range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
 	for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
 		metaslab_t *msp = vd->vdev_ms[msi];
 
 		ASSERT0(range_tree_space(allocs));
 		if (msp->ms_sm != NULL)
 			VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC));
 		range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs);
 	}
 	range_tree_destroy(allocs);
 
 	iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr);
 
 	/*
 	 * Clear everything past what has been synced,
 	 * because we have not allocated mappings for
 	 * it yet.
 	 */
 	range_tree_clear(svr->svr_allocd_segs,
 	    vdev_indirect_mapping_max_offset(vim),
 	    vd->vdev_asize - vdev_indirect_mapping_max_offset(vim));
 
 	zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs);
 	range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd);
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
 static int
 increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	(void) tx;
 	zdb_cb_t *zcb = arg;
 	spa_t *spa = zcb->zcb_spa;
 	vdev_t *vd;
 	const dva_t *dva = &bp->blk_dva[0];
 
 	ASSERT(!bp_freed);
 	ASSERT(!dump_opt['L']);
 	ASSERT3U(BP_GET_NDVAS(bp), ==, 1);
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva));
 	ASSERT3P(vd, !=, NULL);
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
 	ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL);
 
 	vdev_indirect_mapping_increment_obsolete_count(
 	    vd->vdev_indirect_mapping,
 	    DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva),
 	    zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
 
 	return (0);
 }
 
 static uint32_t *
 zdb_load_obsolete_counts(vdev_t *vd)
 {
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 	spa_t *spa = vd->vdev_spa;
 	spa_condensing_indirect_phys_t *scip =
 	    &spa->spa_condensing_indirect_phys;
 	uint64_t obsolete_sm_object;
 	uint32_t *counts;
 
 	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
 	EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL);
 	counts = vdev_indirect_mapping_load_obsolete_counts(vim);
 	if (vd->vdev_obsolete_sm != NULL) {
 		vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
 		    vd->vdev_obsolete_sm);
 	}
 	if (scip->scip_vdev == vd->vdev_id &&
 	    scip->scip_prev_obsolete_sm_object != 0) {
 		space_map_t *prev_obsolete_sm = NULL;
 		VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
 		    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
 		vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
 		    prev_obsolete_sm);
 		space_map_close(prev_obsolete_sm);
 	}
 	return (counts);
 }
 
 static void
 zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
 {
 	ddt_bookmark_t ddb = {0};
 	ddt_entry_t dde;
 	int error;
 	int p;
 
 	ASSERT(!dump_opt['L']);
 
 	while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
 		blkptr_t blk;
 		ddt_phys_t *ddp = dde.dde_phys;
 
 		if (ddb.ddb_class == DDT_CLASS_UNIQUE)
 			return;
 
 		ASSERT(ddt_phys_total_refcnt(&dde) > 1);
 		ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
 		VERIFY(ddt);
 
 		for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 			if (ddp->ddp_phys_birth == 0)
 				continue;
 			ddt_bp_create(ddb.ddb_checksum,
 			    &dde.dde_key, ddp, &blk);
 			if (p == DDT_PHYS_DITTO) {
 				zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
 			} else {
 				zcb->zcb_dedup_asize +=
 				    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
 				zcb->zcb_dedup_blocks++;
 			}
 		}
 
 		ddt_enter(ddt);
 		VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
 		ddt_exit(ddt);
 	}
 
 	ASSERT(error == ENOENT);
 }
 
 typedef struct checkpoint_sm_exclude_entry_arg {
 	vdev_t *cseea_vd;
 	uint64_t cseea_checkpoint_size;
 } checkpoint_sm_exclude_entry_arg_t;
 
 static int
 checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
 {
 	checkpoint_sm_exclude_entry_arg_t *cseea = arg;
 	vdev_t *vd = cseea->cseea_vd;
 	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
 	uint64_t end = sme->sme_offset + sme->sme_run;
 
 	ASSERT(sme->sme_type == SM_FREE);
 
 	/*
 	 * Since the vdev_checkpoint_sm exists in the vdev level
 	 * and the ms_sm space maps exist in the metaslab level,
 	 * an entry in the checkpoint space map could theoretically
 	 * cross the boundaries of the metaslab that it belongs.
 	 *
 	 * In reality, because of the way that we populate and
 	 * manipulate the checkpoint's space maps currently,
 	 * there shouldn't be any entries that cross metaslabs.
 	 * Hence the assertion below.
 	 *
 	 * That said, there is no fundamental requirement that
 	 * the checkpoint's space map entries should not cross
 	 * metaslab boundaries. So if needed we could add code
 	 * that handles metaslab-crossing segments in the future.
 	 */
 	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
 	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
 
 	/*
 	 * By removing the entry from the allocated segments we
 	 * also verify that the entry is there to begin with.
 	 */
 	mutex_enter(&ms->ms_lock);
 	range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
 	mutex_exit(&ms->ms_lock);
 
 	cseea->cseea_checkpoint_size += sme->sme_run;
 	return (0);
 }
 
 static void
 zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
 {
 	spa_t *spa = vd->vdev_spa;
 	space_map_t *checkpoint_sm = NULL;
 	uint64_t checkpoint_sm_obj;
 
 	/*
 	 * If there is no vdev_top_zap, we are in a pool whose
 	 * version predates the pool checkpoint feature.
 	 */
 	if (vd->vdev_top_zap == 0)
 		return;
 
 	/*
 	 * If there is no reference of the vdev_checkpoint_sm in
 	 * the vdev_top_zap, then one of the following scenarios
 	 * is true:
 	 *
 	 * 1] There is no checkpoint
 	 * 2] There is a checkpoint, but no checkpointed blocks
 	 *    have been freed yet
 	 * 3] The current vdev is indirect
 	 *
 	 * In these cases we return immediately.
 	 */
 	if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
 		return;
 
 	VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
 	    &checkpoint_sm_obj));
 
 	checkpoint_sm_exclude_entry_arg_t cseea;
 	cseea.cseea_vd = vd;
 	cseea.cseea_checkpoint_size = 0;
 
 	VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
 	    checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
 
 	VERIFY0(space_map_iterate(checkpoint_sm,
 	    space_map_length(checkpoint_sm),
 	    checkpoint_sm_exclude_entry_cb, &cseea));
 	space_map_close(checkpoint_sm);
 
 	zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
 }
 
 static void
 zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
 {
 	ASSERT(!dump_opt['L']);
 
 	vdev_t *rvd = spa->spa_root_vdev;
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
 		zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
 	}
 }
 
 static int
 count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme,
     uint64_t txg, void *arg)
 {
 	int64_t *ualloc_space = arg;
 
 	uint64_t offset = sme->sme_offset;
 	uint64_t vdev_id = sme->sme_vdev;
 
 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
 	if (!vdev_is_concrete(vd))
 		return (0);
 
 	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
 
 	if (txg < metaslab_unflushed_txg(ms))
 		return (0);
 
 	if (sme->sme_type == SM_ALLOC)
 		*ualloc_space += sme->sme_run;
 	else
 		*ualloc_space -= sme->sme_run;
 
 	return (0);
 }
 
 static int64_t
 get_unflushed_alloc_space(spa_t *spa)
 {
 	if (dump_opt['L'])
 		return (0);
 
 	int64_t ualloc_space = 0;
 	iterate_through_spacemap_logs(spa, count_unflushed_space_cb,
 	    &ualloc_space);
 	return (ualloc_space);
 }
 
 static int
 load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg)
 {
 	maptype_t *uic_maptype = arg;
 
 	uint64_t offset = sme->sme_offset;
 	uint64_t size = sme->sme_run;
 	uint64_t vdev_id = sme->sme_vdev;
 
 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
 
 	/* skip indirect vdevs */
 	if (!vdev_is_concrete(vd))
 		return (0);
 
 	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
 	ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE);
 
 	if (txg < metaslab_unflushed_txg(ms))
 		return (0);
 
 	if (*uic_maptype == sme->sme_type)
 		range_tree_add(ms->ms_allocatable, offset, size);
 	else
 		range_tree_remove(ms->ms_allocatable, offset, size);
 
 	return (0);
 }
 
 static void
 load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype)
 {
 	iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype);
 }
 
 static void
 load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
 		vdev_t *vd = rvd->vdev_child[i];
 
 		ASSERT3U(i, ==, vd->vdev_id);
 
 		if (vd->vdev_ops == &vdev_indirect_ops)
 			continue;
 
 		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 			metaslab_t *msp = vd->vdev_ms[m];
 
 			(void) fprintf(stderr,
 			    "\rloading concrete vdev %llu, "
 			    "metaslab %llu of %llu ...",
 			    (longlong_t)vd->vdev_id,
 			    (longlong_t)msp->ms_id,
 			    (longlong_t)vd->vdev_ms_count);
 
 			mutex_enter(&msp->ms_lock);
 			range_tree_vacate(msp->ms_allocatable, NULL, NULL);
 
 			/*
 			 * We don't want to spend the CPU manipulating the
 			 * size-ordered tree, so clear the range_tree ops.
 			 */
 			msp->ms_allocatable->rt_ops = NULL;
 
 			if (msp->ms_sm != NULL) {
 				VERIFY0(space_map_load(msp->ms_sm,
 				    msp->ms_allocatable, maptype));
 			}
 			if (!msp->ms_loaded)
 				msp->ms_loaded = B_TRUE;
 			mutex_exit(&msp->ms_lock);
 		}
 	}
 
 	load_unflushed_to_ms_allocatables(spa, maptype);
 }
 
 /*
  * vm_idxp is an in-out parameter which (for indirect vdevs) is the
  * index in vim_entries that has the first entry in this metaslab.
  * On return, it will be set to the first entry after this metaslab.
  */
 static void
 load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
     uint64_t *vim_idxp)
 {
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 
 	mutex_enter(&msp->ms_lock);
 	range_tree_vacate(msp->ms_allocatable, NULL, NULL);
 
 	/*
 	 * We don't want to spend the CPU manipulating the
 	 * size-ordered tree, so clear the range_tree ops.
 	 */
 	msp->ms_allocatable->rt_ops = NULL;
 
 	for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
 	    (*vim_idxp)++) {
 		vdev_indirect_mapping_entry_phys_t *vimep =
 		    &vim->vim_entries[*vim_idxp];
 		uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
 		uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
 		ASSERT3U(ent_offset, >=, msp->ms_start);
 		if (ent_offset >= msp->ms_start + msp->ms_size)
 			break;
 
 		/*
 		 * Mappings do not cross metaslab boundaries,
 		 * because we create them by walking the metaslabs.
 		 */
 		ASSERT3U(ent_offset + ent_len, <=,
 		    msp->ms_start + msp->ms_size);
 		range_tree_add(msp->ms_allocatable, ent_offset, ent_len);
 	}
 
 	if (!msp->ms_loaded)
 		msp->ms_loaded = B_TRUE;
 	mutex_exit(&msp->ms_lock);
 }
 
 static void
 zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
 {
 	ASSERT(!dump_opt['L']);
 
 	vdev_t *rvd = spa->spa_root_vdev;
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
 
 		ASSERT3U(c, ==, vd->vdev_id);
 
 		if (vd->vdev_ops != &vdev_indirect_ops)
 			continue;
 
 		/*
 		 * Note: we don't check for mapping leaks on
 		 * removing vdevs because their ms_allocatable's
 		 * are used to look for leaks in allocated space.
 		 */
 		zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd);
 
 		/*
 		 * Normally, indirect vdevs don't have any
 		 * metaslabs.  We want to set them up for
 		 * zio_claim().
 		 */
 		vdev_metaslab_group_create(vd);
 		VERIFY0(vdev_metaslab_init(vd, 0));
 
 		vdev_indirect_mapping_t *vim __maybe_unused =
 		    vd->vdev_indirect_mapping;
 		uint64_t vim_idx = 0;
 		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 
 			(void) fprintf(stderr,
 			    "\rloading indirect vdev %llu, "
 			    "metaslab %llu of %llu ...",
 			    (longlong_t)vd->vdev_id,
 			    (longlong_t)vd->vdev_ms[m]->ms_id,
 			    (longlong_t)vd->vdev_ms_count);
 
 			load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m],
 			    &vim_idx);
 		}
 		ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));
 	}
 }
 
 static void
 zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 {
 	zcb->zcb_spa = spa;
 
 	if (dump_opt['L'])
 		return;
 
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	/*
 	 * We are going to be changing the meaning of the metaslab's
 	 * ms_allocatable.  Ensure that the allocator doesn't try to
 	 * use the tree.
 	 */
 	spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
 	spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
 	spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops;
 
 	zcb->zcb_vd_obsolete_counts =
 	    umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
 	    UMEM_NOFAIL);
 
 	/*
 	 * For leak detection, we overload the ms_allocatable trees
 	 * to contain allocated segments instead of free segments.
 	 * As a result, we can't use the normal metaslab_load/unload
 	 * interfaces.
 	 */
 	zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
 	load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
 
 	/*
 	 * On load_concrete_ms_allocatable_trees() we loaded all the
 	 * allocated entries from the ms_sm to the ms_allocatable for
 	 * each metaslab. If the pool has a checkpoint or is in the
 	 * middle of discarding a checkpoint, some of these blocks
 	 * may have been freed but their ms_sm may not have been
 	 * updated because they are referenced by the checkpoint. In
 	 * order to avoid false-positives during leak-detection, we
 	 * go through the vdev's checkpoint space map and exclude all
 	 * its entries from their relevant ms_allocatable.
 	 *
 	 * We also aggregate the space held by the checkpoint and add
 	 * it to zcb_checkpoint_size.
 	 *
 	 * Note that at this point we are also verifying that all the
 	 * entries on the checkpoint_sm are marked as allocated in
 	 * the ms_sm of their relevant metaslab.
 	 * [see comment in checkpoint_sm_exclude_entry_cb()]
 	 */
 	zdb_leak_init_exclude_checkpoint(spa, zcb);
 	ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));
 
 	/* for cleaner progress output */
 	(void) fprintf(stderr, "\n");
 
 	if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
 		ASSERT(spa_feature_is_enabled(spa,
 		    SPA_FEATURE_DEVICE_REMOVAL));
 		(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
 		    increment_indirect_mapping_cb, zcb, NULL);
 	}
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	zdb_ddt_leak_init(spa, zcb);
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
 static boolean_t
 zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
 {
 	boolean_t leaks = B_FALSE;
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 	uint64_t total_leaked = 0;
 	boolean_t are_precise = B_FALSE;
 
 	ASSERT(vim != NULL);
 
 	for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
 		vdev_indirect_mapping_entry_phys_t *vimep =
 		    &vim->vim_entries[i];
 		uint64_t obsolete_bytes = 0;
 		uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
 		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 		/*
 		 * This is not very efficient but it's easy to
 		 * verify correctness.
 		 */
 		for (uint64_t inner_offset = 0;
 		    inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
 		    inner_offset += 1ULL << vd->vdev_ashift) {
 			if (range_tree_contains(msp->ms_allocatable,
 			    offset + inner_offset, 1ULL << vd->vdev_ashift)) {
 				obsolete_bytes += 1ULL << vd->vdev_ashift;
 			}
 		}
 
 		int64_t bytes_leaked = obsolete_bytes -
 		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i];
 		ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=,
 		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]);
 
 		VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
 		if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) {
 			(void) printf("obsolete indirect mapping count "
 			    "mismatch on %llu:%llx:%llx : %llx bytes leaked\n",
 			    (u_longlong_t)vd->vdev_id,
 			    (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
 			    (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
 			    (u_longlong_t)bytes_leaked);
 		}
 		total_leaked += ABS(bytes_leaked);
 	}
 
 	VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
 	if (!are_precise && total_leaked > 0) {
 		int pct_leaked = total_leaked * 100 /
 		    vdev_indirect_mapping_bytes_mapped(vim);
 		(void) printf("cannot verify obsolete indirect mapping "
 		    "counts of vdev %llu because precise feature was not "
 		    "enabled when it was removed: %d%% (%llx bytes) of mapping"
 		    "unreferenced\n",
 		    (u_longlong_t)vd->vdev_id, pct_leaked,
 		    (u_longlong_t)total_leaked);
 	} else if (total_leaked > 0) {
 		(void) printf("obsolete indirect mapping count mismatch "
 		    "for vdev %llu -- %llx total bytes mismatched\n",
 		    (u_longlong_t)vd->vdev_id,
 		    (u_longlong_t)total_leaked);
 		leaks |= B_TRUE;
 	}
 
 	vdev_indirect_mapping_free_obsolete_counts(vim,
 	    zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
 	zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL;
 
 	return (leaks);
 }
 
 static boolean_t
 zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
 {
 	if (dump_opt['L'])
 		return (B_FALSE);
 
 	boolean_t leaks = B_FALSE;
 	vdev_t *rvd = spa->spa_root_vdev;
 	for (unsigned c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
 
 		if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
 			leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
 		}
 
 		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 			metaslab_t *msp = vd->vdev_ms[m];
 			ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class ==
 			    spa_embedded_log_class(spa)) ?
 			    vd->vdev_log_mg : vd->vdev_mg);
 
 			/*
 			 * ms_allocatable has been overloaded
 			 * to contain allocated segments. Now that
 			 * we finished traversing all blocks, any
 			 * block that remains in the ms_allocatable
 			 * represents an allocated block that we
 			 * did not claim during the traversal.
 			 * Claimed blocks would have been removed
 			 * from the ms_allocatable.  For indirect
 			 * vdevs, space remaining in the tree
 			 * represents parts of the mapping that are
 			 * not referenced, which is not a bug.
 			 */
 			if (vd->vdev_ops == &vdev_indirect_ops) {
 				range_tree_vacate(msp->ms_allocatable,
 				    NULL, NULL);
 			} else {
 				range_tree_vacate(msp->ms_allocatable,
 				    zdb_leak, vd);
 			}
 			if (msp->ms_loaded) {
 				msp->ms_loaded = B_FALSE;
 			}
 		}
 	}
 
 	umem_free(zcb->zcb_vd_obsolete_counts,
 	    rvd->vdev_children * sizeof (uint32_t *));
 	zcb->zcb_vd_obsolete_counts = NULL;
 
 	return (leaks);
 }
 
 static int
 count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	(void) tx;
 	zdb_cb_t *zcb = arg;
 
 	if (dump_opt['b'] >= 5) {
 		char blkbuf[BP_SPRINTF_LEN];
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 		(void) printf("[%s] %s\n",
 		    "deferred free", blkbuf);
 	}
 	zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
 	return (0);
 }
 
 /*
  * Iterate over livelists which have been destroyed by the user but
  * are still present in the MOS, waiting to be freed
  */
 static void
 iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t zap_obj;
 	int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
 	if (err == ENOENT)
 		return;
 	ASSERT0(err);
 
 	zap_cursor_t zc;
 	zap_attribute_t attr;
 	dsl_deadlist_t ll;
 	/* NULL out os prior to dsl_deadlist_open in case it's garbage */
 	ll.dl_os = NULL;
 	for (zap_cursor_init(&zc, mos, zap_obj);
 	    zap_cursor_retrieve(&zc, &attr) == 0;
 	    (void) zap_cursor_advance(&zc)) {
 		dsl_deadlist_open(&ll, mos, attr.za_first_integer);
 		func(&ll, arg);
 		dsl_deadlist_close(&ll);
 	}
 	zap_cursor_fini(&zc);
 }
 
 static int
 bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	ASSERT(!bp_freed);
 	return (count_block_cb(arg, bp, tx));
 }
 
 static int
 livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle)
 {
 	zdb_cb_t *zbc = args;
 	bplist_t blks;
 	bplist_create(&blks);
 	/* determine which blocks have been alloc'd but not freed */
 	VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL));
 	/* count those blocks */
 	(void) bplist_iterate(&blks, count_block_cb, zbc, NULL);
 	bplist_destroy(&blks);
 	return (0);
 }
 
 static void
 livelist_count_blocks(dsl_deadlist_t *ll, void *arg)
 {
 	dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg);
 }
 
 /*
  * Count the blocks in the livelists that have been destroyed by the user
  * but haven't yet been freed.
  */
 static void
 deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc)
 {
 	iterate_deleted_livelists(spa, livelist_count_blocks, zbc);
 }
 
 static void
 dump_livelist_cb(dsl_deadlist_t *ll, void *arg)
 {
 	ASSERT3P(arg, ==, NULL);
 	global_feature_count[SPA_FEATURE_LIVELIST]++;
 	dump_blkptr_list(ll, "Deleted Livelist");
 	dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL);
 }
 
 /*
  * Print out, register object references to, and increment feature counts for
  * livelists that have been destroyed by the user but haven't yet been freed.
  */
 static void
 deleted_livelists_dump_mos(spa_t *spa)
 {
 	uint64_t zap_obj;
 	objset_t *mos = spa->spa_meta_objset;
 	int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
 	if (err == ENOENT)
 		return;
 	mos_obj_refd(zap_obj);
 	iterate_deleted_livelists(spa, dump_livelist_cb, NULL);
 }
 
 static int
 zdb_brt_entry_compare(const void *zcn1, const void *zcn2)
 {
 	const dva_t *dva1 = &((const zdb_brt_entry_t *)zcn1)->zbre_dva;
 	const dva_t *dva2 = &((const zdb_brt_entry_t *)zcn2)->zbre_dva;
 	int cmp;
 
 	cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
 	if (cmp == 0)
 		cmp = TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2));
 
 	return (cmp);
 }
 
 static int
 dump_block_stats(spa_t *spa)
 {
 	zdb_cb_t *zcb;
 	zdb_blkstats_t *zb, *tzb;
 	uint64_t norm_alloc, norm_space, total_alloc, total_found;
 	int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
 	    TRAVERSE_NO_DECRYPT | TRAVERSE_HARD;
 	boolean_t leaks = B_FALSE;
 	int e, c, err;
 	bp_embedded_type_t i;
 
 	zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL);
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
 		avl_create(&zcb->zcb_brt, zdb_brt_entry_compare,
 		    sizeof (zdb_brt_entry_t),
 		    offsetof(zdb_brt_entry_t, zbre_node));
 		zcb->zcb_brt_is_active = B_TRUE;
 	}
 
 	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
 	    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
 	    (dump_opt['c'] == 1) ? "metadata " : "",
 	    dump_opt['c'] ? "checksums " : "",
 	    (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
 	    !dump_opt['L'] ? "nothing leaked " : "");
 
 	/*
 	 * When leak detection is enabled we load all space maps as SM_ALLOC
 	 * maps, then traverse the pool claiming each block we discover. If
 	 * the pool is perfectly consistent, the segment trees will be empty
 	 * when we're done. Anything left over is a leak; any block we can't
 	 * claim (because it's not part of any space map) is a double
 	 * allocation, reference to a freed block, or an unclaimed log block.
 	 *
 	 * When leak detection is disabled (-L option) we still traverse the
 	 * pool claiming each block we discover, but we skip opening any space
 	 * maps.
 	 */
 	zdb_leak_init(spa, zcb);
 
 	/*
 	 * If there's a deferred-free bplist, process that first.
 	 */
 	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
 	    bpobj_count_block_cb, zcb, NULL);
 
 	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 		(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
 		    bpobj_count_block_cb, zcb, NULL);
 	}
 
 	zdb_claim_removing(spa, zcb);
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
 		VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
 		    spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
 		    zcb, NULL));
 	}
 
 	deleted_livelists_count_blocks(spa, zcb);
 
 	if (dump_opt['c'] > 1)
 		flags |= TRAVERSE_PREFETCH_DATA;
 
 	zcb->zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
 	zcb->zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa));
 	zcb->zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));
 	zcb->zcb_totalasize +=
 	    metaslab_class_get_alloc(spa_embedded_log_class(spa));
 	zcb->zcb_start = zcb->zcb_lastprint = gethrtime();
 	err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, zcb);
 
 	/*
 	 * If we've traversed the data blocks then we need to wait for those
 	 * I/Os to complete. We leverage "The Godfather" zio to wait on
 	 * all async I/Os to complete.
 	 */
 	if (dump_opt['c']) {
 		for (c = 0; c < max_ncpus; c++) {
 			(void) zio_wait(spa->spa_async_zio_root[c]);
 			spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 			    ZIO_FLAG_GODFATHER);
 		}
 	}
 	ASSERT0(spa->spa_load_verify_bytes);
 
 	/*
 	 * Done after zio_wait() since zcb_haderrors is modified in
 	 * zdb_blkptr_done()
 	 */
 	zcb->zcb_haderrors |= err;
 
 	if (zcb->zcb_haderrors) {
 		(void) printf("\nError counts:\n\n");
 		(void) printf("\t%5s  %s\n", "errno", "count");
 		for (e = 0; e < 256; e++) {
 			if (zcb->zcb_errors[e] != 0) {
 				(void) printf("\t%5d  %llu\n",
 				    e, (u_longlong_t)zcb->zcb_errors[e]);
 			}
 		}
 	}
 
 	/*
 	 * Report any leaked segments.
 	 */
 	leaks |= zdb_leak_fini(spa, zcb);
 
 	tzb = &zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
 
 	norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 	norm_space = metaslab_class_get_space(spa_normal_class(spa));
 
 	total_alloc = norm_alloc +
 	    metaslab_class_get_alloc(spa_log_class(spa)) +
 	    metaslab_class_get_alloc(spa_embedded_log_class(spa)) +
 	    metaslab_class_get_alloc(spa_special_class(spa)) +
 	    metaslab_class_get_alloc(spa_dedup_class(spa)) +
 	    get_unflushed_alloc_space(spa);
 	total_found =
 	    tzb->zb_asize - zcb->zcb_dedup_asize - zcb->zcb_clone_asize +
 	    zcb->zcb_removing_size + zcb->zcb_checkpoint_size;
 
 	if (total_found == total_alloc && !dump_opt['L']) {
 		(void) printf("\n\tNo leaks (block sum matches space"
 		    " maps exactly)\n");
 	} else if (!dump_opt['L']) {
 		(void) printf("block traversal size %llu != alloc %llu "
 		    "(%s %lld)\n",
 		    (u_longlong_t)total_found,
 		    (u_longlong_t)total_alloc,
 		    (dump_opt['L']) ? "unreachable" : "leaked",
 		    (longlong_t)(total_alloc - total_found));
 		leaks = B_TRUE;
 	}
 
 	if (tzb->zb_count == 0) {
 		umem_free(zcb, sizeof (zdb_cb_t));
 		return (2);
 	}
 
 	(void) printf("\n");
 	(void) printf("\t%-16s %14llu\n", "bp count:",
 	    (u_longlong_t)tzb->zb_count);
 	(void) printf("\t%-16s %14llu\n", "ganged count:",
 	    (longlong_t)tzb->zb_gangs);
 	(void) printf("\t%-16s %14llu      avg: %6llu\n", "bp logical:",
 	    (u_longlong_t)tzb->zb_lsize,
 	    (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
 	(void) printf("\t%-16s %14llu      avg: %6llu     compression: %6.2f\n",
 	    "bp physical:", (u_longlong_t)tzb->zb_psize,
 	    (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
 	    (double)tzb->zb_lsize / tzb->zb_psize);
 	(void) printf("\t%-16s %14llu      avg: %6llu     compression: %6.2f\n",
 	    "bp allocated:", (u_longlong_t)tzb->zb_asize,
 	    (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
 	    (double)tzb->zb_lsize / tzb->zb_asize);
 	(void) printf("\t%-16s %14llu    ref>1: %6llu   deduplication: %6.2f\n",
 	    "bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize,
 	    (u_longlong_t)zcb->zcb_dedup_blocks,
 	    (double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0);
 	(void) printf("\t%-16s %14llu    count: %6llu\n",
 	    "bp cloned:", (u_longlong_t)zcb->zcb_clone_asize,
 	    (u_longlong_t)zcb->zcb_clone_blocks);
 	(void) printf("\t%-16s %14llu     used: %5.2f%%\n", "Normal class:",
 	    (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
 
 	if (spa_special_class(spa)->mc_allocator[0].mca_rotor != NULL) {
 		uint64_t alloc = metaslab_class_get_alloc(
 		    spa_special_class(spa));
 		uint64_t space = metaslab_class_get_space(
 		    spa_special_class(spa));
 
 		(void) printf("\t%-16s %14llu     used: %5.2f%%\n",
 		    "Special class", (u_longlong_t)alloc,
 		    100.0 * alloc / space);
 	}
 
 	if (spa_dedup_class(spa)->mc_allocator[0].mca_rotor != NULL) {
 		uint64_t alloc = metaslab_class_get_alloc(
 		    spa_dedup_class(spa));
 		uint64_t space = metaslab_class_get_space(
 		    spa_dedup_class(spa));
 
 		(void) printf("\t%-16s %14llu     used: %5.2f%%\n",
 		    "Dedup class", (u_longlong_t)alloc,
 		    100.0 * alloc / space);
 	}
 
 	if (spa_embedded_log_class(spa)->mc_allocator[0].mca_rotor != NULL) {
 		uint64_t alloc = metaslab_class_get_alloc(
 		    spa_embedded_log_class(spa));
 		uint64_t space = metaslab_class_get_space(
 		    spa_embedded_log_class(spa));
 
 		(void) printf("\t%-16s %14llu     used: %5.2f%%\n",
 		    "Embedded log class", (u_longlong_t)alloc,
 		    100.0 * alloc / space);
 	}
 
 	for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
 		if (zcb->zcb_embedded_blocks[i] == 0)
 			continue;
 		(void) printf("\n");
 		(void) printf("\tadditional, non-pointer bps of type %u: "
 		    "%10llu\n",
 		    i, (u_longlong_t)zcb->zcb_embedded_blocks[i]);
 
 		if (dump_opt['b'] >= 3) {
 			(void) printf("\t number of (compressed) bytes:  "
 			    "number of bps\n");
 			dump_histogram(zcb->zcb_embedded_histogram[i],
 			    sizeof (zcb->zcb_embedded_histogram[i]) /
 			    sizeof (zcb->zcb_embedded_histogram[i][0]), 0);
 		}
 	}
 
 	if (tzb->zb_ditto_samevdev != 0) {
 		(void) printf("\tDittoed blocks on same vdev: %llu\n",
 		    (longlong_t)tzb->zb_ditto_samevdev);
 	}
 	if (tzb->zb_ditto_same_ms != 0) {
 		(void) printf("\tDittoed blocks in same metaslab: %llu\n",
 		    (longlong_t)tzb->zb_ditto_same_ms);
 	}
 
 	for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) {
 		vdev_t *vd = spa->spa_root_vdev->vdev_child[v];
 		vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 
 		if (vim == NULL) {
 			continue;
 		}
 
 		char mem[32];
 		zdb_nicenum(vdev_indirect_mapping_num_entries(vim),
 		    mem, vdev_indirect_mapping_size(vim));
 
 		(void) printf("\tindirect vdev id %llu has %llu segments "
 		    "(%s in memory)\n",
 		    (longlong_t)vd->vdev_id,
 		    (longlong_t)vdev_indirect_mapping_num_entries(vim), mem);
 	}
 
 	if (dump_opt['b'] >= 2) {
 		int l, t, level;
 		char csize[32], lsize[32], psize[32], asize[32];
 		char avg[32], gang[32];
 		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
 		    "\t  avg\t comp\t%%Total\tType\n");
 
 		zfs_blkstat_t *mdstats = umem_zalloc(sizeof (zfs_blkstat_t),
 		    UMEM_NOFAIL);
 
 		for (t = 0; t <= ZDB_OT_TOTAL; t++) {
 			const char *typename;
 
 			/* make sure nicenum has enough space */
 			_Static_assert(sizeof (csize) >= NN_NUMBUF_SZ,
 			    "csize truncated");
 			_Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ,
 			    "lsize truncated");
 			_Static_assert(sizeof (psize) >= NN_NUMBUF_SZ,
 			    "psize truncated");
 			_Static_assert(sizeof (asize) >= NN_NUMBUF_SZ,
 			    "asize truncated");
 			_Static_assert(sizeof (avg) >= NN_NUMBUF_SZ,
 			    "avg truncated");
 			_Static_assert(sizeof (gang) >= NN_NUMBUF_SZ,
 			    "gang truncated");
 
 			if (t < DMU_OT_NUMTYPES)
 				typename = dmu_ot[t].ot_name;
 			else
 				typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
 
 			if (zcb->zcb_type[ZB_TOTAL][t].zb_asize == 0) {
 				(void) printf("%6s\t%5s\t%5s\t%5s"
 				    "\t%5s\t%5s\t%6s\t%s\n",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    typename);
 				continue;
 			}
 
 			for (l = ZB_TOTAL - 1; l >= -1; l--) {
 				level = (l == -1 ? ZB_TOTAL : l);
 				zb = &zcb->zcb_type[level][t];
 
 				if (zb->zb_asize == 0)
 					continue;
 
 				if (level != ZB_TOTAL && t < DMU_OT_NUMTYPES &&
 				    (level > 0 || DMU_OT_IS_METADATA(t))) {
 					mdstats->zb_count += zb->zb_count;
 					mdstats->zb_lsize += zb->zb_lsize;
 					mdstats->zb_psize += zb->zb_psize;
 					mdstats->zb_asize += zb->zb_asize;
 					mdstats->zb_gangs += zb->zb_gangs;
 				}
 
 				if (dump_opt['b'] < 3 && level != ZB_TOTAL)
 					continue;
 
 				if (level == 0 && zb->zb_asize ==
 				    zcb->zcb_type[ZB_TOTAL][t].zb_asize)
 					continue;
 
 				zdb_nicenum(zb->zb_count, csize,
 				    sizeof (csize));
 				zdb_nicenum(zb->zb_lsize, lsize,
 				    sizeof (lsize));
 				zdb_nicenum(zb->zb_psize, psize,
 				    sizeof (psize));
 				zdb_nicenum(zb->zb_asize, asize,
 				    sizeof (asize));
 				zdb_nicenum(zb->zb_asize / zb->zb_count, avg,
 				    sizeof (avg));
 				zdb_nicenum(zb->zb_gangs, gang, sizeof (gang));
 
 				(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
 				    "\t%5.2f\t%6.2f\t",
 				    csize, lsize, psize, asize, avg,
 				    (double)zb->zb_lsize / zb->zb_psize,
 				    100.0 * zb->zb_asize / tzb->zb_asize);
 
 				if (level == ZB_TOTAL)
 					(void) printf("%s\n", typename);
 				else
 					(void) printf("    L%d %s\n",
 					    level, typename);
 
 				if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
 					(void) printf("\t number of ganged "
 					    "blocks: %s\n", gang);
 				}
 
 				if (dump_opt['b'] >= 4) {
 					(void) printf("psize "
 					    "(in 512-byte sectors): "
 					    "number of blocks\n");
 					dump_histogram(zb->zb_psize_histogram,
 					    PSIZE_HISTO_SIZE, 0);
 				}
 			}
 		}
 		zdb_nicenum(mdstats->zb_count, csize,
 		    sizeof (csize));
 		zdb_nicenum(mdstats->zb_lsize, lsize,
 		    sizeof (lsize));
 		zdb_nicenum(mdstats->zb_psize, psize,
 		    sizeof (psize));
 		zdb_nicenum(mdstats->zb_asize, asize,
 		    sizeof (asize));
 		zdb_nicenum(mdstats->zb_asize / mdstats->zb_count, avg,
 		    sizeof (avg));
 		zdb_nicenum(mdstats->zb_gangs, gang, sizeof (gang));
 
 		(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
 		    "\t%5.2f\t%6.2f\t",
 		    csize, lsize, psize, asize, avg,
 		    (double)mdstats->zb_lsize / mdstats->zb_psize,
 		    100.0 * mdstats->zb_asize / tzb->zb_asize);
 		(void) printf("%s\n", "Metadata Total");
 
 		/* Output a table summarizing block sizes in the pool */
 		if (dump_opt['b'] >= 2) {
 			dump_size_histograms(zcb);
 		}
 
 		umem_free(mdstats, sizeof (zfs_blkstat_t));
 	}
 
 	(void) printf("\n");
 
 	if (leaks) {
 		umem_free(zcb, sizeof (zdb_cb_t));
 		return (2);
 	}
 
 	if (zcb->zcb_haderrors) {
 		umem_free(zcb, sizeof (zdb_cb_t));
 		return (3);
 	}
 
 	umem_free(zcb, sizeof (zdb_cb_t));
 	return (0);
 }
 
 typedef struct zdb_ddt_entry {
 	/* key must be first for ddt_key_compare */
 	ddt_key_t	zdde_key;
 	uint64_t	zdde_ref_blocks;
 	uint64_t	zdde_ref_lsize;
 	uint64_t	zdde_ref_psize;
 	uint64_t	zdde_ref_dsize;
 	avl_node_t	zdde_node;
 } zdb_ddt_entry_t;
 
 static int
 zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	(void) zilog, (void) dnp;
 	avl_tree_t *t = arg;
 	avl_index_t where;
 	zdb_ddt_entry_t *zdde, zdde_search;
 
 	if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
 	    BP_IS_EMBEDDED(bp))
 		return (0);
 
 	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
 		(void) printf("traversing objset %llu, %llu objects, "
 		    "%lu blocks so far\n",
 		    (u_longlong_t)zb->zb_objset,
 		    (u_longlong_t)BP_GET_FILL(bp),
 		    avl_numnodes(t));
 	}
 
 	if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
 	    BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
 		return (0);
 
 	ddt_key_fill(&zdde_search.zdde_key, bp);
 
 	zdde = avl_find(t, &zdde_search, &where);
 
 	if (zdde == NULL) {
 		zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL);
 		zdde->zdde_key = zdde_search.zdde_key;
 		avl_insert(t, zdde, where);
 	}
 
 	zdde->zdde_ref_blocks += 1;
 	zdde->zdde_ref_lsize += BP_GET_LSIZE(bp);
 	zdde->zdde_ref_psize += BP_GET_PSIZE(bp);
 	zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
 
 	return (0);
 }
 
 static void
 dump_simulated_ddt(spa_t *spa)
 {
 	avl_tree_t t;
 	void *cookie = NULL;
 	zdb_ddt_entry_t *zdde;
 	ddt_histogram_t ddh_total = {{{0}}};
 	ddt_stat_t dds_total = {0};
 
 	avl_create(&t, ddt_key_compare,
 	    sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
 	    TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t);
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
 		ddt_stat_t dds;
 		uint64_t refcnt = zdde->zdde_ref_blocks;
 		ASSERT(refcnt != 0);
 
 		dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
 		dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
 		dds.dds_psize = zdde->zdde_ref_psize / refcnt;
 		dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
 
 		dds.dds_ref_blocks = zdde->zdde_ref_blocks;
 		dds.dds_ref_lsize = zdde->zdde_ref_lsize;
 		dds.dds_ref_psize = zdde->zdde_ref_psize;
 		dds.dds_ref_dsize = zdde->zdde_ref_dsize;
 
 		ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
 		    &dds, 0);
 
 		umem_free(zdde, sizeof (*zdde));
 	}
 
 	avl_destroy(&t);
 
 	ddt_histogram_stat(&dds_total, &ddh_total);
 
 	(void) printf("Simulated DDT histogram:\n");
 
 	zpool_dump_ddt(&dds_total, &ddh_total);
 
 	dump_dedup_ratio(&dds_total);
 }
 
 static int
 verify_device_removal_feature_counts(spa_t *spa)
 {
 	uint64_t dr_feature_refcount = 0;
 	uint64_t oc_feature_refcount = 0;
 	uint64_t indirect_vdev_count = 0;
 	uint64_t precise_vdev_count = 0;
 	uint64_t obsolete_counts_object_count = 0;
 	uint64_t obsolete_sm_count = 0;
 	uint64_t obsolete_counts_count = 0;
 	uint64_t scip_count = 0;
 	uint64_t obsolete_bpobj_count = 0;
 	int ret = 0;
 
 	spa_condensing_indirect_phys_t *scip =
 	    &spa->spa_condensing_indirect_phys;
 	if (scip->scip_next_mapping_object != 0) {
 		vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev];
 		ASSERT(scip->scip_prev_obsolete_sm_object != 0);
 		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
 
 		(void) printf("Condensing indirect vdev %llu: new mapping "
 		    "object %llu, prev obsolete sm %llu\n",
 		    (u_longlong_t)scip->scip_vdev,
 		    (u_longlong_t)scip->scip_next_mapping_object,
 		    (u_longlong_t)scip->scip_prev_obsolete_sm_object);
 		if (scip->scip_prev_obsolete_sm_object != 0) {
 			space_map_t *prev_obsolete_sm = NULL;
 			VERIFY0(space_map_open(&prev_obsolete_sm,
 			    spa->spa_meta_objset,
 			    scip->scip_prev_obsolete_sm_object,
 			    0, vd->vdev_asize, 0));
 			dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
 			(void) printf("\n");
 			space_map_close(prev_obsolete_sm);
 		}
 
 		scip_count += 2;
 	}
 
 	for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
 		vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
 		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 
 		if (vic->vic_mapping_object != 0) {
 			ASSERT(vd->vdev_ops == &vdev_indirect_ops ||
 			    vd->vdev_removing);
 			indirect_vdev_count++;
 
 			if (vd->vdev_indirect_mapping->vim_havecounts) {
 				obsolete_counts_count++;
 			}
 		}
 
 		boolean_t are_precise;
 		VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
 		if (are_precise) {
 			ASSERT(vic->vic_mapping_object != 0);
 			precise_vdev_count++;
 		}
 
 		uint64_t obsolete_sm_object;
 		VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
 		if (obsolete_sm_object != 0) {
 			ASSERT(vic->vic_mapping_object != 0);
 			obsolete_sm_count++;
 		}
 	}
 
 	(void) feature_get_refcount(spa,
 	    &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL],
 	    &dr_feature_refcount);
 	(void) feature_get_refcount(spa,
 	    &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS],
 	    &oc_feature_refcount);
 
 	if (dr_feature_refcount != indirect_vdev_count) {
 		ret = 1;
 		(void) printf("Number of indirect vdevs (%llu) " \
 		    "does not match feature count (%llu)\n",
 		    (u_longlong_t)indirect_vdev_count,
 		    (u_longlong_t)dr_feature_refcount);
 	} else {
 		(void) printf("Verified device_removal feature refcount " \
 		    "of %llu is correct\n",
 		    (u_longlong_t)dr_feature_refcount);
 	}
 
 	if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_OBSOLETE_BPOBJ) == 0) {
 		obsolete_bpobj_count++;
 	}
 
 
 	obsolete_counts_object_count = precise_vdev_count;
 	obsolete_counts_object_count += obsolete_sm_count;
 	obsolete_counts_object_count += obsolete_counts_count;
 	obsolete_counts_object_count += scip_count;
 	obsolete_counts_object_count += obsolete_bpobj_count;
 	obsolete_counts_object_count += remap_deadlist_count;
 
 	if (oc_feature_refcount != obsolete_counts_object_count) {
 		ret = 1;
 		(void) printf("Number of obsolete counts objects (%llu) " \
 		    "does not match feature count (%llu)\n",
 		    (u_longlong_t)obsolete_counts_object_count,
 		    (u_longlong_t)oc_feature_refcount);
 		(void) printf("pv:%llu os:%llu oc:%llu sc:%llu "
 		    "ob:%llu rd:%llu\n",
 		    (u_longlong_t)precise_vdev_count,
 		    (u_longlong_t)obsolete_sm_count,
 		    (u_longlong_t)obsolete_counts_count,
 		    (u_longlong_t)scip_count,
 		    (u_longlong_t)obsolete_bpobj_count,
 		    (u_longlong_t)remap_deadlist_count);
 	} else {
 		(void) printf("Verified indirect_refcount feature refcount " \
 		    "of %llu is correct\n",
 		    (u_longlong_t)oc_feature_refcount);
 	}
 	return (ret);
 }
 
 static void
 zdb_set_skip_mmp(char *target)
 {
 	spa_t *spa;
 
 	/*
 	 * Disable the activity check to allow examination of
 	 * active pools.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	if ((spa = spa_lookup(target)) != NULL) {
 		spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP;
 	}
 	mutex_exit(&spa_namespace_lock);
 }
 
 #define	BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"
 /*
  * Import the checkpointed state of the pool specified by the target
  * parameter as readonly. The function also accepts a pool config
  * as an optional parameter, else it attempts to infer the config by
  * the name of the target pool.
  *
  * Note that the checkpointed state's pool name will be the name of
  * the original pool with the above suffix appended to it. In addition,
  * if the target is not a pool name (e.g. a path to a dataset) then
  * the new_path parameter is populated with the updated path to
  * reflect the fact that we are looking into the checkpointed state.
  *
  * The function returns a newly-allocated copy of the name of the
  * pool containing the checkpointed state. When this copy is no
  * longer needed it should be freed with free(3C). Same thing
  * applies to the new_path parameter if allocated.
  */
 static char *
 import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
 {
 	int error = 0;
 	char *poolname, *bogus_name = NULL;
 	boolean_t freecfg = B_FALSE;
 
 	/* If the target is not a pool, the extract the pool name */
 	char *path_start = strchr(target, '/');
 	if (path_start != NULL) {
 		size_t poolname_len = path_start - target;
 		poolname = strndup(target, poolname_len);
 	} else {
 		poolname = target;
 	}
 
 	if (cfg == NULL) {
 		zdb_set_skip_mmp(poolname);
 		error = spa_get_stats(poolname, &cfg, NULL, 0);
 		if (error != 0) {
 			fatal("Tried to read config of pool \"%s\" but "
 			    "spa_get_stats() failed with error %d\n",
 			    poolname, error);
 		}
 		freecfg = B_TRUE;
 	}
 
 	if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) {
 		if (target != poolname)
 			free(poolname);
 		return (NULL);
 	}
 	fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);
 
 	error = spa_import(bogus_name, cfg, NULL,
 	    ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT |
 	    ZFS_IMPORT_SKIP_MMP);
 	if (freecfg)
 		nvlist_free(cfg);
 	if (error != 0) {
 		fatal("Tried to import pool \"%s\" but spa_import() failed "
 		    "with error %d\n", bogus_name, error);
 	}
 
 	if (new_path != NULL && path_start != NULL) {
 		if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) {
 			free(bogus_name);
 			if (path_start != NULL)
 				free(poolname);
 			return (NULL);
 		}
 	}
 
 	if (target != poolname)
 		free(poolname);
 
 	return (bogus_name);
 }
 
 typedef struct verify_checkpoint_sm_entry_cb_arg {
 	vdev_t *vcsec_vd;
 
 	/* the following fields are only used for printing progress */
 	uint64_t vcsec_entryid;
 	uint64_t vcsec_num_entries;
 } verify_checkpoint_sm_entry_cb_arg_t;
 
 #define	ENTRIES_PER_PROGRESS_UPDATE 10000
 
 static int
 verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
 {
 	verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
 	vdev_t *vd = vcsec->vcsec_vd;
 	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
 	uint64_t end = sme->sme_offset + sme->sme_run;
 
 	ASSERT(sme->sme_type == SM_FREE);
 
 	if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
 		(void) fprintf(stderr,
 		    "\rverifying vdev %llu, space map entry %llu of %llu ...",
 		    (longlong_t)vd->vdev_id,
 		    (longlong_t)vcsec->vcsec_entryid,
 		    (longlong_t)vcsec->vcsec_num_entries);
 	}
 	vcsec->vcsec_entryid++;
 
 	/*
 	 * See comment in checkpoint_sm_exclude_entry_cb()
 	 */
 	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
 	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
 
 	/*
 	 * The entries in the vdev_checkpoint_sm should be marked as
 	 * allocated in the checkpointed state of the pool, therefore
 	 * their respective ms_allocateable trees should not contain them.
 	 */
 	mutex_enter(&ms->ms_lock);
 	range_tree_verify_not_present(ms->ms_allocatable,
 	    sme->sme_offset, sme->sme_run);
 	mutex_exit(&ms->ms_lock);
 
 	return (0);
 }
 
 /*
  * Verify that all segments in the vdev_checkpoint_sm are allocated
  * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's
  * ms_allocatable).
  *
  * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of
  * each vdev in the current state of the pool to the metaslab space maps
  * (ms_sm) of the checkpointed state of the pool.
  *
  * Note that the function changes the state of the ms_allocatable
  * trees of the current spa_t. The entries of these ms_allocatable
  * trees are cleared out and then repopulated from with the free
  * entries of their respective ms_sm space maps.
  */
 static void
 verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
 {
 	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
 	vdev_t *current_rvd = current->spa_root_vdev;
 
 	load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);
 
 	for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) {
 		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c];
 		vdev_t *current_vd = current_rvd->vdev_child[c];
 
 		space_map_t *checkpoint_sm = NULL;
 		uint64_t checkpoint_sm_obj;
 
 		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
 			/*
 			 * Since we don't allow device removal in a pool
 			 * that has a checkpoint, we expect that all removed
 			 * vdevs were removed from the pool before the
 			 * checkpoint.
 			 */
 			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
 			continue;
 		}
 
 		/*
 		 * If the checkpoint space map doesn't exist, then nothing
 		 * here is checkpointed so there's nothing to verify.
 		 */
 		if (current_vd->vdev_top_zap == 0 ||
 		    zap_contains(spa_meta_objset(current),
 		    current_vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
 			continue;
 
 		VERIFY0(zap_lookup(spa_meta_objset(current),
 		    current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
 		    sizeof (uint64_t), 1, &checkpoint_sm_obj));
 
 		VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
 		    checkpoint_sm_obj, 0, current_vd->vdev_asize,
 		    current_vd->vdev_ashift));
 
 		verify_checkpoint_sm_entry_cb_arg_t vcsec;
 		vcsec.vcsec_vd = ckpoint_vd;
 		vcsec.vcsec_entryid = 0;
 		vcsec.vcsec_num_entries =
 		    space_map_length(checkpoint_sm) / sizeof (uint64_t);
 		VERIFY0(space_map_iterate(checkpoint_sm,
 		    space_map_length(checkpoint_sm),
 		    verify_checkpoint_sm_entry_cb, &vcsec));
 		if (dump_opt['m'] > 3)
 			dump_spacemap(current->spa_meta_objset, checkpoint_sm);
 		space_map_close(checkpoint_sm);
 	}
 
 	/*
 	 * If we've added vdevs since we took the checkpoint, ensure
 	 * that their checkpoint space maps are empty.
 	 */
 	if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) {
 		for (uint64_t c = ckpoint_rvd->vdev_children;
 		    c < current_rvd->vdev_children; c++) {
 			vdev_t *current_vd = current_rvd->vdev_child[c];
 			VERIFY3P(current_vd->vdev_checkpoint_sm, ==, NULL);
 		}
 	}
 
 	/* for cleaner progress output */
 	(void) fprintf(stderr, "\n");
 }
 
 /*
  * Verifies that all space that's allocated in the checkpoint is
  * still allocated in the current version, by checking that everything
  * in checkpoint's ms_allocatable (which is actually allocated, not
  * allocatable/free) is not present in current's ms_allocatable.
  *
  * Note that the function changes the state of the ms_allocatable
  * trees of both spas when called. The entries of all ms_allocatable
  * trees are cleared out and then repopulated from their respective
  * ms_sm space maps. In the checkpointed state we load the allocated
  * entries, and in the current state we load the free entries.
  */
 static void
 verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
 {
 	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
 	vdev_t *current_rvd = current->spa_root_vdev;
 
 	load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);
 	load_concrete_ms_allocatable_trees(current, SM_FREE);
 
 	for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {
 		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];
 		vdev_t *current_vd = current_rvd->vdev_child[i];
 
 		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
 			/*
 			 * See comment in verify_checkpoint_vdev_spacemaps()
 			 */
 			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
 			continue;
 		}
 
 		for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {
 			metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];
 			metaslab_t *current_msp = current_vd->vdev_ms[m];
 
 			(void) fprintf(stderr,
 			    "\rverifying vdev %llu of %llu, "
 			    "metaslab %llu of %llu ...",
 			    (longlong_t)current_vd->vdev_id,
 			    (longlong_t)current_rvd->vdev_children,
 			    (longlong_t)current_vd->vdev_ms[m]->ms_id,
 			    (longlong_t)current_vd->vdev_ms_count);
 
 			/*
 			 * We walk through the ms_allocatable trees that
 			 * are loaded with the allocated blocks from the
 			 * ms_sm spacemaps of the checkpoint. For each
 			 * one of these ranges we ensure that none of them
 			 * exists in the ms_allocatable trees of the
 			 * current state which are loaded with the ranges
 			 * that are currently free.
 			 *
 			 * This way we ensure that none of the blocks that
 			 * are part of the checkpoint were freed by mistake.
 			 */
 			range_tree_walk(ckpoint_msp->ms_allocatable,
 			    (range_tree_func_t *)range_tree_verify_not_present,
 			    current_msp->ms_allocatable);
 		}
 	}
 
 	/* for cleaner progress output */
 	(void) fprintf(stderr, "\n");
 }
 
 static void
 verify_checkpoint_blocks(spa_t *spa)
 {
 	ASSERT(!dump_opt['L']);
 
 	spa_t *checkpoint_spa;
 	char *checkpoint_pool;
 	int error = 0;
 
 	/*
 	 * We import the checkpointed state of the pool (under a different
 	 * name) so we can do verification on it against the current state
 	 * of the pool.
 	 */
 	checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL,
 	    NULL);
 	ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
 
 	error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
 	if (error != 0) {
 		fatal("Tried to open pool \"%s\" but spa_open() failed with "
 		    "error %d\n", checkpoint_pool, error);
 	}
 
 	/*
 	 * Ensure that ranges in the checkpoint space maps of each vdev
 	 * are allocated according to the checkpointed state's metaslab
 	 * space maps.
 	 */
 	verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa);
 
 	/*
 	 * Ensure that allocated ranges in the checkpoint's metaslab
 	 * space maps remain allocated in the metaslab space maps of
 	 * the current state.
 	 */
 	verify_checkpoint_ms_spacemaps(checkpoint_spa, spa);
 
 	/*
 	 * Once we are done, we get rid of the checkpointed state.
 	 */
 	spa_close(checkpoint_spa, FTAG);
 	free(checkpoint_pool);
 }
 
 static void
 dump_leftover_checkpoint_blocks(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
 		vdev_t *vd = rvd->vdev_child[i];
 
 		space_map_t *checkpoint_sm = NULL;
 		uint64_t checkpoint_sm_obj;
 
 		if (vd->vdev_top_zap == 0)
 			continue;
 
 		if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
 			continue;
 
 		VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
 		    sizeof (uint64_t), 1, &checkpoint_sm_obj));
 
 		VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
 		    checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
 		dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
 		space_map_close(checkpoint_sm);
 	}
 }
 
 static int
 verify_checkpoint(spa_t *spa)
 {
 	uberblock_t checkpoint;
 	int error;
 
 	if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
 		return (0);
 
 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
 	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
 
 	if (error == ENOENT && !dump_opt['L']) {
 		/*
 		 * If the feature is active but the uberblock is missing
 		 * then we must be in the middle of discarding the
 		 * checkpoint.
 		 */
 		(void) printf("\nPartially discarded checkpoint "
 		    "state found:\n");
 		if (dump_opt['m'] > 3)
 			dump_leftover_checkpoint_blocks(spa);
 		return (0);
 	} else if (error != 0) {
 		(void) printf("lookup error %d when looking for "
 		    "checkpointed uberblock in MOS\n", error);
 		return (error);
 	}
 	dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n");
 
 	if (checkpoint.ub_checkpoint_txg == 0) {
 		(void) printf("\nub_checkpoint_txg not set in checkpointed "
 		    "uberblock\n");
 		error = 3;
 	}
 
 	if (error == 0 && !dump_opt['L'])
 		verify_checkpoint_blocks(spa);
 
 	return (error);
 }
 
 static void
 mos_leaks_cb(void *arg, uint64_t start, uint64_t size)
 {
 	(void) arg;
 	for (uint64_t i = start; i < size; i++) {
 		(void) printf("MOS object %llu referenced but not allocated\n",
 		    (u_longlong_t)i);
 	}
 }
 
 static void
 mos_obj_refd(uint64_t obj)
 {
 	if (obj != 0 && mos_refd_objs != NULL)
 		range_tree_add(mos_refd_objs, obj, 1);
 }
 
 /*
  * Call on a MOS object that may already have been referenced.
  */
 static void
 mos_obj_refd_multiple(uint64_t obj)
 {
 	if (obj != 0 && mos_refd_objs != NULL &&
 	    !range_tree_contains(mos_refd_objs, obj, 1))
 		range_tree_add(mos_refd_objs, obj, 1);
 }
 
 static void
 mos_leak_vdev_top_zap(vdev_t *vd)
 {
 	uint64_t ms_flush_data_obj;
 	int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
 	    vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
 	    sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj);
 	if (error == ENOENT)
 		return;
 	ASSERT0(error);
 
 	mos_obj_refd(ms_flush_data_obj);
 }
 
 static void
 mos_leak_vdev(vdev_t *vd)
 {
 	mos_obj_refd(vd->vdev_dtl_object);
 	mos_obj_refd(vd->vdev_ms_array);
 	mos_obj_refd(vd->vdev_indirect_config.vic_births_object);
 	mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object);
 	mos_obj_refd(vd->vdev_leaf_zap);
 	if (vd->vdev_checkpoint_sm != NULL)
 		mos_obj_refd(vd->vdev_checkpoint_sm->sm_object);
 	if (vd->vdev_indirect_mapping != NULL) {
 		mos_obj_refd(vd->vdev_indirect_mapping->
 		    vim_phys->vimp_counts_object);
 	}
 	if (vd->vdev_obsolete_sm != NULL)
 		mos_obj_refd(vd->vdev_obsolete_sm->sm_object);
 
 	for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 		metaslab_t *ms = vd->vdev_ms[m];
 		mos_obj_refd(space_map_object(ms->ms_sm));
 	}
 
 	if (vd->vdev_root_zap != 0)
 		mos_obj_refd(vd->vdev_root_zap);
 
 	if (vd->vdev_top_zap != 0) {
 		mos_obj_refd(vd->vdev_top_zap);
 		mos_leak_vdev_top_zap(vd);
 	}
 
 	for (uint64_t c = 0; c < vd->vdev_children; c++) {
 		mos_leak_vdev(vd->vdev_child[c]);
 	}
 }
 
 static void
 mos_leak_log_spacemaps(spa_t *spa)
 {
 	uint64_t spacemap_zap;
 	int error = zap_lookup(spa_meta_objset(spa),
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP,
 	    sizeof (spacemap_zap), 1, &spacemap_zap);
 	if (error == ENOENT)
 		return;
 	ASSERT0(error);
 
 	mos_obj_refd(spacemap_zap);
 	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
 	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls))
 		mos_obj_refd(sls->sls_sm_obj);
 }
 
 static void
 errorlog_count_refd(objset_t *mos, uint64_t errlog)
 {
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	for (zap_cursor_init(&zc, mos, errlog);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		mos_obj_refd(za.za_first_integer);
 	}
 	zap_cursor_fini(&zc);
 }
 
 static int
 dump_mos_leaks(spa_t *spa)
 {
 	int rv = 0;
 	objset_t *mos = spa->spa_meta_objset;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 
 	/* Visit and mark all referenced objects in the MOS */
 
 	mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT);
 	mos_obj_refd(spa->spa_pool_props_object);
 	mos_obj_refd(spa->spa_config_object);
 	mos_obj_refd(spa->spa_ddt_stat_object);
 	mos_obj_refd(spa->spa_feat_desc_obj);
 	mos_obj_refd(spa->spa_feat_enabled_txg_obj);
 	mos_obj_refd(spa->spa_feat_for_read_obj);
 	mos_obj_refd(spa->spa_feat_for_write_obj);
 	mos_obj_refd(spa->spa_history);
 	mos_obj_refd(spa->spa_errlog_last);
 	mos_obj_refd(spa->spa_errlog_scrub);
 
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 		errorlog_count_refd(mos, spa->spa_errlog_last);
 		errorlog_count_refd(mos, spa->spa_errlog_scrub);
 	}
 
 	mos_obj_refd(spa->spa_all_vdev_zaps);
 	mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj);
 	mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj);
 	mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj);
 	bpobj_count_refd(&spa->spa_deferred_bpobj);
 	mos_obj_refd(dp->dp_empty_bpobj);
 	bpobj_count_refd(&dp->dp_obsolete_bpobj);
 	bpobj_count_refd(&dp->dp_free_bpobj);
 	mos_obj_refd(spa->spa_l2cache.sav_object);
 	mos_obj_refd(spa->spa_spares.sav_object);
 
 	if (spa->spa_syncing_log_sm != NULL)
 		mos_obj_refd(spa->spa_syncing_log_sm->sm_object);
 	mos_leak_log_spacemaps(spa);
 
 	mos_obj_refd(spa->spa_condensing_indirect_phys.
 	    scip_next_mapping_object);
 	mos_obj_refd(spa->spa_condensing_indirect_phys.
 	    scip_prev_obsolete_sm_object);
 	if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) {
 		vdev_indirect_mapping_t *vim =
 		    vdev_indirect_mapping_open(mos,
 		    spa->spa_condensing_indirect_phys.scip_next_mapping_object);
 		mos_obj_refd(vim->vim_phys->vimp_counts_object);
 		vdev_indirect_mapping_close(vim);
 	}
 	deleted_livelists_dump_mos(spa);
 
 	if (dp->dp_origin_snap != NULL) {
 		dsl_dataset_t *ds;
 
 		dsl_pool_config_enter(dp, FTAG);
 		VERIFY0(dsl_dataset_hold_obj(dp,
 		    dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj,
 		    FTAG, &ds));
 		count_ds_mos_objects(ds);
 		dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
 		dsl_dataset_rele(ds, FTAG);
 		dsl_pool_config_exit(dp, FTAG);
 
 		count_ds_mos_objects(dp->dp_origin_snap);
 		dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist");
 	}
 	count_dir_mos_objects(dp->dp_mos_dir);
 	if (dp->dp_free_dir != NULL)
 		count_dir_mos_objects(dp->dp_free_dir);
 	if (dp->dp_leak_dir != NULL)
 		count_dir_mos_objects(dp->dp_leak_dir);
 
 	mos_leak_vdev(spa->spa_root_vdev);
 
 	for (uint64_t class = 0; class < DDT_CLASSES; class++) {
 		for (uint64_t type = 0; type < DDT_TYPES; type++) {
 			for (uint64_t cksum = 0;
 			    cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) {
 				ddt_t *ddt = spa->spa_ddt[cksum];
 				if (!ddt)
 					continue;
 				mos_obj_refd(ddt->ddt_object[type][class]);
 			}
 		}
 	}
 
 	if (spa->spa_brt != NULL) {
 		brt_t *brt = spa->spa_brt;
 		for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
 			brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
 			if (brtvd != NULL && brtvd->bv_initiated) {
 				mos_obj_refd(brtvd->bv_mos_brtvdev);
 				mos_obj_refd(brtvd->bv_mos_entries);
 			}
 		}
 	}
 
 	/*
 	 * Visit all allocated objects and make sure they are referenced.
 	 */
 	uint64_t object = 0;
 	while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) {
 		if (range_tree_contains(mos_refd_objs, object, 1)) {
 			range_tree_remove(mos_refd_objs, object, 1);
 		} else {
 			dmu_object_info_t doi;
 			const char *name;
 			VERIFY0(dmu_object_info(mos, object, &doi));
 			if (doi.doi_type & DMU_OT_NEWTYPE) {
 				dmu_object_byteswap_t bswap =
 				    DMU_OT_BYTESWAP(doi.doi_type);
 				name = dmu_ot_byteswap[bswap].ob_name;
 			} else {
 				name = dmu_ot[doi.doi_type].ot_name;
 			}
 
 			(void) printf("MOS object %llu (%s) leaked\n",
 			    (u_longlong_t)object, name);
 			rv = 2;
 		}
 	}
 	(void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL);
 	if (!range_tree_is_empty(mos_refd_objs))
 		rv = 2;
 	range_tree_vacate(mos_refd_objs, NULL, NULL);
 	range_tree_destroy(mos_refd_objs);
 	return (rv);
 }
 
 typedef struct log_sm_obsolete_stats_arg {
 	uint64_t lsos_current_txg;
 
 	uint64_t lsos_total_entries;
 	uint64_t lsos_valid_entries;
 
 	uint64_t lsos_sm_entries;
 	uint64_t lsos_valid_sm_entries;
 } log_sm_obsolete_stats_arg_t;
 
 static int
 log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme,
     uint64_t txg, void *arg)
 {
 	log_sm_obsolete_stats_arg_t *lsos = arg;
 
 	uint64_t offset = sme->sme_offset;
 	uint64_t vdev_id = sme->sme_vdev;
 
 	if (lsos->lsos_current_txg == 0) {
 		/* this is the first log */
 		lsos->lsos_current_txg = txg;
 	} else if (lsos->lsos_current_txg < txg) {
 		/* we just changed log - print stats and reset */
 		(void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
 		    (u_longlong_t)lsos->lsos_valid_sm_entries,
 		    (u_longlong_t)lsos->lsos_sm_entries,
 		    (u_longlong_t)lsos->lsos_current_txg);
 		lsos->lsos_valid_sm_entries = 0;
 		lsos->lsos_sm_entries = 0;
 		lsos->lsos_current_txg = txg;
 	}
 	ASSERT3U(lsos->lsos_current_txg, ==, txg);
 
 	lsos->lsos_sm_entries++;
 	lsos->lsos_total_entries++;
 
 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
 	if (!vdev_is_concrete(vd))
 		return (0);
 
 	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
 
 	if (txg < metaslab_unflushed_txg(ms))
 		return (0);
 	lsos->lsos_valid_sm_entries++;
 	lsos->lsos_valid_entries++;
 	return (0);
 }
 
 static void
 dump_log_spacemap_obsolete_stats(spa_t *spa)
 {
 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
 		return;
 
 	log_sm_obsolete_stats_arg_t lsos = {0};
 
 	(void) printf("Log Space Map Obsolete Entry Statistics:\n");
 
 	iterate_through_spacemap_logs(spa,
 	    log_spacemap_obsolete_stats_cb, &lsos);
 
 	/* print stats for latest log */
 	(void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
 	    (u_longlong_t)lsos.lsos_valid_sm_entries,
 	    (u_longlong_t)lsos.lsos_sm_entries,
 	    (u_longlong_t)lsos.lsos_current_txg);
 
 	(void) printf("%-8llu valid entries out of %-8llu - total\n\n",
 	    (u_longlong_t)lsos.lsos_valid_entries,
 	    (u_longlong_t)lsos.lsos_total_entries);
 }
 
 static void
 dump_zpool(spa_t *spa)
 {
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	int rc = 0;
 
 	if (dump_opt['y']) {
 		livelist_metaslab_validate(spa);
 	}
 
 	if (dump_opt['S']) {
 		dump_simulated_ddt(spa);
 		return;
 	}
 
 	if (!dump_opt['e'] && dump_opt['C'] > 1) {
 		(void) printf("\nCached configuration:\n");
 		dump_nvlist(spa->spa_config, 8);
 	}
 
 	if (dump_opt['C'])
 		dump_config(spa);
 
 	if (dump_opt['u'])
 		dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");
 
 	if (dump_opt['D'])
 		dump_all_ddts(spa);
 
 	if (dump_opt['T'])
 		dump_brt(spa);
 
 	if (dump_opt['d'] > 2 || dump_opt['m'])
 		dump_metaslabs(spa);
 	if (dump_opt['M'])
 		dump_metaslab_groups(spa, dump_opt['M'] > 1);
 	if (dump_opt['d'] > 2 || dump_opt['m']) {
 		dump_log_spacemaps(spa);
 		dump_log_spacemap_obsolete_stats(spa);
 	}
 
 	if (dump_opt['d'] || dump_opt['i']) {
 		spa_feature_t f;
 		mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
 		    0);
 		dump_objset(dp->dp_meta_objset);
 
 		if (dump_opt['d'] >= 3) {
 			dsl_pool_t *dp = spa->spa_dsl_pool;
 			dump_full_bpobj(&spa->spa_deferred_bpobj,
 			    "Deferred frees", 0);
 			if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 				dump_full_bpobj(&dp->dp_free_bpobj,
 				    "Pool snapshot frees", 0);
 			}
 			if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
 				ASSERT(spa_feature_is_enabled(spa,
 				    SPA_FEATURE_DEVICE_REMOVAL));
 				dump_full_bpobj(&dp->dp_obsolete_bpobj,
 				    "Pool obsolete blocks", 0);
 			}
 
 			if (spa_feature_is_active(spa,
 			    SPA_FEATURE_ASYNC_DESTROY)) {
 				dump_bptree(spa->spa_meta_objset,
 				    dp->dp_bptree_obj,
 				    "Pool dataset frees");
 			}
 			dump_dtl(spa->spa_root_vdev, 0);
 		}
 
 		for (spa_feature_t f = 0; f < SPA_FEATURES; f++)
 			global_feature_count[f] = UINT64_MAX;
 		global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0;
 		global_feature_count[SPA_FEATURE_REDACTION_LIST_SPILL] = 0;
 		global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0;
 		global_feature_count[SPA_FEATURE_LIVELIST] = 0;
 
 		(void) dmu_objset_find(spa_name(spa), dump_one_objset,
 		    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
 
 		if (rc == 0 && !dump_opt['L'])
 			rc = dump_mos_leaks(spa);
 
 		for (f = 0; f < SPA_FEATURES; f++) {
 			uint64_t refcount;
 
 			uint64_t *arr;
 			if (!(spa_feature_table[f].fi_flags &
 			    ZFEATURE_FLAG_PER_DATASET)) {
 				if (global_feature_count[f] == UINT64_MAX)
 					continue;
 				if (!spa_feature_is_enabled(spa, f)) {
 					ASSERT0(global_feature_count[f]);
 					continue;
 				}
 				arr = global_feature_count;
 			} else {
 				if (!spa_feature_is_enabled(spa, f)) {
 					ASSERT0(dataset_feature_count[f]);
 					continue;
 				}
 				arr = dataset_feature_count;
 			}
 			if (feature_get_refcount(spa, &spa_feature_table[f],
 			    &refcount) == ENOTSUP)
 				continue;
 			if (arr[f] != refcount) {
 				(void) printf("%s feature refcount mismatch: "
 				    "%lld consumers != %lld refcount\n",
 				    spa_feature_table[f].fi_uname,
 				    (longlong_t)arr[f], (longlong_t)refcount);
 				rc = 2;
 			} else {
 				(void) printf("Verified %s feature refcount "
 				    "of %llu is correct\n",
 				    spa_feature_table[f].fi_uname,
 				    (longlong_t)refcount);
 			}
 		}
 
 		if (rc == 0)
 			rc = verify_device_removal_feature_counts(spa);
 	}
 
 	if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
 		rc = dump_block_stats(spa);
 
 	if (rc == 0)
 		rc = verify_spacemap_refcounts(spa);
 
 	if (dump_opt['s'])
 		show_pool_stats(spa);
 
 	if (dump_opt['h'])
 		dump_history(spa);
 
 	if (rc == 0)
 		rc = verify_checkpoint(spa);
 
 	if (rc != 0) {
 		dump_debug_buffer();
 		exit(rc);
 	}
 }
 
 #define	ZDB_FLAG_CHECKSUM	0x0001
 #define	ZDB_FLAG_DECOMPRESS	0x0002
 #define	ZDB_FLAG_BSWAP		0x0004
 #define	ZDB_FLAG_GBH		0x0008
 #define	ZDB_FLAG_INDIRECT	0x0010
 #define	ZDB_FLAG_RAW		0x0020
 #define	ZDB_FLAG_PRINT_BLKPTR	0x0040
 #define	ZDB_FLAG_VERBOSE	0x0080
 
 static int flagbits[256];
 static char flagbitstr[16];
 
 static void
 zdb_print_blkptr(const blkptr_t *bp, int flags)
 {
 	char blkbuf[BP_SPRINTF_LEN];
 
 	if (flags & ZDB_FLAG_BSWAP)
 		byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
 
 	snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 	(void) printf("%s\n", blkbuf);
 }
 
 static void
 zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
 {
 	int i;
 
 	for (i = 0; i < nbps; i++)
 		zdb_print_blkptr(&bp[i], flags);
 }
 
 static void
 zdb_dump_gbh(void *buf, int flags)
 {
 	zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
 }
 
 static void
 zdb_dump_block_raw(void *buf, uint64_t size, int flags)
 {
 	if (flags & ZDB_FLAG_BSWAP)
 		byteswap_uint64_array(buf, size);
 	VERIFY(write(fileno(stdout), buf, size) == size);
 }
 
 static void
 zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
 {
 	uint64_t *d = (uint64_t *)buf;
 	unsigned nwords = size / sizeof (uint64_t);
 	int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
 	unsigned i, j;
 	const char *hdr;
 	char *c;
 
 
 	if (do_bswap)
 		hdr = " 7 6 5 4 3 2 1 0   f e d c b a 9 8";
 	else
 		hdr = " 0 1 2 3 4 5 6 7   8 9 a b c d e f";
 
 	(void) printf("\n%s\n%6s   %s  0123456789abcdef\n", label, "", hdr);
 
 #ifdef _LITTLE_ENDIAN
 	/* correct the endianness */
 	do_bswap = !do_bswap;
 #endif
 	for (i = 0; i < nwords; i += 2) {
 		(void) printf("%06llx:  %016llx  %016llx  ",
 		    (u_longlong_t)(i * sizeof (uint64_t)),
 		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
 		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
 
 		c = (char *)&d[i];
 		for (j = 0; j < 2 * sizeof (uint64_t); j++)
 			(void) printf("%c", isprint(c[j]) ? c[j] : '.');
 		(void) printf("\n");
 	}
 }
 
 /*
  * There are two acceptable formats:
  *	leaf_name	  - For example: c1t0d0 or /tmp/ztest.0a
  *	child[.child]*    - For example: 0.1.1
  *
  * The second form can be used to specify arbitrary vdevs anywhere
  * in the hierarchy.  For example, in a pool with a mirror of
  * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
  */
 static vdev_t *
 zdb_vdev_lookup(vdev_t *vdev, const char *path)
 {
 	char *s, *p, *q;
 	unsigned i;
 
 	if (vdev == NULL)
 		return (NULL);
 
 	/* First, assume the x.x.x.x format */
 	i = strtoul(path, &s, 10);
 	if (s == path || (s && *s != '.' && *s != '\0'))
 		goto name;
 	if (i >= vdev->vdev_children)
 		return (NULL);
 
 	vdev = vdev->vdev_child[i];
 	if (s && *s == '\0')
 		return (vdev);
 	return (zdb_vdev_lookup(vdev, s+1));
 
 name:
 	for (i = 0; i < vdev->vdev_children; i++) {
 		vdev_t *vc = vdev->vdev_child[i];
 
 		if (vc->vdev_path == NULL) {
 			vc = zdb_vdev_lookup(vc, path);
 			if (vc == NULL)
 				continue;
 			else
 				return (vc);
 		}
 
 		p = strrchr(vc->vdev_path, '/');
 		p = p ? p + 1 : vc->vdev_path;
 		q = &vc->vdev_path[strlen(vc->vdev_path) - 2];
 
 		if (strcmp(vc->vdev_path, path) == 0)
 			return (vc);
 		if (strcmp(p, path) == 0)
 			return (vc);
 		if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
 			return (vc);
 	}
 
 	return (NULL);
 }
 
 static int
 name_from_objset_id(spa_t *spa, uint64_t objset_id, char *outstr)
 {
 	dsl_dataset_t *ds;
 
 	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
 	int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, objset_id,
 	    NULL, &ds);
 	if (error != 0) {
 		(void) fprintf(stderr, "failed to hold objset %llu: %s\n",
 		    (u_longlong_t)objset_id, strerror(error));
 		dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
 		return (error);
 	}
 	dsl_dataset_name(ds, outstr);
 	dsl_dataset_rele(ds, NULL);
 	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
 	return (0);
 }
 
 static boolean_t
 zdb_parse_block_sizes(char *sizes, uint64_t *lsize, uint64_t *psize)
 {
 	char *s0, *s1, *tmp = NULL;
 
 	if (sizes == NULL)
 		return (B_FALSE);
 
 	s0 = strtok_r(sizes, "/", &tmp);
 	if (s0 == NULL)
 		return (B_FALSE);
 	s1 = strtok_r(NULL, "/", &tmp);
 	*lsize = strtoull(s0, NULL, 16);
 	*psize = s1 ? strtoull(s1, NULL, 16) : *lsize;
 	return (*lsize >= *psize && *psize > 0);
 }
 
 #define	ZIO_COMPRESS_MASK(alg)	(1ULL << (ZIO_COMPRESS_##alg))
 
 static boolean_t
 try_decompress_block(abd_t *pabd, uint64_t lsize, uint64_t psize,
     int flags, int cfunc, void *lbuf, void *lbuf2)
 {
 	if (flags & ZDB_FLAG_VERBOSE) {
 		(void) fprintf(stderr,
 		    "Trying %05llx -> %05llx (%s)\n",
 		    (u_longlong_t)psize,
 		    (u_longlong_t)lsize,
 		    zio_compress_table[cfunc].ci_name);
 	}
 
 	/*
 	 * We set lbuf to all zeros and lbuf2 to all
 	 * ones, then decompress to both buffers and
 	 * compare their contents. This way we can
 	 * know if decompression filled exactly to
 	 * lsize or if it left some bytes unwritten.
 	 */
 
 	memset(lbuf, 0x00, lsize);
 	memset(lbuf2, 0xff, lsize);
 
 	if (zio_decompress_data(cfunc, pabd,
 	    lbuf, psize, lsize, NULL) == 0 &&
 	    zio_decompress_data(cfunc, pabd,
 	    lbuf2, psize, lsize, NULL) == 0 &&
 	    memcmp(lbuf, lbuf2, lsize) == 0)
 		return (B_TRUE);
 	return (B_FALSE);
 }
 
 static uint64_t
 zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize,
     uint64_t psize, int flags)
 {
 	(void) buf;
 	uint64_t orig_lsize = lsize;
 	boolean_t tryzle = ((getenv("ZDB_NO_ZLE") == NULL));
 	boolean_t found = B_FALSE;
 	/*
 	 * We don't know how the data was compressed, so just try
 	 * every decompress function at every inflated blocksize.
 	 */
 	void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 	int cfuncs[ZIO_COMPRESS_FUNCTIONS] = { 0 };
 	int *cfuncp = cfuncs;
 	uint64_t maxlsize = SPA_MAXBLOCKSIZE;
 	uint64_t mask = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) |
 	    ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) |
 	    ZIO_COMPRESS_MASK(ZLE);
 	*cfuncp++ = ZIO_COMPRESS_LZ4;
 	*cfuncp++ = ZIO_COMPRESS_LZJB;
 	mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB);
 	/*
 	 * Every gzip level has the same decompressor, no need to
 	 * run it 9 times per bruteforce attempt.
 	 */
 	mask |= ZIO_COMPRESS_MASK(GZIP_2) | ZIO_COMPRESS_MASK(GZIP_3);
 	mask |= ZIO_COMPRESS_MASK(GZIP_4) | ZIO_COMPRESS_MASK(GZIP_5);
 	mask |= ZIO_COMPRESS_MASK(GZIP_6) | ZIO_COMPRESS_MASK(GZIP_7);
 	mask |= ZIO_COMPRESS_MASK(GZIP_8) | ZIO_COMPRESS_MASK(GZIP_9);
 	for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++)
 		if (((1ULL << c) & mask) == 0)
 			*cfuncp++ = c;
 
 	/*
 	 * On the one hand, with SPA_MAXBLOCKSIZE at 16MB, this
 	 * could take a while and we should let the user know
 	 * we are not stuck.  On the other hand, printing progress
 	 * info gets old after a while.  User can specify 'v' flag
 	 * to see the progression.
 	 */
 	if (lsize == psize)
 		lsize += SPA_MINBLOCKSIZE;
 	else
 		maxlsize = lsize;
 
 	for (; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) {
 		for (cfuncp = cfuncs; *cfuncp; cfuncp++) {
 			if (try_decompress_block(pabd, lsize, psize, flags,
 			    *cfuncp, lbuf, lbuf2)) {
 				found = B_TRUE;
 				break;
 			}
 		}
 		if (*cfuncp != 0)
 			break;
 	}
 	if (!found && tryzle) {
 		for (lsize = orig_lsize; lsize <= maxlsize;
 		    lsize += SPA_MINBLOCKSIZE) {
 			if (try_decompress_block(pabd, lsize, psize, flags,
 			    ZIO_COMPRESS_ZLE, lbuf, lbuf2)) {
 				*cfuncp = ZIO_COMPRESS_ZLE;
 				found = B_TRUE;
 				break;
 			}
 		}
 	}
 	umem_free(lbuf2, SPA_MAXBLOCKSIZE);
 
 	if (*cfuncp == ZIO_COMPRESS_ZLE) {
 		printf("\nZLE decompression was selected. If you "
 		    "suspect the results are wrong,\ntry avoiding ZLE "
 		    "by setting and exporting ZDB_NO_ZLE=\"true\"\n");
 	}
 
 	return (lsize > maxlsize ? -1 : lsize);
 }
 
 /*
  * Read a block from a pool and print it out.  The syntax of the
  * block descriptor is:
  *
  *	pool:vdev_specifier:offset:[lsize/]psize[:flags]
  *
  *	pool           - The name of the pool you wish to read from
  *	vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
  *	offset         - offset, in hex, in bytes
  *	size           - Amount of data to read, in hex, in bytes
  *	flags          - A string of characters specifying options
  *		 b: Decode a blkptr at given offset within block
  *		 c: Calculate and display checksums
  *		 d: Decompress data before dumping
  *		 e: Byteswap data before dumping
  *		 g: Display data as a gang block header
  *		 i: Display as an indirect block
  *		 r: Dump raw data to stdout
  *		 v: Verbose
  *
  */
 static void
 zdb_read_block(char *thing, spa_t *spa)
 {
 	blkptr_t blk, *bp = &blk;
 	dva_t *dva = bp->blk_dva;
 	int flags = 0;
 	uint64_t offset = 0, psize = 0, lsize = 0, blkptr_offset = 0;
 	zio_t *zio;
 	vdev_t *vd;
 	abd_t *pabd;
 	void *lbuf, *buf;
 	char *s, *p, *dup, *flagstr, *sizes, *tmp = NULL;
 	const char *vdev, *errmsg = NULL;
 	int i, error;
 	boolean_t borrowed = B_FALSE, found = B_FALSE;
 
 	dup = strdup(thing);
 	s = strtok_r(dup, ":", &tmp);
 	vdev = s ?: "";
 	s = strtok_r(NULL, ":", &tmp);
 	offset = strtoull(s ? s : "", NULL, 16);
 	sizes = strtok_r(NULL, ":", &tmp);
 	s = strtok_r(NULL, ":", &tmp);
 	flagstr = strdup(s ?: "");
 
 	if (!zdb_parse_block_sizes(sizes, &lsize, &psize))
 		errmsg = "invalid size(s)";
 	if (!IS_P2ALIGNED(psize, DEV_BSIZE) || !IS_P2ALIGNED(lsize, DEV_BSIZE))
 		errmsg = "size must be a multiple of sector size";
 	if (!IS_P2ALIGNED(offset, DEV_BSIZE))
 		errmsg = "offset must be a multiple of sector size";
 	if (errmsg) {
 		(void) printf("Invalid block specifier: %s  - %s\n",
 		    thing, errmsg);
 		goto done;
 	}
 
 	tmp = NULL;
 	for (s = strtok_r(flagstr, ":", &tmp);
 	    s != NULL;
 	    s = strtok_r(NULL, ":", &tmp)) {
 		for (i = 0; i < strlen(flagstr); i++) {
 			int bit = flagbits[(uchar_t)flagstr[i]];
 
 			if (bit == 0) {
 				(void) printf("***Ignoring flag: %c\n",
 				    (uchar_t)flagstr[i]);
 				continue;
 			}
 			found = B_TRUE;
 			flags |= bit;
 
 			p = &flagstr[i + 1];
 			if (*p != ':' && *p != '\0') {
 				int j = 0, nextbit = flagbits[(uchar_t)*p];
 				char *end, offstr[8] = { 0 };
 				if ((bit == ZDB_FLAG_PRINT_BLKPTR) &&
 				    (nextbit == 0)) {
 					/* look ahead to isolate the offset */
 					while (nextbit == 0 &&
 					    strchr(flagbitstr, *p) == NULL) {
 						offstr[j] = *p;
 						j++;
 						if (i + j > strlen(flagstr))
 							break;
 						p++;
 						nextbit = flagbits[(uchar_t)*p];
 					}
 					blkptr_offset = strtoull(offstr, &end,
 					    16);
 					i += j;
 				} else if (nextbit == 0) {
 					(void) printf("***Ignoring flag arg:"
 					    " '%c'\n", (uchar_t)*p);
 				}
 			}
 		}
 	}
 	if (blkptr_offset % sizeof (blkptr_t)) {
 		printf("Block pointer offset 0x%llx "
 		    "must be divisible by 0x%x\n",
 		    (longlong_t)blkptr_offset, (int)sizeof (blkptr_t));
 		goto done;
 	}
 	if (found == B_FALSE && strlen(flagstr) > 0) {
 		printf("Invalid flag arg: '%s'\n", flagstr);
 		goto done;
 	}
 
 	vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
 	if (vd == NULL) {
 		(void) printf("***Invalid vdev: %s\n", vdev);
 		goto done;
 	} else {
 		if (vd->vdev_path)
 			(void) fprintf(stderr, "Found vdev: %s\n",
 			    vd->vdev_path);
 		else
 			(void) fprintf(stderr, "Found vdev type: %s\n",
 			    vd->vdev_ops->vdev_op_type);
 	}
 
 	pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
 	lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 
 	BP_ZERO(bp);
 
 	DVA_SET_VDEV(&dva[0], vd->vdev_id);
 	DVA_SET_OFFSET(&dva[0], offset);
 	DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
 	DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
 
 	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
 
 	BP_SET_LSIZE(bp, lsize);
 	BP_SET_PSIZE(bp, psize);
 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
 	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
 	BP_SET_TYPE(bp, DMU_OT_NONE);
 	BP_SET_LEVEL(bp, 0);
 	BP_SET_DEDUP(bp, 0);
 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	zio = zio_root(spa, NULL, NULL, 0);
 
 	if (vd == vd->vdev_top) {
 		/*
 		 * Treat this as a normal block read.
 		 */
 		zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
 		    ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
 	} else {
 		/*
 		 * Treat this as a vdev child I/O.
 		 */
 		zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
 		    psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL,
 		    NULL, NULL));
 	}
 
 	error = zio_wait(zio);
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	if (error) {
 		(void) printf("Read of %s failed, error: %d\n", thing, error);
 		goto out;
 	}
 
 	uint64_t orig_lsize = lsize;
 	buf = lbuf;
 	if (flags & ZDB_FLAG_DECOMPRESS) {
 		lsize = zdb_decompress_block(pabd, buf, lbuf,
 		    lsize, psize, flags);
 		if (lsize == -1) {
 			(void) printf("Decompress of %s failed\n", thing);
 			goto out;
 		}
 	} else {
 		buf = abd_borrow_buf_copy(pabd, lsize);
 		borrowed = B_TRUE;
 	}
 	/*
 	 * Try to detect invalid block pointer.  If invalid, try
 	 * decompressing.
 	 */
 	if ((flags & ZDB_FLAG_PRINT_BLKPTR || flags & ZDB_FLAG_INDIRECT) &&
 	    !(flags & ZDB_FLAG_DECOMPRESS)) {
 		const blkptr_t *b = (const blkptr_t *)(void *)
 		    ((uintptr_t)buf + (uintptr_t)blkptr_offset);
 		if (zfs_blkptr_verify(spa, b,
 		    BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY) == B_FALSE) {
 			abd_return_buf_copy(pabd, buf, lsize);
 			borrowed = B_FALSE;
 			buf = lbuf;
 			lsize = zdb_decompress_block(pabd, buf,
 			    lbuf, lsize, psize, flags);
 			b = (const blkptr_t *)(void *)
 			    ((uintptr_t)buf + (uintptr_t)blkptr_offset);
 			if (lsize == -1 || zfs_blkptr_verify(spa, b,
 			    BLK_CONFIG_NEEDED, BLK_VERIFY_LOG) == B_FALSE) {
 				printf("invalid block pointer at this DVA\n");
 				goto out;
 			}
 		}
 	}
 
 	if (flags & ZDB_FLAG_PRINT_BLKPTR)
 		zdb_print_blkptr((blkptr_t *)(void *)
 		    ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
 	else if (flags & ZDB_FLAG_RAW)
 		zdb_dump_block_raw(buf, lsize, flags);
 	else if (flags & ZDB_FLAG_INDIRECT)
 		zdb_dump_indirect((blkptr_t *)buf,
 		    orig_lsize / sizeof (blkptr_t), flags);
 	else if (flags & ZDB_FLAG_GBH)
 		zdb_dump_gbh(buf, flags);
 	else
 		zdb_dump_block(thing, buf, lsize, flags);
 
 	/*
 	 * If :c was specified, iterate through the checksum table to
 	 * calculate and display each checksum for our specified
 	 * DVA and length.
 	 */
 	if ((flags & ZDB_FLAG_CHECKSUM) && !(flags & ZDB_FLAG_RAW) &&
 	    !(flags & ZDB_FLAG_GBH)) {
 		zio_t *czio;
 		(void) printf("\n");
 		for (enum zio_checksum ck = ZIO_CHECKSUM_LABEL;
 		    ck < ZIO_CHECKSUM_FUNCTIONS; ck++) {
 
 			if ((zio_checksum_table[ck].ci_flags &
 			    ZCHECKSUM_FLAG_EMBEDDED) ||
 			    ck == ZIO_CHECKSUM_NOPARITY) {
 				continue;
 			}
 			BP_SET_CHECKSUM(bp, ck);
 			spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 			czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 			if (vd == vd->vdev_top) {
 				zio_nowait(zio_read(czio, spa, bp, pabd, psize,
 				    NULL, NULL,
 				    ZIO_PRIORITY_SYNC_READ,
 				    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
 				    ZIO_FLAG_DONT_RETRY, NULL));
 			} else {
 				zio_nowait(zio_vdev_child_io(czio, bp, vd,
 				    offset, pabd, psize, ZIO_TYPE_READ,
 				    ZIO_PRIORITY_SYNC_READ,
 				    ZIO_FLAG_DONT_PROPAGATE |
 				    ZIO_FLAG_DONT_RETRY |
 				    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
 				    ZIO_FLAG_SPECULATIVE |
 				    ZIO_FLAG_OPTIONAL, NULL, NULL));
 			}
 			error = zio_wait(czio);
 			if (error == 0 || error == ECKSUM) {
 				zio_t *ck_zio = zio_null(NULL, spa, NULL,
 				    NULL, NULL, 0);
 				ck_zio->io_offset =
 				    DVA_GET_OFFSET(&bp->blk_dva[0]);
 				ck_zio->io_bp = bp;
 				zio_checksum_compute(ck_zio, ck, pabd, lsize);
 				printf(
 				    "%12s\t"
 				    "cksum=%016llx:%016llx:%016llx:%016llx\n",
 				    zio_checksum_table[ck].ci_name,
 				    (u_longlong_t)bp->blk_cksum.zc_word[0],
 				    (u_longlong_t)bp->blk_cksum.zc_word[1],
 				    (u_longlong_t)bp->blk_cksum.zc_word[2],
 				    (u_longlong_t)bp->blk_cksum.zc_word[3]);
 				zio_wait(ck_zio);
 			} else {
 				printf("error %d reading block\n", error);
 			}
 			spa_config_exit(spa, SCL_STATE, FTAG);
 		}
 	}
 
 	if (borrowed)
 		abd_return_buf_copy(pabd, buf, lsize);
 
 out:
 	abd_free(pabd);
 	umem_free(lbuf, SPA_MAXBLOCKSIZE);
 done:
 	free(flagstr);
 	free(dup);
 }
 
 static void
 zdb_embedded_block(char *thing)
 {
 	blkptr_t bp = {{{{0}}}};
 	unsigned long long *words = (void *)&bp;
 	char *buf;
 	int err;
 
 	err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"
 	    "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",
 	    words + 0, words + 1, words + 2, words + 3,
 	    words + 4, words + 5, words + 6, words + 7,
 	    words + 8, words + 9, words + 10, words + 11,
 	    words + 12, words + 13, words + 14, words + 15);
 	if (err != 16) {
 		(void) fprintf(stderr, "invalid input format\n");
 		exit(1);
 	}
 	ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
 	buf = malloc(SPA_MAXBLOCKSIZE);
 	if (buf == NULL) {
 		(void) fprintf(stderr, "out of memory\n");
 		exit(1);
 	}
 	err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));
 	if (err != 0) {
 		(void) fprintf(stderr, "decode failed: %u\n", err);
 		exit(1);
 	}
 	zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);
 	free(buf);
 }
 
 /* check for valid hex or decimal numeric string */
 static boolean_t
 zdb_numeric(char *str)
 {
 	int i = 0;
 
 	if (strlen(str) == 0)
 		return (B_FALSE);
 	if (strncmp(str, "0x", 2) == 0 || strncmp(str, "0X", 2) == 0)
 		i = 2;
 	for (; i < strlen(str); i++) {
 		if (!isxdigit(str[i]))
 			return (B_FALSE);
 	}
 	return (B_TRUE);
 }
 
 int
 main(int argc, char **argv)
 {
 	int c;
 	spa_t *spa = NULL;
 	objset_t *os = NULL;
 	int dump_all = 1;
 	int verbose = 0;
 	int error = 0;
 	char **searchdirs = NULL;
 	int nsearch = 0;
 	char *target, *target_pool, dsname[ZFS_MAX_DATASET_NAME_LEN];
 	nvlist_t *policy = NULL;
 	uint64_t max_txg = UINT64_MAX;
 	int64_t objset_id = -1;
 	uint64_t object;
 	int flags = ZFS_IMPORT_MISSING_LOG;
 	int rewind = ZPOOL_NEVER_REWIND;
 	char *spa_config_path_env, *objset_str;
 	boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE;
 	nvlist_t *cfg = NULL;
 
 	dprintf_setup(&argc, argv);
 
 	/*
 	 * If there is an environment variable SPA_CONFIG_PATH it overrides
 	 * default spa_config_path setting. If -U flag is specified it will
 	 * override this environment variable settings once again.
 	 */
 	spa_config_path_env = getenv("SPA_CONFIG_PATH");
 	if (spa_config_path_env != NULL)
 		spa_config_path = spa_config_path_env;
 
 	/*
 	 * For performance reasons, we set this tunable down. We do so before
 	 * the arg parsing section so that the user can override this value if
 	 * they choose.
 	 */
 	zfs_btree_verify_intensity = 3;
 
 	struct option long_options[] = {
 		{"ignore-assertions",	no_argument,		NULL, 'A'},
 		{"block-stats",		no_argument,		NULL, 'b'},
 		{"backup",		no_argument,		NULL, 'B'},
 		{"checksum",		no_argument,		NULL, 'c'},
 		{"config",		no_argument,		NULL, 'C'},
 		{"datasets",		no_argument,		NULL, 'd'},
 		{"dedup-stats",		no_argument,		NULL, 'D'},
 		{"exported",		no_argument,		NULL, 'e'},
 		{"embedded-block-pointer",	no_argument,	NULL, 'E'},
 		{"automatic-rewind",	no_argument,		NULL, 'F'},
 		{"dump-debug-msg",	no_argument,		NULL, 'G'},
 		{"history",		no_argument,		NULL, 'h'},
 		{"intent-logs",		no_argument,		NULL, 'i'},
 		{"inflight",		required_argument,	NULL, 'I'},
 		{"checkpointed-state",	no_argument,		NULL, 'k'},
 		{"key",			required_argument,	NULL, 'K'},
 		{"label",		no_argument,		NULL, 'l'},
 		{"disable-leak-tracking",	no_argument,	NULL, 'L'},
 		{"metaslabs",		no_argument,		NULL, 'm'},
 		{"metaslab-groups",	no_argument,		NULL, 'M'},
 		{"numeric",		no_argument,		NULL, 'N'},
 		{"option",		required_argument,	NULL, 'o'},
 		{"object-lookups",	no_argument,		NULL, 'O'},
 		{"path",		required_argument,	NULL, 'p'},
 		{"parseable",		no_argument,		NULL, 'P'},
 		{"skip-label",		no_argument,		NULL, 'q'},
 		{"copy-object",		no_argument,		NULL, 'r'},
 		{"read-block",		no_argument,		NULL, 'R'},
 		{"io-stats",		no_argument,		NULL, 's'},
 		{"simulate-dedup",	no_argument,		NULL, 'S'},
 		{"txg",			required_argument,	NULL, 't'},
 		{"brt-stats",		no_argument,		NULL, 'T'},
 		{"uberblock",		no_argument,		NULL, 'u'},
 		{"cachefile",		required_argument,	NULL, 'U'},
 		{"verbose",		no_argument,		NULL, 'v'},
 		{"verbatim",		no_argument,		NULL, 'V'},
 		{"dump-blocks",		required_argument,	NULL, 'x'},
 		{"extreme-rewind",	no_argument,		NULL, 'X'},
 		{"all-reconstruction",	no_argument,		NULL, 'Y'},
 		{"livelist",		no_argument,		NULL, 'y'},
 		{"zstd-headers",	no_argument,		NULL, 'Z'},
 		{0, 0, 0, 0}
 	};
 
 	while ((c = getopt_long(argc, argv,
 	    "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:TuU:vVx:XYyZ",
 	    long_options, NULL)) != -1) {
 		switch (c) {
 		case 'b':
 		case 'B':
 		case 'c':
 		case 'C':
 		case 'd':
 		case 'D':
 		case 'E':
 		case 'G':
 		case 'h':
 		case 'i':
 		case 'l':
 		case 'm':
 		case 'M':
 		case 'N':
 		case 'O':
 		case 'r':
 		case 'R':
 		case 's':
 		case 'S':
 		case 'T':
 		case 'u':
 		case 'y':
 		case 'Z':
 			dump_opt[c]++;
 			dump_all = 0;
 			break;
 		case 'A':
 		case 'e':
 		case 'F':
 		case 'k':
 		case 'L':
 		case 'P':
 		case 'q':
 		case 'X':
 			dump_opt[c]++;
 			break;
 		case 'Y':
 			zfs_reconstruct_indirect_combinations_max = INT_MAX;
 			zfs_deadman_enabled = 0;
 			break;
 		/* NB: Sort single match options below. */
 		case 'I':
 			max_inflight_bytes = strtoull(optarg, NULL, 0);
 			if (max_inflight_bytes == 0) {
 				(void) fprintf(stderr, "maximum number "
 				    "of inflight bytes must be greater "
 				    "than 0\n");
 				usage();
 			}
 			break;
 		case 'K':
 			dump_opt[c]++;
 			key_material = strdup(optarg);
 			/* redact key material in process table */
 			while (*optarg != '\0') { *optarg++ = '*'; }
 			break;
 		case 'o':
 			error = set_global_var(optarg);
 			if (error != 0)
 				usage();
 			break;
 		case 'p':
 			if (searchdirs == NULL) {
 				searchdirs = umem_alloc(sizeof (char *),
 				    UMEM_NOFAIL);
 			} else {
 				char **tmp = umem_alloc((nsearch + 1) *
 				    sizeof (char *), UMEM_NOFAIL);
 				memcpy(tmp, searchdirs, nsearch *
 				    sizeof (char *));
 				umem_free(searchdirs,
 				    nsearch * sizeof (char *));
 				searchdirs = tmp;
 			}
 			searchdirs[nsearch++] = optarg;
 			break;
 		case 't':
 			max_txg = strtoull(optarg, NULL, 0);
 			if (max_txg < TXG_INITIAL) {
 				(void) fprintf(stderr, "incorrect txg "
 				    "specified: %s\n", optarg);
 				usage();
 			}
 			break;
 		case 'U':
 			spa_config_path = optarg;
 			if (spa_config_path[0] != '/') {
 				(void) fprintf(stderr,
 				    "cachefile must be an absolute path "
 				    "(i.e. start with a slash)\n");
 				usage();
 			}
 			break;
 		case 'v':
 			verbose++;
 			break;
 		case 'V':
 			flags = ZFS_IMPORT_VERBATIM;
 			break;
 		case 'x':
 			vn_dumpdir = optarg;
 			break;
 		default:
 			usage();
 			break;
 		}
 	}
 
 	if (!dump_opt['e'] && searchdirs != NULL) {
 		(void) fprintf(stderr, "-p option requires use of -e\n");
 		usage();
 	}
 #if defined(_LP64)
 	/*
 	 * ZDB does not typically re-read blocks; therefore limit the ARC
 	 * to 256 MB, which can be used entirely for metadata.
 	 */
 	zfs_arc_min = 2ULL << SPA_MAXBLOCKSHIFT;
 	zfs_arc_max = 256 * 1024 * 1024;
 #endif
 
 	/*
 	 * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
 	 * "zdb -b" uses traversal prefetch which uses async reads.
 	 * For good performance, let several of them be active at once.
 	 */
 	zfs_vdev_async_read_max_active = 10;
 
 	/*
 	 * Disable reference tracking for better performance.
 	 */
 	reference_tracking_enable = B_FALSE;
 
 	/*
 	 * Do not fail spa_load when spa_load_verify fails. This is needed
 	 * to load non-idle pools.
 	 */
 	spa_load_verify_dryrun = B_TRUE;
 
 	/*
 	 * ZDB should have ability to read spacemaps.
 	 */
 	spa_mode_readable_spacemaps = B_TRUE;
 
 	kernel_init(SPA_MODE_READ);
 
 	if (dump_all)
 		verbose = MAX(verbose, 1);
 
 	for (c = 0; c < 256; c++) {
 		if (dump_all && strchr("ABeEFkKlLNOPrRSXy", c) == NULL)
 			dump_opt[c] = 1;
 		if (dump_opt[c])
 			dump_opt[c] += verbose;
 	}
 
 	libspl_set_assert_ok((dump_opt['A'] == 1) || (dump_opt['A'] > 2));
 	zfs_recover = (dump_opt['A'] > 1);
 
 	argc -= optind;
 	argv += optind;
 	if (argc < 2 && dump_opt['R'])
 		usage();
 
 	if (dump_opt['E']) {
 		if (argc != 1)
 			usage();
 		zdb_embedded_block(argv[0]);
 		return (0);
 	}
 
 	if (argc < 1) {
 		if (!dump_opt['e'] && dump_opt['C']) {
 			dump_cachefile(spa_config_path);
 			return (0);
 		}
 		usage();
 	}
 
 	if (dump_opt['l'])
 		return (dump_label(argv[0]));
 
 	if (dump_opt['X'] || dump_opt['F'])
 		rewind = ZPOOL_DO_REWIND |
 		    (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
 
 	/* -N implies -d */
 	if (dump_opt['N'] && dump_opt['d'] == 0)
 		dump_opt['d'] = dump_opt['N'];
 
 	if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
 	    nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 ||
 	    nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0)
 		fatal("internal error: %s", strerror(ENOMEM));
 
 	error = 0;
 	target = argv[0];
 
 	if (strpbrk(target, "/@") != NULL) {
 		size_t targetlen;
 
 		target_pool = strdup(target);
 		*strpbrk(target_pool, "/@") = '\0';
 
 		target_is_spa = B_FALSE;
 		targetlen = strlen(target);
 		if (targetlen && target[targetlen - 1] == '/')
 			target[targetlen - 1] = '\0';
 
 		/*
 		 * See if an objset ID was supplied (-d <pool>/<objset ID>).
 		 * To disambiguate tank/100, consider the 100 as objsetID
 		 * if -N was given, otherwise 100 is an objsetID iff
 		 * tank/100 as a named dataset fails on lookup.
 		 */
 		objset_str = strchr(target, '/');
 		if (objset_str && strlen(objset_str) > 1 &&
 		    zdb_numeric(objset_str + 1)) {
 			char *endptr;
 			errno = 0;
 			objset_str++;
 			objset_id = strtoull(objset_str, &endptr, 0);
 			/* dataset 0 is the same as opening the pool */
 			if (errno == 0 && endptr != objset_str &&
 			    objset_id != 0) {
 				if (dump_opt['N'])
 					dataset_lookup = B_TRUE;
 			}
 			/* normal dataset name not an objset ID */
 			if (endptr == objset_str) {
 				objset_id = -1;
 			}
 		} else if (objset_str && !zdb_numeric(objset_str + 1) &&
 		    dump_opt['N']) {
 			printf("Supply a numeric objset ID with -N\n");
 			exit(1);
 		}
 	} else {
 		target_pool = target;
 	}
 
 	if (dump_opt['e']) {
 		importargs_t args = { 0 };
 
 		args.paths = nsearch;
 		args.path = searchdirs;
 		args.can_be_active = B_TRUE;
 
 		libpc_handle_t lpch = {
 			.lpc_lib_handle = NULL,
 			.lpc_ops = &libzpool_config_ops,
 			.lpc_printerr = B_TRUE
 		};
 		error = zpool_find_config(&lpch, target_pool, &cfg, &args);
 
 		if (error == 0) {
 
 			if (nvlist_add_nvlist(cfg,
 			    ZPOOL_LOAD_POLICY, policy) != 0) {
 				fatal("can't open '%s': %s",
 				    target, strerror(ENOMEM));
 			}
 
 			if (dump_opt['C'] > 1) {
 				(void) printf("\nConfiguration for import:\n");
 				dump_nvlist(cfg, 8);
 			}
 
 			/*
 			 * Disable the activity check to allow examination of
 			 * active pools.
 			 */
 			error = spa_import(target_pool, cfg, NULL,
 			    flags | ZFS_IMPORT_SKIP_MMP);
 		}
 	}
 
 	if (searchdirs != NULL) {
 		umem_free(searchdirs, nsearch * sizeof (char *));
 		searchdirs = NULL;
 	}
 
 	/*
 	 * We need to make sure to process -O option or call
 	 * dump_path after the -e option has been processed,
 	 * which imports the pool to the namespace if it's
 	 * not in the cachefile.
 	 */
 	if (dump_opt['O']) {
 		if (argc != 2)
 			usage();
 		dump_opt['v'] = verbose + 3;
 		return (dump_path(argv[0], argv[1], NULL));
 	}
 
 	if (dump_opt['r']) {
 		target_is_spa = B_FALSE;
 		if (argc != 3)
 			usage();
 		dump_opt['v'] = verbose;
 		error = dump_path(argv[0], argv[1], &object);
 		if (error != 0)
 			fatal("internal error: %s", strerror(error));
 	}
 
 	/*
 	 * import_checkpointed_state makes the assumption that the
 	 * target pool that we pass it is already part of the spa
 	 * namespace. Because of that we need to make sure to call
 	 * it always after the -e option has been processed, which
 	 * imports the pool to the namespace if it's not in the
 	 * cachefile.
 	 */
 	char *checkpoint_pool = NULL;
 	char *checkpoint_target = NULL;
 	if (dump_opt['k']) {
 		checkpoint_pool = import_checkpointed_state(target, cfg,
 		    &checkpoint_target);
 
 		if (checkpoint_target != NULL)
 			target = checkpoint_target;
 	}
 
 	if (cfg != NULL) {
 		nvlist_free(cfg);
 		cfg = NULL;
 	}
 
 	if (target_pool != target)
 		free(target_pool);
 
 	if (error == 0) {
 		if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) {
 			ASSERT(checkpoint_pool != NULL);
 			ASSERT(checkpoint_target == NULL);
 
 			error = spa_open(checkpoint_pool, &spa, FTAG);
 			if (error != 0) {
 				fatal("Tried to open pool \"%s\" but "
 				    "spa_open() failed with error %d\n",
 				    checkpoint_pool, error);
 			}
 
 		} else if (target_is_spa || dump_opt['R'] || dump_opt['B'] ||
 		    objset_id == 0) {
 			zdb_set_skip_mmp(target);
 			error = spa_open_rewind(target, &spa, FTAG, policy,
 			    NULL);
 			if (error) {
 				/*
 				 * If we're missing the log device then
 				 * try opening the pool after clearing the
 				 * log state.
 				 */
 				mutex_enter(&spa_namespace_lock);
 				if ((spa = spa_lookup(target)) != NULL &&
 				    spa->spa_log_state == SPA_LOG_MISSING) {
 					spa->spa_log_state = SPA_LOG_CLEAR;
 					error = 0;
 				}
 				mutex_exit(&spa_namespace_lock);
 
 				if (!error) {
 					error = spa_open_rewind(target, &spa,
 					    FTAG, policy, NULL);
 				}
 			}
 		} else if (strpbrk(target, "#") != NULL) {
 			dsl_pool_t *dp;
 			error = dsl_pool_hold(target, FTAG, &dp);
 			if (error != 0) {
 				fatal("can't dump '%s': %s", target,
 				    strerror(error));
 			}
 			error = dump_bookmark(dp, target, B_TRUE, verbose > 1);
 			dsl_pool_rele(dp, FTAG);
 			if (error != 0) {
 				fatal("can't dump '%s': %s", target,
 				    strerror(error));
 			}
 			return (error);
 		} else {
 			target_pool = strdup(target);
 			if (strpbrk(target, "/@") != NULL)
 				*strpbrk(target_pool, "/@") = '\0';
 
 			zdb_set_skip_mmp(target);
 			/*
 			 * If -N was supplied, the user has indicated that
 			 * zdb -d <pool>/<objsetID> is in effect.  Otherwise
 			 * we first assume that the dataset string is the
 			 * dataset name.  If dmu_objset_hold fails with the
 			 * dataset string, and we have an objset_id, retry the
 			 * lookup with the objsetID.
 			 */
 			boolean_t retry = B_TRUE;
 retry_lookup:
 			if (dataset_lookup == B_TRUE) {
 				/*
 				 * Use the supplied id to get the name
 				 * for open_objset.
 				 */
 				error = spa_open(target_pool, &spa, FTAG);
 				if (error == 0) {
 					error = name_from_objset_id(spa,
 					    objset_id, dsname);
 					spa_close(spa, FTAG);
 					if (error == 0)
 						target = dsname;
 				}
 			}
 			if (error == 0) {
 				if (objset_id > 0 && retry) {
 					int err = dmu_objset_hold(target, FTAG,
 					    &os);
 					if (err) {
 						dataset_lookup = B_TRUE;
 						retry = B_FALSE;
 						goto retry_lookup;
 					} else {
 						dmu_objset_rele(os, FTAG);
 					}
 				}
 				error = open_objset(target, FTAG, &os);
 			}
 			if (error == 0)
 				spa = dmu_objset_spa(os);
 			free(target_pool);
 		}
 	}
 	nvlist_free(policy);
 
 	if (error)
 		fatal("can't open '%s': %s", target, strerror(error));
 
 	/*
 	 * Set the pool failure mode to panic in order to prevent the pool
 	 * from suspending.  A suspended I/O will have no way to resume and
 	 * can prevent the zdb(8) command from terminating as expected.
 	 */
 	if (spa != NULL)
 		spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
 
 	argv++;
 	argc--;
 	if (dump_opt['r']) {
 		error = zdb_copy_object(os, object, argv[1]);
 	} else if (!dump_opt['R']) {
 		flagbits['d'] = ZOR_FLAG_DIRECTORY;
 		flagbits['f'] = ZOR_FLAG_PLAIN_FILE;
 		flagbits['m'] = ZOR_FLAG_SPACE_MAP;
 		flagbits['z'] = ZOR_FLAG_ZAP;
 		flagbits['A'] = ZOR_FLAG_ALL_TYPES;
 
 		if (argc > 0 && dump_opt['d']) {
 			zopt_object_args = argc;
 			zopt_object_ranges = calloc(zopt_object_args,
 			    sizeof (zopt_object_range_t));
 			for (unsigned i = 0; i < zopt_object_args; i++) {
 				int err;
 				const char *msg = NULL;
 
 				err = parse_object_range(argv[i],
 				    &zopt_object_ranges[i], &msg);
 				if (err != 0)
 					fatal("Bad object or range: '%s': %s\n",
 					    argv[i], msg ?: "");
 			}
 		} else if (argc > 0 && dump_opt['m']) {
 			zopt_metaslab_args = argc;
 			zopt_metaslab = calloc(zopt_metaslab_args,
 			    sizeof (uint64_t));
 			for (unsigned i = 0; i < zopt_metaslab_args; i++) {
 				errno = 0;
 				zopt_metaslab[i] = strtoull(argv[i], NULL, 0);
 				if (zopt_metaslab[i] == 0 && errno != 0)
 					fatal("bad number %s: %s", argv[i],
 					    strerror(errno));
 			}
 		}
 		if (dump_opt['B']) {
 			dump_backup(target, objset_id,
 			    argc > 0 ? argv[0] : NULL);
 		} else if (os != NULL) {
 			dump_objset(os);
 		} else if (zopt_object_args > 0 && !dump_opt['m']) {
 			dump_objset(spa->spa_meta_objset);
 		} else {
 			dump_zpool(spa);
 		}
 	} else {
 		flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
 		flagbits['c'] = ZDB_FLAG_CHECKSUM;
 		flagbits['d'] = ZDB_FLAG_DECOMPRESS;
 		flagbits['e'] = ZDB_FLAG_BSWAP;
 		flagbits['g'] = ZDB_FLAG_GBH;
 		flagbits['i'] = ZDB_FLAG_INDIRECT;
 		flagbits['r'] = ZDB_FLAG_RAW;
 		flagbits['v'] = ZDB_FLAG_VERBOSE;
 
 		for (int i = 0; i < argc; i++)
 			zdb_read_block(argv[i], spa);
 	}
 
 	if (dump_opt['k']) {
 		free(checkpoint_pool);
 		if (!target_is_spa)
 			free(checkpoint_target);
 	}
 
 	if (os != NULL) {
 		close_objset(os, FTAG);
 	} else {
 		spa_close(spa, FTAG);
 	}
 
 	fuid_table_destroy();
 
 	dump_debug_buffer();
 
 	kernel_fini();
 
 	return (error);
 }
diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c
index 63d95ddedc3b..e3caaeb70e14 100644
--- a/cmd/zdb/zdb_il.c
+++ b/cmd/zdb/zdb_il.c
@@ -1,541 +1,541 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Copyright (c) 2012 Cyril Plisko. All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
  * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
  */
 
 /*
  * Print intent log header and statistics.
  */
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <ctype.h>
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/stat.h>
 #include <sys/resource.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
 #include <sys/spa_impl.h>
 #include <sys/abd.h>
 
 #include "zdb.h"
 
 extern uint8_t dump_opt[256];
 
 static char tab_prefix[4] = "\t\t\t";
 
 static void
 print_log_bp(const blkptr_t *bp, const char *prefix)
 {
 	char blkbuf[BP_SPRINTF_LEN];
 
 	snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 	(void) printf("%s%s\n", prefix, blkbuf);
 }
 
 static void
 zil_prt_rec_create(zilog_t *zilog, int txtype, const void *arg)
 {
 	(void) zilog;
 	const lr_create_t *lr = arg;
 	time_t crtime = lr->lr_crtime[0];
 	char *name, *link;
 	lr_attr_t *lrattr;
 
 	name = (char *)(lr + 1);
 
 	if (lr->lr_common.lrc_txtype == TX_CREATE_ATTR ||
 	    lr->lr_common.lrc_txtype == TX_MKDIR_ATTR) {
 		lrattr = (lr_attr_t *)(lr + 1);
 		name += ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
 	}
 
 	if (txtype == TX_SYMLINK) {
 		link = name + strlen(name) + 1;
 		(void) printf("%s%s -> %s\n", tab_prefix, name, link);
 	} else if (txtype != TX_MKXATTR) {
 		(void) printf("%s%s\n", tab_prefix, name);
 	}
 
 	(void) printf("%s%s", tab_prefix, ctime(&crtime));
 	(void) printf("%sdoid %llu, foid %llu, slots %llu, mode %llo\n",
 	    tab_prefix, (u_longlong_t)lr->lr_doid,
 	    (u_longlong_t)LR_FOID_GET_OBJ(lr->lr_foid),
 	    (u_longlong_t)LR_FOID_GET_SLOTS(lr->lr_foid),
 	    (longlong_t)lr->lr_mode);
 	(void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n",
 	    tab_prefix,
 	    (u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid,
 	    (u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev);
 }
 
 static void
 zil_prt_rec_remove(zilog_t *zilog, int txtype, const void *arg)
 {
 	(void) zilog, (void) txtype;
 	const lr_remove_t *lr = arg;
 
 	(void) printf("%sdoid %llu, name %s\n", tab_prefix,
 	    (u_longlong_t)lr->lr_doid, (char *)(lr + 1));
 }
 
 static void
 zil_prt_rec_link(zilog_t *zilog, int txtype, const void *arg)
 {
 	(void) zilog, (void) txtype;
 	const lr_link_t *lr = arg;
 
 	(void) printf("%sdoid %llu, link_obj %llu, name %s\n", tab_prefix,
 	    (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_link_obj,
 	    (char *)(lr + 1));
 }
 
 static void
 zil_prt_rec_rename(zilog_t *zilog, int txtype, const void *arg)
 {
 	(void) zilog, (void) txtype;
 	const lr_rename_t *lr = arg;
 	char *snm = (char *)(lr + 1);
 	char *tnm = snm + strlen(snm) + 1;
 
 	(void) printf("%ssdoid %llu, tdoid %llu\n", tab_prefix,
 	    (u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid);
 	(void) printf("%ssrc %s tgt %s\n", tab_prefix, snm, tnm);
 	switch (txtype) {
 	case TX_RENAME_EXCHANGE:
 		(void) printf("%sflags RENAME_EXCHANGE\n", tab_prefix);
 		break;
 	case TX_RENAME_WHITEOUT:
 		(void) printf("%sflags RENAME_WHITEOUT\n", tab_prefix);
 		break;
 	}
 }
 
 static int
 zil_prt_rec_write_cb(void *data, size_t len, void *unused)
 {
 	(void) unused;
 	char *cdata = data;
 
 	for (size_t i = 0; i < len; i++) {
 		if (isprint(*cdata))
 			(void) printf("%c ", *cdata);
 		else
 			(void) printf("%2X", *cdata);
 		cdata++;
 	}
 	return (0);
 }
 
 static void
 zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg)
 {
 	const lr_write_t *lr = arg;
 	abd_t *data;
 	const blkptr_t *bp = &lr->lr_blkptr;
 	zbookmark_phys_t zb;
 	int verbose = MAX(dump_opt['d'], dump_opt['i']);
 	int error;
 
 	(void) printf("%sfoid %llu, offset %llx, length %llx\n", tab_prefix,
 	    (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
 	    (u_longlong_t)lr->lr_length);
 
 	if (txtype == TX_WRITE2 || verbose < 4)
 		return;
 
 	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
 		(void) printf("%shas blkptr, %s\n", tab_prefix,
-		    !BP_IS_HOLE(bp) &&
-		    bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ?
+		    !BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) >=
+		    spa_min_claim_txg(zilog->zl_spa) ?
 		    "will claim" : "won't claim");
 		print_log_bp(bp, tab_prefix);
 
 		if (verbose < 5)
 			return;
 		if (BP_IS_HOLE(bp)) {
 			(void) printf("\t\t\tLSIZE 0x%llx\n",
 			    (u_longlong_t)BP_GET_LSIZE(bp));
 			(void) printf("%s<hole>\n", tab_prefix);
 			return;
 		}
-		if (bp->blk_birth < zilog->zl_header->zh_claim_txg) {
+		if (BP_GET_LOGICAL_BIRTH(bp) < zilog->zl_header->zh_claim_txg) {
 			(void) printf("%s<block already committed>\n",
 			    tab_prefix);
 			return;
 		}
 
 		ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
 		SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os),
 		    lr->lr_foid, ZB_ZIL_LEVEL,
 		    lr->lr_offset / BP_GET_LSIZE(bp));
 
 		data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE);
 		error = zio_wait(zio_read(NULL, zilog->zl_spa,
 		    bp, data, BP_GET_LSIZE(bp), NULL, NULL,
 		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
 		if (error)
 			goto out;
 	} else {
 		if (verbose < 5)
 			return;
 
 		/* data is stored after the end of the lr_write record */
 		data = abd_alloc(lr->lr_length, B_FALSE);
 		abd_copy_from_buf(data, lr + 1, lr->lr_length);
 	}
 
 	(void) printf("%s", tab_prefix);
 	(void) abd_iterate_func(data,
 	    0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)),
 	    zil_prt_rec_write_cb, NULL);
 	(void) printf("\n");
 
 out:
 	abd_free(data);
 }
 
 static void
 zil_prt_rec_write_enc(zilog_t *zilog, int txtype, const void *arg)
 {
 	(void) txtype;
 	const lr_write_t *lr = arg;
 	const blkptr_t *bp = &lr->lr_blkptr;
 	int verbose = MAX(dump_opt['d'], dump_opt['i']);
 
 	(void) printf("%s(encrypted)\n", tab_prefix);
 
 	if (verbose < 4)
 		return;
 
 	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
 		(void) printf("%shas blkptr, %s\n", tab_prefix,
-		    !BP_IS_HOLE(bp) &&
-		    bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ?
+		    !BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) >=
+		    spa_min_claim_txg(zilog->zl_spa) ?
 		    "will claim" : "won't claim");
 		print_log_bp(bp, tab_prefix);
 	}
 }
 
 static void
 zil_prt_rec_truncate(zilog_t *zilog, int txtype, const void *arg)
 {
 	(void) zilog, (void) txtype;
 	const lr_truncate_t *lr = arg;
 
 	(void) printf("%sfoid %llu, offset 0x%llx, length 0x%llx\n", tab_prefix,
 	    (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset,
 	    (u_longlong_t)lr->lr_length);
 }
 
 static void
 zil_prt_rec_setattr(zilog_t *zilog, int txtype, const void *arg)
 {
 	(void) zilog, (void) txtype;
 	const lr_setattr_t *lr = arg;
 	time_t atime = (time_t)lr->lr_atime[0];
 	time_t mtime = (time_t)lr->lr_mtime[0];
 
 	(void) printf("%sfoid %llu, mask 0x%llx\n", tab_prefix,
 	    (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_mask);
 
 	if (lr->lr_mask & AT_MODE) {
 		(void) printf("%sAT_MODE  %llo\n", tab_prefix,
 		    (longlong_t)lr->lr_mode);
 	}
 
 	if (lr->lr_mask & AT_UID) {
 		(void) printf("%sAT_UID   %llu\n", tab_prefix,
 		    (u_longlong_t)lr->lr_uid);
 	}
 
 	if (lr->lr_mask & AT_GID) {
 		(void) printf("%sAT_GID   %llu\n", tab_prefix,
 		    (u_longlong_t)lr->lr_gid);
 	}
 
 	if (lr->lr_mask & AT_SIZE) {
 		(void) printf("%sAT_SIZE  %llu\n", tab_prefix,
 		    (u_longlong_t)lr->lr_size);
 	}
 
 	if (lr->lr_mask & AT_ATIME) {
 		(void) printf("%sAT_ATIME %llu.%09llu %s", tab_prefix,
 		    (u_longlong_t)lr->lr_atime[0],
 		    (u_longlong_t)lr->lr_atime[1],
 		    ctime(&atime));
 	}
 
 	if (lr->lr_mask & AT_MTIME) {
 		(void) printf("%sAT_MTIME %llu.%09llu %s", tab_prefix,
 		    (u_longlong_t)lr->lr_mtime[0],
 		    (u_longlong_t)lr->lr_mtime[1],
 		    ctime(&mtime));
 	}
 }
 
 static void
 zil_prt_rec_setsaxattr(zilog_t *zilog, int txtype, const void *arg)
 {
 	(void) zilog, (void) txtype;
 	const lr_setsaxattr_t *lr = arg;
 
 	char *name = (char *)(lr + 1);
 	(void) printf("%sfoid %llu\n", tab_prefix,
 	    (u_longlong_t)lr->lr_foid);
 
 	(void) printf("%sXAT_NAME  %s\n", tab_prefix, name);
 	if (lr->lr_size == 0) {
 		(void) printf("%sXAT_VALUE  NULL\n", tab_prefix);
 	} else {
 		(void) printf("%sXAT_VALUE  ", tab_prefix);
 		char *val = name + (strlen(name) + 1);
 		for (int i = 0; i < lr->lr_size; i++) {
 			(void) printf("%c", *val);
 			val++;
 		}
 	}
 }
 
 static void
 zil_prt_rec_acl(zilog_t *zilog, int txtype, const void *arg)
 {
 	(void) zilog, (void) txtype;
 	const lr_acl_t *lr = arg;
 
 	(void) printf("%sfoid %llu, aclcnt %llu\n", tab_prefix,
 	    (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt);
 }
 
 static void
 zil_prt_rec_clone_range(zilog_t *zilog, int txtype, const void *arg)
 {
 	(void) zilog, (void) txtype;
 	const lr_clone_range_t *lr = arg;
 	int verbose = MAX(dump_opt['d'], dump_opt['i']);
 
 	(void) printf("%sfoid %llu, offset %llx, length %llx, blksize %llx\n",
 	    tab_prefix, (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
 	    (u_longlong_t)lr->lr_length, (u_longlong_t)lr->lr_blksz);
 
 	if (verbose < 4)
 		return;
 
 	for (unsigned int i = 0; i < lr->lr_nbps; i++) {
 		(void) printf("%s[%u/%llu] ", tab_prefix, i + 1,
 		    (u_longlong_t)lr->lr_nbps);
 		print_log_bp(&lr->lr_bps[i], "");
 	}
 }
 
 static void
 zil_prt_rec_clone_range_enc(zilog_t *zilog, int txtype, const void *arg)
 {
 	(void) zilog, (void) txtype;
 	const lr_clone_range_t *lr = arg;
 	int verbose = MAX(dump_opt['d'], dump_opt['i']);
 
 	(void) printf("%s(encrypted)\n", tab_prefix);
 
 	if (verbose < 4)
 		return;
 
 	for (unsigned int i = 0; i < lr->lr_nbps; i++) {
 		(void) printf("%s[%u/%llu] ", tab_prefix, i + 1,
 		    (u_longlong_t)lr->lr_nbps);
 		print_log_bp(&lr->lr_bps[i], "");
 	}
 }
 
 typedef void (*zil_prt_rec_func_t)(zilog_t *, int, const void *);
 typedef struct zil_rec_info {
 	zil_prt_rec_func_t	zri_print;
 	zil_prt_rec_func_t	zri_print_enc;
 	const char		*zri_name;
 	uint64_t		zri_count;
 } zil_rec_info_t;
 
 static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
 	{.zri_print = NULL,		    .zri_name = "Total              "},
 	{.zri_print = zil_prt_rec_create,   .zri_name = "TX_CREATE          "},
 	{.zri_print = zil_prt_rec_create,   .zri_name = "TX_MKDIR           "},
 	{.zri_print = zil_prt_rec_create,   .zri_name = "TX_MKXATTR         "},
 	{.zri_print = zil_prt_rec_create,   .zri_name = "TX_SYMLINK         "},
 	{.zri_print = zil_prt_rec_remove,   .zri_name = "TX_REMOVE          "},
 	{.zri_print = zil_prt_rec_remove,   .zri_name = "TX_RMDIR           "},
 	{.zri_print = zil_prt_rec_link,	    .zri_name = "TX_LINK            "},
 	{.zri_print = zil_prt_rec_rename,   .zri_name = "TX_RENAME          "},
 	{.zri_print = zil_prt_rec_write,
 	    .zri_print_enc = zil_prt_rec_write_enc,
 	    .zri_name = "TX_WRITE           "},
 	{.zri_print = zil_prt_rec_truncate, .zri_name = "TX_TRUNCATE        "},
 	{.zri_print = zil_prt_rec_setattr,  .zri_name = "TX_SETATTR         "},
 	{.zri_print = zil_prt_rec_acl,	    .zri_name = "TX_ACL_V0          "},
 	{.zri_print = zil_prt_rec_acl,	    .zri_name = "TX_ACL_ACL         "},
 	{.zri_print = zil_prt_rec_create,   .zri_name = "TX_CREATE_ACL      "},
 	{.zri_print = zil_prt_rec_create,   .zri_name = "TX_CREATE_ATTR     "},
 	{.zri_print = zil_prt_rec_create,   .zri_name = "TX_CREATE_ACL_ATTR "},
 	{.zri_print = zil_prt_rec_create,   .zri_name = "TX_MKDIR_ACL       "},
 	{.zri_print = zil_prt_rec_create,   .zri_name = "TX_MKDIR_ATTR      "},
 	{.zri_print = zil_prt_rec_create,   .zri_name = "TX_MKDIR_ACL_ATTR  "},
 	{.zri_print = zil_prt_rec_write,    .zri_name = "TX_WRITE2          "},
 	{.zri_print = zil_prt_rec_setsaxattr,
 	    .zri_name = "TX_SETSAXATTR      "},
 	{.zri_print = zil_prt_rec_rename,   .zri_name = "TX_RENAME_EXCHANGE "},
 	{.zri_print = zil_prt_rec_rename,   .zri_name = "TX_RENAME_WHITEOUT "},
 	{.zri_print = zil_prt_rec_clone_range,
 	    .zri_print_enc = zil_prt_rec_clone_range_enc,
 	    .zri_name = "TX_CLONE_RANGE     "},
 };
 
 static int
 print_log_record(zilog_t *zilog, const lr_t *lr, void *arg, uint64_t claim_txg)
 {
 	(void) arg, (void) claim_txg;
 	int txtype;
 	int verbose = MAX(dump_opt['d'], dump_opt['i']);
 
 	/* reduce size of txtype to strip off TX_CI bit */
 	txtype = lr->lrc_txtype;
 
 	ASSERT(txtype != 0 && (uint_t)txtype < TX_MAX_TYPE);
 	ASSERT(lr->lrc_txg);
 
 	(void) printf("\t\t%s%s len %6llu, txg %llu, seq %llu\n",
 	    (lr->lrc_txtype & TX_CI) ? "CI-" : "",
 	    zil_rec_info[txtype].zri_name,
 	    (u_longlong_t)lr->lrc_reclen,
 	    (u_longlong_t)lr->lrc_txg,
 	    (u_longlong_t)lr->lrc_seq);
 
 	if (txtype && verbose >= 3) {
 		if (!zilog->zl_os->os_encrypted) {
 			zil_rec_info[txtype].zri_print(zilog, txtype, lr);
 		} else if (zil_rec_info[txtype].zri_print_enc) {
 			zil_rec_info[txtype].zri_print_enc(zilog, txtype, lr);
 		} else {
 			(void) printf("%s(encrypted)\n", tab_prefix);
 		}
 	}
 
 	zil_rec_info[txtype].zri_count++;
 	zil_rec_info[0].zri_count++;
 
 	return (0);
 }
 
 static int
 print_log_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
     uint64_t claim_txg)
 {
 	(void) arg;
 	char blkbuf[BP_SPRINTF_LEN + 10];
 	int verbose = MAX(dump_opt['d'], dump_opt['i']);
 	const char *claim;
 
 	if (verbose <= 3)
 		return (0);
 
 	if (verbose >= 5) {
 		(void) strcpy(blkbuf, ", ");
 		snprintf_blkptr(blkbuf + strlen(blkbuf),
 		    sizeof (blkbuf) - strlen(blkbuf), bp);
 	} else {
 		blkbuf[0] = '\0';
 	}
 
 	if (claim_txg != 0)
 		claim = "already claimed";
-	else if (bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa))
+	else if (BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(zilog->zl_spa))
 		claim = "will claim";
 	else
 		claim = "won't claim";
 
 	(void) printf("\tBlock seqno %llu, %s%s\n",
 	    (u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], claim, blkbuf);
 
 	return (0);
 }
 
 static void
 print_log_stats(int verbose)
 {
 	unsigned i, w, p10;
 
 	if (verbose > 3)
 		(void) printf("\n");
 
 	if (zil_rec_info[0].zri_count == 0)
 		return;
 
 	for (w = 1, p10 = 10; zil_rec_info[0].zri_count >= p10; p10 *= 10)
 		w++;
 
 	for (i = 0; i < TX_MAX_TYPE; i++)
 		if (zil_rec_info[i].zri_count || verbose >= 3)
 			(void) printf("\t\t%s %*llu\n",
 			    zil_rec_info[i].zri_name, w,
 			    (u_longlong_t)zil_rec_info[i].zri_count);
 	(void) printf("\n");
 }
 
 void
 dump_intent_log(zilog_t *zilog)
 {
 	const zil_header_t *zh = zilog->zl_header;
 	int verbose = MAX(dump_opt['d'], dump_opt['i']);
 	int i;
 
 	if (BP_IS_HOLE(&zh->zh_log) || verbose < 1)
 		return;
 
 	(void) printf("\n    ZIL header: claim_txg %llu, "
 	    "claim_blk_seq %llu, claim_lr_seq %llu",
 	    (u_longlong_t)zh->zh_claim_txg,
 	    (u_longlong_t)zh->zh_claim_blk_seq,
 	    (u_longlong_t)zh->zh_claim_lr_seq);
 	(void) printf(" replay_seq %llu, flags 0x%llx\n",
 	    (u_longlong_t)zh->zh_replay_seq, (u_longlong_t)zh->zh_flags);
 
 	for (i = 0; i < TX_MAX_TYPE; i++)
 		zil_rec_info[i].zri_count = 0;
 
 	/* see comment in zil_claim() or zil_check_log_chain() */
 	if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
 	    zh->zh_claim_txg == 0)
 		return;
 
 	if (verbose >= 2) {
 		(void) printf("\n");
 		(void) zil_parse(zilog, print_log_block, print_log_record, NULL,
 		    zh->zh_claim_txg, B_FALSE);
 		print_log_stats(verbose);
 	}
 }
diff --git a/cmd/zhack.c b/cmd/zhack.c
index 44611887dd25..f15a6ece538c 100644
--- a/cmd/zhack.c
+++ b/cmd/zhack.c
@@ -1,1026 +1,1026 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  */
 
 /*
  * zhack is a debugging tool that can write changes to ZFS pool using libzpool
  * for testing purposes. Altering pools with zhack is unsupported and may
  * result in corrupted pools.
  */
 
 #include <zfs_prop.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <ctype.h>
 #include <sys/stat.h>
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/zfs_znode.h>
 #include <sys/dsl_synctask.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_pool.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/zfeature.h>
 #include <sys/dmu_tx.h>
 #include <zfeature_common.h>
 #include <libzutil.h>
 
 static importargs_t g_importargs;
 static char *g_pool;
 static boolean_t g_readonly;
 
 typedef enum {
 	ZHACK_REPAIR_OP_UNKNOWN  = 0,
 	ZHACK_REPAIR_OP_CKSUM    = (1 << 0),
 	ZHACK_REPAIR_OP_UNDETACH = (1 << 1)
 } zhack_repair_op_t;
 
 static __attribute__((noreturn)) void
 usage(void)
 {
 	(void) fprintf(stderr,
 	    "Usage: zhack [-c cachefile] [-d dir] <subcommand> <args> ...\n"
 	    "where <subcommand> <args> is one of the following:\n"
 	    "\n");
 
 	(void) fprintf(stderr,
 	    "    feature stat <pool>\n"
 	    "        print information about enabled features\n"
 	    "    feature enable [-r] [-d desc] <pool> <feature>\n"
 	    "        add a new enabled feature to the pool\n"
 	    "        -d <desc> sets the feature's description\n"
 	    "        -r set read-only compatible flag for feature\n"
 	    "    feature ref [-md] <pool> <feature>\n"
 	    "        change the refcount on the given feature\n"
 	    "        -d decrease instead of increase the refcount\n"
 	    "        -m add the feature to the label if increasing refcount\n"
 	    "\n"
 	    "    <feature> : should be a feature guid\n"
 	    "\n"
 	    "    label repair <device>\n"
 	    "        repair labels of a specified device according to options\n"
 	    "        which may be combined to do their functions in one call\n"
 	    "        -c repair corrupted label checksums\n"
 	    "        -u restore the label on a detached device\n"
 	    "\n"
 	    "    <device> : path to vdev\n");
 	exit(1);
 }
 
 
 static __attribute__((format(printf, 3, 4))) __attribute__((noreturn)) void
 fatal(spa_t *spa, const void *tag, const char *fmt, ...)
 {
 	va_list ap;
 
 	if (spa != NULL) {
 		spa_close(spa, tag);
 		(void) spa_export(g_pool, NULL, B_TRUE, B_FALSE);
 	}
 
 	va_start(ap, fmt);
 	(void) fputs("zhack: ", stderr);
 	(void) vfprintf(stderr, fmt, ap);
 	va_end(ap);
 	(void) fputc('\n', stderr);
 
 	exit(1);
 }
 
 static int
 space_delta_cb(dmu_object_type_t bonustype, const void *data,
     zfs_file_info_t *zoi)
 {
 	(void) data, (void) zoi;
 
 	/*
 	 * Is it a valid type of object to track?
 	 */
 	if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
 		return (ENOENT);
 	(void) fprintf(stderr, "modifying object that needs user accounting");
 	abort();
 }
 
 /*
  * Target is the dataset whose pool we want to open.
  */
 static void
 zhack_import(char *target, boolean_t readonly)
 {
 	nvlist_t *config;
 	nvlist_t *props;
 	int error;
 
 	kernel_init(readonly ? SPA_MODE_READ :
 	    (SPA_MODE_READ | SPA_MODE_WRITE));
 
 	dmu_objset_register_type(DMU_OST_ZFS, space_delta_cb);
 
 	g_readonly = readonly;
 	g_importargs.can_be_active = readonly;
 	g_pool = strdup(target);
 
 	libpc_handle_t lpch = {
 		.lpc_lib_handle = NULL,
 		.lpc_ops = &libzpool_config_ops,
 		.lpc_printerr = B_TRUE
 	};
 	error = zpool_find_config(&lpch, target, &config, &g_importargs);
 	if (error)
 		fatal(NULL, FTAG, "cannot import '%s'", target);
 
 	props = NULL;
 	if (readonly) {
 		VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
 		VERIFY(nvlist_add_uint64(props,
 		    zpool_prop_to_name(ZPOOL_PROP_READONLY), 1) == 0);
 	}
 
 	zfeature_checks_disable = B_TRUE;
 	error = spa_import(target, config, props,
 	    (readonly ?  ZFS_IMPORT_SKIP_MMP : ZFS_IMPORT_NORMAL));
 	fnvlist_free(config);
 	zfeature_checks_disable = B_FALSE;
 	if (error == EEXIST)
 		error = 0;
 
 	if (error)
 		fatal(NULL, FTAG, "can't import '%s': %s", target,
 		    strerror(error));
 }
 
 static void
 zhack_spa_open(char *target, boolean_t readonly, const void *tag, spa_t **spa)
 {
 	int err;
 
 	zhack_import(target, readonly);
 
 	zfeature_checks_disable = B_TRUE;
 	err = spa_open(target, spa, tag);
 	zfeature_checks_disable = B_FALSE;
 
 	if (err != 0)
 		fatal(*spa, FTAG, "cannot open '%s': %s", target,
 		    strerror(err));
 	if (spa_version(*spa) < SPA_VERSION_FEATURES) {
 		fatal(*spa, FTAG, "'%s' has version %d, features not enabled",
 		    target, (int)spa_version(*spa));
 	}
 }
 
 static void
 dump_obj(objset_t *os, uint64_t obj, const char *name)
 {
 	zap_cursor_t zc;
 	zap_attribute_t za;
 
 	(void) printf("%s_obj:\n", name);
 
 	for (zap_cursor_init(&zc, os, obj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		if (za.za_integer_length == 8) {
 			ASSERT(za.za_num_integers == 1);
 			(void) printf("\t%s = %llu\n",
 			    za.za_name, (u_longlong_t)za.za_first_integer);
 		} else {
 			ASSERT(za.za_integer_length == 1);
 			char val[1024];
 			VERIFY(zap_lookup(os, obj, za.za_name,
 			    1, sizeof (val), val) == 0);
 			(void) printf("\t%s = %s\n", za.za_name, val);
 		}
 	}
 	zap_cursor_fini(&zc);
 }
 
 static void
 dump_mos(spa_t *spa)
 {
 	nvlist_t *nv = spa->spa_label_features;
 	nvpair_t *pair;
 
 	(void) printf("label config:\n");
 	for (pair = nvlist_next_nvpair(nv, NULL);
 	    pair != NULL;
 	    pair = nvlist_next_nvpair(nv, pair)) {
 		(void) printf("\t%s\n", nvpair_name(pair));
 	}
 }
 
 static void
 zhack_do_feature_stat(int argc, char **argv)
 {
 	spa_t *spa;
 	objset_t *os;
 	char *target;
 
 	argc--;
 	argv++;
 
 	if (argc < 1) {
 		(void) fprintf(stderr, "error: missing pool name\n");
 		usage();
 	}
 	target = argv[0];
 
 	zhack_spa_open(target, B_TRUE, FTAG, &spa);
 	os = spa->spa_meta_objset;
 
 	dump_obj(os, spa->spa_feat_for_read_obj, "for_read");
 	dump_obj(os, spa->spa_feat_for_write_obj, "for_write");
 	dump_obj(os, spa->spa_feat_desc_obj, "descriptions");
 	if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
 		dump_obj(os, spa->spa_feat_enabled_txg_obj, "enabled_txg");
 	}
 	dump_mos(spa);
 
 	spa_close(spa, FTAG);
 }
 
 static void
 zhack_feature_enable_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	zfeature_info_t *feature = arg;
 
 	feature_enable_sync(spa, feature, tx);
 
 	spa_history_log_internal(spa, "zhack enable feature", tx,
 	    "name=%s flags=%u",
 	    feature->fi_guid, feature->fi_flags);
 }
 
 static void
 zhack_do_feature_enable(int argc, char **argv)
 {
 	int c;
 	char *desc, *target;
 	spa_t *spa;
 	objset_t *mos;
 	zfeature_info_t feature;
 	const spa_feature_t nodeps[] = { SPA_FEATURE_NONE };
 
 	/*
 	 * Features are not added to the pool's label until their refcounts
 	 * are incremented, so fi_mos can just be left as false for now.
 	 */
 	desc = NULL;
 	feature.fi_uname = "zhack";
 	feature.fi_flags = 0;
 	feature.fi_depends = nodeps;
 	feature.fi_feature = SPA_FEATURE_NONE;
 
 	optind = 1;
 	while ((c = getopt(argc, argv, "+rd:")) != -1) {
 		switch (c) {
 		case 'r':
 			feature.fi_flags |= ZFEATURE_FLAG_READONLY_COMPAT;
 			break;
 		case 'd':
 			if (desc != NULL)
 				free(desc);
 			desc = strdup(optarg);
 			break;
 		default:
 			usage();
 			break;
 		}
 	}
 
 	if (desc == NULL)
 		desc = strdup("zhack injected");
 	feature.fi_desc = desc;
 
 	argc -= optind;
 	argv += optind;
 
 	if (argc < 2) {
 		(void) fprintf(stderr, "error: missing feature or pool name\n");
 		usage();
 	}
 	target = argv[0];
 	feature.fi_guid = argv[1];
 
 	if (!zfeature_is_valid_guid(feature.fi_guid))
 		fatal(NULL, FTAG, "invalid feature guid: %s", feature.fi_guid);
 
 	zhack_spa_open(target, B_FALSE, FTAG, &spa);
 	mos = spa->spa_meta_objset;
 
 	if (zfeature_is_supported(feature.fi_guid))
 		fatal(spa, FTAG, "'%s' is a real feature, will not enable",
 		    feature.fi_guid);
 	if (0 == zap_contains(mos, spa->spa_feat_desc_obj, feature.fi_guid))
 		fatal(spa, FTAG, "feature already enabled: %s",
 		    feature.fi_guid);
 
 	VERIFY0(dsl_sync_task(spa_name(spa), NULL,
 	    zhack_feature_enable_sync, &feature, 5, ZFS_SPACE_CHECK_NORMAL));
 
 	spa_close(spa, FTAG);
 
 	free(desc);
 }
 
 static void
 feature_incr_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	zfeature_info_t *feature = arg;
 	uint64_t refcount;
 
 	VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount));
 	feature_sync(spa, feature, refcount + 1, tx);
 	spa_history_log_internal(spa, "zhack feature incr", tx,
 	    "name=%s", feature->fi_guid);
 }
 
 static void
 feature_decr_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	zfeature_info_t *feature = arg;
 	uint64_t refcount;
 
 	VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount));
 	feature_sync(spa, feature, refcount - 1, tx);
 	spa_history_log_internal(spa, "zhack feature decr", tx,
 	    "name=%s", feature->fi_guid);
 }
 
 static void
 zhack_do_feature_ref(int argc, char **argv)
 {
 	int c;
 	char *target;
 	boolean_t decr = B_FALSE;
 	spa_t *spa;
 	objset_t *mos;
 	zfeature_info_t feature;
 	const spa_feature_t nodeps[] = { SPA_FEATURE_NONE };
 
 	/*
 	 * fi_desc does not matter here because it was written to disk
 	 * when the feature was enabled, but we need to properly set the
 	 * feature for read or write based on the information we read off
 	 * disk later.
 	 */
 	feature.fi_uname = "zhack";
 	feature.fi_flags = 0;
 	feature.fi_desc = NULL;
 	feature.fi_depends = nodeps;
 	feature.fi_feature = SPA_FEATURE_NONE;
 
 	optind = 1;
 	while ((c = getopt(argc, argv, "+md")) != -1) {
 		switch (c) {
 		case 'm':
 			feature.fi_flags |= ZFEATURE_FLAG_MOS;
 			break;
 		case 'd':
 			decr = B_TRUE;
 			break;
 		default:
 			usage();
 			break;
 		}
 	}
 	argc -= optind;
 	argv += optind;
 
 	if (argc < 2) {
 		(void) fprintf(stderr, "error: missing feature or pool name\n");
 		usage();
 	}
 	target = argv[0];
 	feature.fi_guid = argv[1];
 
 	if (!zfeature_is_valid_guid(feature.fi_guid))
 		fatal(NULL, FTAG, "invalid feature guid: %s", feature.fi_guid);
 
 	zhack_spa_open(target, B_FALSE, FTAG, &spa);
 	mos = spa->spa_meta_objset;
 
 	if (zfeature_is_supported(feature.fi_guid)) {
 		fatal(spa, FTAG,
 		    "'%s' is a real feature, will not change refcount",
 		    feature.fi_guid);
 	}
 
 	if (0 == zap_contains(mos, spa->spa_feat_for_read_obj,
 	    feature.fi_guid)) {
 		feature.fi_flags &= ~ZFEATURE_FLAG_READONLY_COMPAT;
 	} else if (0 == zap_contains(mos, spa->spa_feat_for_write_obj,
 	    feature.fi_guid)) {
 		feature.fi_flags |= ZFEATURE_FLAG_READONLY_COMPAT;
 	} else {
 		fatal(spa, FTAG, "feature is not enabled: %s", feature.fi_guid);
 	}
 
 	if (decr) {
 		uint64_t count;
 		if (feature_get_refcount_from_disk(spa, &feature,
 		    &count) == 0 && count == 0) {
 			fatal(spa, FTAG, "feature refcount already 0: %s",
 			    feature.fi_guid);
 		}
 	}
 
 	VERIFY0(dsl_sync_task(spa_name(spa), NULL,
 	    decr ? feature_decr_sync : feature_incr_sync, &feature,
 	    5, ZFS_SPACE_CHECK_NORMAL));
 
 	spa_close(spa, FTAG);
 }
 
 static int
 zhack_do_feature(int argc, char **argv)
 {
 	char *subcommand;
 
 	argc--;
 	argv++;
 	if (argc == 0) {
 		(void) fprintf(stderr,
 		    "error: no feature operation specified\n");
 		usage();
 	}
 
 	subcommand = argv[0];
 	if (strcmp(subcommand, "stat") == 0) {
 		zhack_do_feature_stat(argc, argv);
 	} else if (strcmp(subcommand, "enable") == 0) {
 		zhack_do_feature_enable(argc, argv);
 	} else if (strcmp(subcommand, "ref") == 0) {
 		zhack_do_feature_ref(argc, argv);
 	} else {
 		(void) fprintf(stderr, "error: unknown subcommand: %s\n",
 		    subcommand);
 		usage();
 	}
 
 	return (0);
 }
 
 #define	ASHIFT_UBERBLOCK_SHIFT(ashift)	\
 	MIN(MAX(ashift, UBERBLOCK_SHIFT), \
 	MAX_UBERBLOCK_SHIFT)
 #define	ASHIFT_UBERBLOCK_SIZE(ashift) \
 	(1ULL << ASHIFT_UBERBLOCK_SHIFT(ashift))
 
 #define	REPAIR_LABEL_STATUS_CKSUM (1 << 0)
 #define	REPAIR_LABEL_STATUS_UB    (1 << 1)
 
 static int
 zhack_repair_read_label(const int fd, vdev_label_t *vl,
     const uint64_t label_offset, const int l)
 {
 	const int err = pread64(fd, vl, sizeof (vdev_label_t), label_offset);
 
 	if (err == -1) {
 		(void) fprintf(stderr,
 		    "error: cannot read label %d: %s\n",
 		    l, strerror(errno));
 		return (err);
 	} else if (err != sizeof (vdev_label_t)) {
 		(void) fprintf(stderr,
 		    "error: bad label %d read size\n", l);
 		return (err);
 	}
 
 	return (0);
 }
 
 static void
 zhack_repair_calc_cksum(const int byteswap, void *data, const uint64_t offset,
     const uint64_t abdsize, zio_eck_t *eck, zio_cksum_t *cksum)
 {
 	zio_cksum_t verifier;
 	zio_cksum_t current_cksum;
 	zio_checksum_info_t *ci;
 	abd_t *abd;
 
 	ZIO_SET_CHECKSUM(&verifier, offset, 0, 0, 0);
 
 	if (byteswap)
 		byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
 
 	current_cksum = eck->zec_cksum;
 	eck->zec_cksum = verifier;
 
 	ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL];
 	abd = abd_get_from_buf(data, abdsize);
 	ci->ci_func[byteswap](abd, abdsize, NULL, cksum);
 	abd_free(abd);
 
 	eck->zec_cksum = current_cksum;
 }
 
 static int
 zhack_repair_check_label(uberblock_t *ub, const int l, const char **cfg_keys,
     const size_t cfg_keys_len, nvlist_t *cfg, nvlist_t *vdev_tree_cfg,
     uint64_t *ashift)
 {
 	int err;
 
 	if (ub->ub_txg != 0) {
 		(void) fprintf(stderr,
 		    "error: label %d: UB TXG of 0 expected, but got %"
 		    PRIu64 "\n",
 		    l, ub->ub_txg);
 		(void) fprintf(stderr, "It would appear the device was not "
 		    "properly removed.\n");
 		return (1);
 	}
 
 	for (int i = 0; i < cfg_keys_len; i++) {
 		uint64_t val;
 		err = nvlist_lookup_uint64(cfg, cfg_keys[i], &val);
 		if (err) {
 			(void) fprintf(stderr,
 			    "error: label %d, %d: "
 			    "cannot find nvlist key %s\n",
 			    l, i, cfg_keys[i]);
 			return (err);
 		}
 	}
 
 	err = nvlist_lookup_nvlist(cfg,
 	    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree_cfg);
 	if (err) {
 		(void) fprintf(stderr,
 		    "error: label %d: cannot find nvlist key %s\n",
 		    l, ZPOOL_CONFIG_VDEV_TREE);
 		return (err);
 	}
 
 	err = nvlist_lookup_uint64(vdev_tree_cfg,
 	    ZPOOL_CONFIG_ASHIFT, ashift);
 	if (err) {
 		(void) fprintf(stderr,
 		    "error: label %d: cannot find nvlist key %s\n",
 		    l, ZPOOL_CONFIG_ASHIFT);
 		return (err);
 	}
 
 	if (*ashift == 0) {
 		(void) fprintf(stderr,
 		    "error: label %d: nvlist key %s is zero\n",
 		    l, ZPOOL_CONFIG_ASHIFT);
 		return (err);
 	}
 
 	return (0);
 }
 
 static int
 zhack_repair_undetach(uberblock_t *ub, nvlist_t *cfg, const int l)
 {
 	/*
 	 * Uberblock root block pointer has valid birth TXG.
 	 * Copying it to the label NVlist
 	 */
-	if (ub->ub_rootbp.blk_birth != 0) {
-		const uint64_t txg = ub->ub_rootbp.blk_birth;
+	if (BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp) != 0) {
+		const uint64_t txg = BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp);
 		ub->ub_txg = txg;
 
 		if (nvlist_remove_all(cfg, ZPOOL_CONFIG_CREATE_TXG) != 0) {
 			(void) fprintf(stderr,
 			    "error: label %d: "
 			    "Failed to remove pool creation TXG\n",
 			    l);
 			return (1);
 		}
 
 		if (nvlist_remove_all(cfg, ZPOOL_CONFIG_POOL_TXG) != 0) {
 			(void) fprintf(stderr,
 			    "error: label %d: Failed to remove pool TXG to "
 			    "be replaced.\n",
 			    l);
 			return (1);
 		}
 
 		if (nvlist_add_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, txg) != 0) {
 			(void) fprintf(stderr,
 			    "error: label %d: "
 			    "Failed to add pool TXG of %" PRIu64 "\n",
 			    l, txg);
 			return (1);
 		}
 	}
 
 	return (0);
 }
 
 static boolean_t
 zhack_repair_write_label(const int l, const int fd, const int byteswap,
     void *data, zio_eck_t *eck, const uint64_t offset, const uint64_t abdsize)
 {
 	zio_cksum_t actual_cksum;
 	zhack_repair_calc_cksum(byteswap, data, offset, abdsize, eck,
 	    &actual_cksum);
 	zio_cksum_t expected_cksum = eck->zec_cksum;
 	ssize_t err;
 
 	if (ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
 		return (B_FALSE);
 
 	eck->zec_cksum = actual_cksum;
 
 	err = pwrite64(fd, data, abdsize, offset);
 	if (err == -1) {
 		(void) fprintf(stderr, "error: cannot write label %d: %s\n",
 		    l, strerror(errno));
 		return (B_FALSE);
 	} else if (err != abdsize) {
 		(void) fprintf(stderr, "error: bad write size label %d\n", l);
 		return (B_FALSE);
 	} else {
 		(void) fprintf(stderr,
 		    "label %d: wrote %" PRIu64 " bytes at offset %" PRIu64 "\n",
 		    l, abdsize, offset);
 	}
 
 	return (B_TRUE);
 }
 
 static void
 zhack_repair_write_uberblock(vdev_label_t *vl, const int l,
     const uint64_t ashift, const int fd, const int byteswap,
     const uint64_t label_offset, uint32_t *labels_repaired)
 {
 	void *ub_data =
 	    (char *)vl + offsetof(vdev_label_t, vl_uberblock);
 	zio_eck_t *ub_eck =
 	    (zio_eck_t *)
 	    ((char *)(ub_data) + (ASHIFT_UBERBLOCK_SIZE(ashift))) - 1;
 
 	if (ub_eck->zec_magic != 0) {
 		(void) fprintf(stderr,
 		    "error: label %d: "
 		    "Expected Uberblock checksum magic number to "
 		    "be 0, but got %" PRIu64 "\n",
 		    l, ub_eck->zec_magic);
 		(void) fprintf(stderr, "It would appear there's already "
 		    "a checksum for the uberblock.\n");
 		return;
 	}
 
 
 	ub_eck->zec_magic = byteswap ? BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC;
 
 	if (zhack_repair_write_label(l, fd, byteswap,
 	    ub_data, ub_eck,
 	    label_offset + offsetof(vdev_label_t, vl_uberblock),
 	    ASHIFT_UBERBLOCK_SIZE(ashift)))
 			labels_repaired[l] |= REPAIR_LABEL_STATUS_UB;
 }
 
 static void
 zhack_repair_print_cksum(FILE *stream, const zio_cksum_t *cksum)
 {
 	(void) fprintf(stream,
 	    "%016llx:%016llx:%016llx:%016llx",
 	    (u_longlong_t)cksum->zc_word[0],
 	    (u_longlong_t)cksum->zc_word[1],
 	    (u_longlong_t)cksum->zc_word[2],
 	    (u_longlong_t)cksum->zc_word[3]);
 }
 
 static int
 zhack_repair_test_cksum(const int byteswap, void *vdev_data,
     zio_eck_t *vdev_eck, const uint64_t vdev_phys_offset, const int l)
 {
 	const zio_cksum_t expected_cksum = vdev_eck->zec_cksum;
 	zio_cksum_t actual_cksum;
 	zhack_repair_calc_cksum(byteswap, vdev_data, vdev_phys_offset,
 	    VDEV_PHYS_SIZE, vdev_eck, &actual_cksum);
 	const uint64_t expected_magic = byteswap ?
 	    BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC;
 	const uint64_t actual_magic = vdev_eck->zec_magic;
 	int err = 0;
 	if (actual_magic != expected_magic) {
 		(void) fprintf(stderr, "error: label %d: "
 		    "Expected "
 		    "the nvlist checksum magic number to not be %"
 		    PRIu64 " not %" PRIu64 "\n",
 		    l, expected_magic, actual_magic);
 		err = ECKSUM;
 	}
 	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
 		(void) fprintf(stderr, "error: label %d: "
 		    "Expected the nvlist checksum to be ", l);
 		(void) zhack_repair_print_cksum(stderr,
 		    &expected_cksum);
 		(void) fprintf(stderr, " not ");
 		zhack_repair_print_cksum(stderr, &actual_cksum);
 		(void) fprintf(stderr, "\n");
 		err = ECKSUM;
 	}
 	return (err);
 }
 
 static void
 zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
     vdev_label_t *vl, const uint64_t label_offset, const int l,
     uint32_t *labels_repaired)
 {
 	ssize_t err;
 	uberblock_t *ub = (uberblock_t *)vl->vl_uberblock;
 	void *vdev_data =
 	    (char *)vl + offsetof(vdev_label_t, vl_vdev_phys);
 	zio_eck_t *vdev_eck =
 	    (zio_eck_t *)((char *)(vdev_data) + VDEV_PHYS_SIZE) - 1;
 	const uint64_t vdev_phys_offset =
 	    label_offset + offsetof(vdev_label_t, vl_vdev_phys);
 	const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION,
 	    ZPOOL_CONFIG_POOL_STATE, ZPOOL_CONFIG_GUID };
 	nvlist_t *cfg;
 	nvlist_t *vdev_tree_cfg = NULL;
 	uint64_t ashift;
 	int byteswap;
 
 	err = zhack_repair_read_label(fd, vl, label_offset, l);
 	if (err)
 		return;
 
 	if (vdev_eck->zec_magic == 0) {
 		(void) fprintf(stderr, "error: label %d: "
 		    "Expected the nvlist checksum magic number to not be zero"
 		    "\n",
 		    l);
 		(void) fprintf(stderr, "There should already be a checksum "
 		    "for the label.\n");
 		return;
 	}
 
 	byteswap =
 	    (vdev_eck->zec_magic == BSWAP_64((uint64_t)ZEC_MAGIC));
 
 	if (byteswap) {
 		byteswap_uint64_array(&vdev_eck->zec_cksum,
 		    sizeof (zio_cksum_t));
 		vdev_eck->zec_magic = BSWAP_64(vdev_eck->zec_magic);
 	}
 
 	if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 &&
 	    zhack_repair_test_cksum(byteswap, vdev_data, vdev_eck,
 	    vdev_phys_offset, l) != 0) {
 		(void) fprintf(stderr, "It would appear checksums are "
 		    "corrupted. Try zhack repair label -c <device>\n");
 		return;
 	}
 
 	err = nvlist_unpack(vl->vl_vdev_phys.vp_nvlist,
 	    VDEV_PHYS_SIZE - sizeof (zio_eck_t), &cfg, 0);
 	if (err) {
 		(void) fprintf(stderr,
 		    "error: cannot unpack nvlist label %d\n", l);
 		return;
 	}
 
 	err = zhack_repair_check_label(ub,
 	    l, cfg_keys, ARRAY_SIZE(cfg_keys), cfg, vdev_tree_cfg, &ashift);
 	if (err)
 		return;
 
 	if ((op & ZHACK_REPAIR_OP_UNDETACH) != 0) {
 		char *buf;
 		size_t buflen;
 
 		err = zhack_repair_undetach(ub, cfg, l);
 		if (err)
 			return;
 
 		buf = vl->vl_vdev_phys.vp_nvlist;
 		buflen = VDEV_PHYS_SIZE - sizeof (zio_eck_t);
 		if (nvlist_pack(cfg, &buf, &buflen, NV_ENCODE_XDR, 0) != 0) {
 			(void) fprintf(stderr,
 			    "error: label %d: Failed to pack nvlist\n", l);
 			return;
 		}
 
 		zhack_repair_write_uberblock(vl,
 		    l, ashift, fd, byteswap, label_offset, labels_repaired);
 	}
 
 	if (zhack_repair_write_label(l, fd, byteswap, vdev_data, vdev_eck,
 	    vdev_phys_offset, VDEV_PHYS_SIZE))
 			labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM;
 
 	fsync(fd);
 }
 
 static const char *
 zhack_repair_label_status(const uint32_t label_status,
     const uint32_t to_check)
 {
 	return ((label_status & to_check) != 0 ? "repaired" : "skipped");
 }
 
 static int
 zhack_label_repair(const zhack_repair_op_t op, const int argc, char **argv)
 {
 	uint32_t labels_repaired[VDEV_LABELS] = {0};
 	vdev_label_t labels[VDEV_LABELS] = {{{0}}};
 	struct stat64 st;
 	int fd;
 	off_t filesize;
 	uint32_t repaired = 0;
 
 	abd_init();
 
 	if (argc < 1) {
 		(void) fprintf(stderr, "error: missing device\n");
 		usage();
 	}
 
 	if ((fd = open(argv[0], O_RDWR)) == -1)
 		fatal(NULL, FTAG, "cannot open '%s': %s", argv[0],
 		    strerror(errno));
 
 	if (fstat64_blk(fd, &st) != 0)
 		fatal(NULL, FTAG, "cannot stat '%s': %s", argv[0],
 		    strerror(errno));
 
 	filesize = st.st_size;
 	(void) fprintf(stderr, "Calculated filesize to be %jd\n",
 	    (intmax_t)filesize);
 
 	if (filesize % sizeof (vdev_label_t) != 0)
 		filesize =
 		    (filesize / sizeof (vdev_label_t)) * sizeof (vdev_label_t);
 
 	for (int l = 0; l < VDEV_LABELS; l++) {
 		zhack_repair_one_label(op, fd, &labels[l],
 		    vdev_label_offset(filesize, l, 0), l, labels_repaired);
 	}
 
 	close(fd);
 
 	abd_fini();
 
 	for (int l = 0; l < VDEV_LABELS; l++) {
 		const uint32_t lr = labels_repaired[l];
 		(void) printf("label %d: ", l);
 		(void) printf("uberblock: %s ",
 		    zhack_repair_label_status(lr, REPAIR_LABEL_STATUS_UB));
 		(void) printf("checksum: %s\n",
 		    zhack_repair_label_status(lr, REPAIR_LABEL_STATUS_CKSUM));
 		repaired |= lr;
 	}
 
 	if (repaired > 0)
 		return (0);
 
 	return (1);
 }
 
 static int
 zhack_do_label_repair(int argc, char **argv)
 {
 	zhack_repair_op_t op = ZHACK_REPAIR_OP_UNKNOWN;
 	int c;
 
 	optind = 1;
 	while ((c = getopt(argc, argv, "+cu")) != -1) {
 		switch (c) {
 		case 'c':
 			op |= ZHACK_REPAIR_OP_CKSUM;
 			break;
 		case 'u':
 			op |= ZHACK_REPAIR_OP_UNDETACH;
 			break;
 		default:
 			usage();
 			break;
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	if (op == ZHACK_REPAIR_OP_UNKNOWN)
 		op = ZHACK_REPAIR_OP_CKSUM;
 
 	return (zhack_label_repair(op, argc, argv));
 }
 
 static int
 zhack_do_label(int argc, char **argv)
 {
 	char *subcommand;
 	int err;
 
 	argc--;
 	argv++;
 	if (argc == 0) {
 		(void) fprintf(stderr,
 		    "error: no label operation specified\n");
 		usage();
 	}
 
 	subcommand = argv[0];
 	if (strcmp(subcommand, "repair") == 0) {
 		err = zhack_do_label_repair(argc, argv);
 	} else {
 		(void) fprintf(stderr, "error: unknown subcommand: %s\n",
 		    subcommand);
 		usage();
 	}
 
 	return (err);
 }
 
 #define	MAX_NUM_PATHS 1024
 
 int
 main(int argc, char **argv)
 {
 	char *path[MAX_NUM_PATHS];
 	const char *subcommand;
 	int rv = 0;
 	int c;
 
 	g_importargs.path = path;
 
 	dprintf_setup(&argc, argv);
 	zfs_prop_init();
 
 	while ((c = getopt(argc, argv, "+c:d:")) != -1) {
 		switch (c) {
 		case 'c':
 			g_importargs.cachefile = optarg;
 			break;
 		case 'd':
 			assert(g_importargs.paths < MAX_NUM_PATHS);
 			g_importargs.path[g_importargs.paths++] = optarg;
 			break;
 		default:
 			usage();
 			break;
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 	optind = 1;
 
 	if (argc == 0) {
 		(void) fprintf(stderr, "error: no command specified\n");
 		usage();
 	}
 
 	subcommand = argv[0];
 
 	if (strcmp(subcommand, "feature") == 0) {
 		rv = zhack_do_feature(argc, argv);
 	} else if (strcmp(subcommand, "label") == 0) {
 		return (zhack_do_label(argc, argv));
 	} else {
 		(void) fprintf(stderr, "error: unknown subcommand: %s\n",
 		    subcommand);
 		usage();
 	}
 
 	if (!g_readonly && spa_export(g_pool, NULL, B_TRUE, B_FALSE) != 0) {
 		fatal(NULL, FTAG, "pool export failed; "
 		    "changes may not be committed to disk\n");
 	}
 
 	kernel_fini();
 
 	return (rv);
 }
diff --git a/include/sys/spa.h b/include/sys/spa.h
index cada3c841037..fb4c93431a31 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -1,1247 +1,1250 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2021 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2017 Joyent, Inc.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2019, Klara Inc.
  * Copyright (c) 2019, Datto Inc.
  */
 
 #ifndef _SYS_SPA_H
 #define	_SYS_SPA_H
 
 #include <sys/avl.h>
 #include <sys/zfs_context.h>
 #include <sys/kstat.h>
 #include <sys/nvpair.h>
 #include <sys/sysmacros.h>
 #include <sys/types.h>
 #include <sys/fs/zfs.h>
 #include <sys/spa_checksum.h>
 #include <sys/dmu.h>
 #include <sys/space_map.h>
 #include <sys/bitops.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Forward references that lots of things need.
  */
 typedef struct spa spa_t;
 typedef struct vdev vdev_t;
 typedef struct metaslab metaslab_t;
 typedef struct metaslab_group metaslab_group_t;
 typedef struct metaslab_class metaslab_class_t;
 typedef struct zio zio_t;
 typedef struct zilog zilog_t;
 typedef struct spa_aux_vdev spa_aux_vdev_t;
 typedef struct zbookmark_phys zbookmark_phys_t;
 typedef struct zbookmark_err_phys zbookmark_err_phys_t;
 
 struct bpobj;
 struct bplist;
 struct dsl_pool;
 struct dsl_dataset;
 struct dsl_crypto_params;
 
 /*
  * Alignment Shift (ashift) is an immutable, internal top-level vdev property
  * which can only be set at vdev creation time. Physical writes are always done
  * according to it, which makes 2^ashift the smallest possible IO on a vdev.
  *
  * We currently allow values ranging from 512 bytes (2^9 = 512) to 64 KiB
  * (2^16 = 65,536).
  */
 #define	ASHIFT_MIN		9
 #define	ASHIFT_MAX		16
 
 /*
  * Size of block to hold the configuration data (a packed nvlist)
  */
 #define	SPA_CONFIG_BLOCKSIZE	(1ULL << 14)
 
 /*
  * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
  * The ASIZE encoding should be at least 64 times larger (6 more bits)
  * to support up to 4-way RAID-Z mirror mode with worst-case gang block
  * overhead, three DVAs per bp, plus one more bit in case we do anything
  * else that expands the ASIZE.
  */
 #define	SPA_LSIZEBITS		16	/* LSIZE up to 32M (2^16 * 512)	*/
 #define	SPA_PSIZEBITS		16	/* PSIZE up to 32M (2^16 * 512)	*/
 #define	SPA_ASIZEBITS		24	/* ASIZE up to 64 times larger	*/
 
 #define	SPA_COMPRESSBITS	7
 #define	SPA_VDEVBITS		24
 #define	SPA_COMPRESSMASK	((1U << SPA_COMPRESSBITS) - 1)
 
 /*
  * All SPA data is represented by 128-bit data virtual addresses (DVAs).
  * The members of the dva_t should be considered opaque outside the SPA.
  */
 typedef struct dva {
 	uint64_t	dva_word[2];
 } dva_t;
 
 
 /*
  * Some checksums/hashes need a 256-bit initialization salt. This salt is kept
  * secret and is suitable for use in MAC algorithms as the key.
  */
 typedef struct zio_cksum_salt {
 	uint8_t		zcs_bytes[32];
 } zio_cksum_salt_t;
 
 /*
  * Each block is described by its DVAs, time of birth, checksum, etc.
  * The word-by-word, bit-by-bit layout of the blkptr is as follows:
  *
  *	64	56	48	40	32	24	16	8	0
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 0	|  pad  |	  vdev1         | GRID  |	  ASIZE		|
+ * 0	|  pad  |	  vdev1         | pad   |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 1	|G|			 offset1				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 2	|  pad  |	  vdev2         | GRID  |	  ASIZE		|
+ * 2	|  pad  |	  vdev2         | pad   |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 3	|G|			 offset2				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 4	|  pad  |	  vdev3         | GRID  |	  ASIZE		|
+ * 4	|  pad  |	  vdev3         | pad   |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 5	|G|			 offset3				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 6	|BDX|lvl| type	| cksum |E| comp|    PSIZE	|     LSIZE	|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 7	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 8	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 9	|			physical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * a	|			logical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * b	|			fill count				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * c	|			checksum[0]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * d	|			checksum[1]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * e	|			checksum[2]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * f	|			checksum[3]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  *
  * Legend:
  *
  * vdev		virtual device ID
  * offset	offset into virtual device
  * LSIZE	logical size
  * PSIZE	physical size (after compression)
  * ASIZE	allocated size (including RAID-Z parity and gang block headers)
- * GRID		RAID-Z layout information (reserved for future use)
  * cksum	checksum function
  * comp		compression function
  * G		gang block indicator
  * B		byteorder (endianness)
  * D		dedup
  * X		encryption
  * E		blkptr_t contains embedded data (see below)
  * lvl		level of indirection
  * type		DMU object type
  * phys birth	txg when dva[0] was written; zero if same as logical birth txg
  *              note that typically all the dva's would be written in this
  *              txg, but they could be different if they were moved by
  *              device removal.
  * log. birth	transaction group in which the block was logically born
  * fill count	number of non-zero blocks under this bp
  * checksum[4]	256-bit checksum of the data this bp describes
  */
 
 /*
  * The blkptr_t's of encrypted blocks also need to store the encryption
  * parameters so that the block can be decrypted. This layout is as follows:
  *
  *	64	56	48	40	32	24	16	8	0
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 0	|		vdev1		| GRID  |	  ASIZE		|
+ * 0	|		vdev1		| pad   |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 1	|G|			 offset1				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 2	|		vdev2		| GRID  |	  ASIZE		|
+ * 2	|		vdev2		| pad   |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 3	|G|			 offset2				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 4	|			salt					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 5	|			IV1					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 6	|BDX|lvl| type	| cksum |E| comp|    PSIZE	|     LSIZE	|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 7	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 8	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 9	|			physical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * a	|			logical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * b	|		IV2		|	    fill count		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * c	|			checksum[0]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * d	|			checksum[1]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * e	|			MAC[0]					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * f	|			MAC[1]					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  *
  * Legend:
  *
  * salt		Salt for generating encryption keys
  * IV1		First 64 bits of encryption IV
  * X		Block requires encryption handling (set to 1)
  * E		blkptr_t contains embedded data (set to 0, see below)
  * fill count	number of non-zero blocks under this bp (truncated to 32 bits)
  * IV2		Last 32 bits of encryption IV
  * checksum[2]	128-bit checksum of the data this bp describes
  * MAC[2]	128-bit message authentication code for this data
  *
  * The X bit being set indicates that this block is one of 3 types. If this is
  * a level 0 block with an encrypted object type, the block is encrypted
  * (see BP_IS_ENCRYPTED()). If this is a level 0 block with an unencrypted
  * object type, this block is authenticated with an HMAC (see
  * BP_IS_AUTHENTICATED()). Otherwise (if level > 0), this bp will use the MAC
  * words to store a checksum-of-MACs from the level below (see
  * BP_HAS_INDIRECT_MAC_CKSUM()). For convenience in the code, BP_IS_PROTECTED()
  * refers to both encrypted and authenticated blocks and BP_USES_CRYPT()
  * refers to any of these 3 kinds of blocks.
  *
  * The additional encryption parameters are the salt, IV, and MAC which are
  * explained in greater detail in the block comment at the top of zio_crypt.c.
  * The MAC occupies half of the checksum space since it serves a very similar
  * purpose: to prevent data corruption on disk. The only functional difference
  * is that the checksum is used to detect on-disk corruption whether or not the
  * encryption key is loaded and the MAC provides additional protection against
  * malicious disk tampering. We use the 3rd DVA to store the salt and first
  * 64 bits of the IV. As a result encrypted blocks can only have 2 copies
  * maximum instead of the normal 3. The last 32 bits of the IV are stored in
  * the upper bits of what is usually the fill count. Note that only blocks at
  * level 0 or -2 are ever encrypted, which allows us to guarantee that these
  * 32 bits are not trampled over by other code (see zio_crypt.c for details).
  * The salt and IV are not used for authenticated bps or bps with an indirect
  * MAC checksum, so these blocks can utilize all 3 DVAs and the full 64 bits
  * for the fill count.
  */
 
 /*
  * "Embedded" blkptr_t's don't actually point to a block, instead they
  * have a data payload embedded in the blkptr_t itself.  See the comment
  * in blkptr.c for more details.
  *
  * The blkptr_t is laid out as follows:
  *
  *	64	56	48	40	32	24	16	8	0
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 0	|      payload                                                  |
  * 1	|      payload                                                  |
  * 2	|      payload                                                  |
  * 3	|      payload                                                  |
  * 4	|      payload                                                  |
  * 5	|      payload                                                  |
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 6	|BDX|lvl| type	| etype |E| comp| PSIZE|              LSIZE	|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 7	|      payload                                                  |
  * 8	|      payload                                                  |
  * 9	|      payload                                                  |
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * a	|			logical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * b	|      payload                                                  |
  * c	|      payload                                                  |
  * d	|      payload                                                  |
  * e	|      payload                                                  |
  * f	|      payload                                                  |
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  *
  * Legend:
  *
  * payload		contains the embedded data
  * B (byteorder)	byteorder (endianness)
  * D (dedup)		padding (set to zero)
  * X			encryption (set to zero)
  * E (embedded)		set to one
  * lvl			indirection level
  * type			DMU object type
  * etype		how to interpret embedded data (BP_EMBEDDED_TYPE_*)
  * comp			compression function of payload
  * PSIZE		size of payload after compression, in bytes
  * LSIZE		logical size of payload, in bytes
  *			note that 25 bits is enough to store the largest
  *			"normal" BP's LSIZE (2^16 * 2^9) in bytes
  * log. birth		transaction group in which the block was logically born
  *
  * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded
  * bp's they are stored in units of SPA_MINBLOCKSHIFT.
  * Generally, the generic BP_GET_*() macros can be used on embedded BP's.
  * The B, D, X, lvl, type, and comp fields are stored the same as with normal
  * BP's so the BP_SET_* macros can be used with them.  etype, PSIZE, LSIZE must
  * be set with the BPE_SET_* macros.  BP_SET_EMBEDDED() should be called before
  * other macros, as they assert that they are only used on BP's of the correct
  * "embedded-ness". Encrypted blkptr_t's cannot be embedded because they use
  * the payload space for encryption parameters (see the comment above on
  * how encryption parameters are stored).
  */
 
 #define	BPE_GET_ETYPE(bp)	\
 	(ASSERT(BP_IS_EMBEDDED(bp)), \
 	BF64_GET((bp)->blk_prop, 40, 8))
 #define	BPE_SET_ETYPE(bp, t)	do { \
 	ASSERT(BP_IS_EMBEDDED(bp)); \
 	BF64_SET((bp)->blk_prop, 40, 8, t); \
 } while (0)
 
 #define	BPE_GET_LSIZE(bp)	\
 	(ASSERT(BP_IS_EMBEDDED(bp)), \
 	BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1))
 #define	BPE_SET_LSIZE(bp, x)	do { \
 	ASSERT(BP_IS_EMBEDDED(bp)); \
 	BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \
 } while (0)
 
 #define	BPE_GET_PSIZE(bp)	\
 	(ASSERT(BP_IS_EMBEDDED(bp)), \
 	BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1))
 #define	BPE_SET_PSIZE(bp, x)	do { \
 	ASSERT(BP_IS_EMBEDDED(bp)); \
 	BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \
 } while (0)
 
 typedef enum bp_embedded_type {
 	BP_EMBEDDED_TYPE_DATA,
 	BP_EMBEDDED_TYPE_RESERVED, /* Reserved for Delphix byteswap feature. */
 	BP_EMBEDDED_TYPE_REDACTED,
 	NUM_BP_EMBEDDED_TYPES
 } bp_embedded_type_t;
 
 #define	BPE_NUM_WORDS 14
 #define	BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
 #define	BPE_IS_PAYLOADWORD(bp, wp) \
-	((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
+	((wp) != &(bp)->blk_prop && (wp) != (&(bp)->blk_birth_word[1]))
 
 #define	SPA_BLKPTRSHIFT	7		/* blkptr_t is 128 bytes	*/
 #define	SPA_DVAS_PER_BP	3		/* Number of DVAs in a bp	*/
 #define	SPA_SYNC_MIN_VDEVS 3		/* min vdevs to update during sync */
 
 /*
  * A block is a hole when it has either 1) never been written to, or
  * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads
  * without physically allocating disk space. Holes are represented in the
  * blkptr_t structure by zeroed blk_dva. Correct checking for holes is
  * done through the BP_IS_HOLE macro. For holes, the logical size, level,
  * DMU object type, and birth times are all also stored for holes that
  * were written to at some point (i.e. were punched after having been filled).
  */
 typedef struct blkptr {
 	dva_t		blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
 	uint64_t	blk_prop;	/* size, compression, type, etc	    */
 	uint64_t	blk_pad[2];	/* Extra space for the future	    */
-	uint64_t	blk_phys_birth;	/* txg when block was allocated	    */
-	uint64_t	blk_birth;	/* transaction group at birth	    */
+	uint64_t	blk_birth_word[2];
 	uint64_t	blk_fill;	/* fill count			    */
 	zio_cksum_t	blk_cksum;	/* 256-bit checksum		    */
 } blkptr_t;
 
 /*
  * Macros to get and set fields in a bp or DVA.
  */
 
 /*
  * Note, for gang blocks, DVA_GET_ASIZE() is the total space allocated for
  * this gang DVA including its children BP's.  The space allocated at this
  * DVA's vdev/offset is vdev_gang_header_asize(vdev).
  */
 #define	DVA_GET_ASIZE(dva)	\
 	BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
 #define	DVA_SET_ASIZE(dva, x)	\
 	BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
 	SPA_MINBLOCKSHIFT, 0, x)
 
-#define	DVA_GET_GRID(dva)	BF64_GET((dva)->dva_word[0], 24, 8)
-#define	DVA_SET_GRID(dva, x)	BF64_SET((dva)->dva_word[0], 24, 8, x)
-
 #define	DVA_GET_VDEV(dva)	BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS)
 #define	DVA_SET_VDEV(dva, x)	\
 	BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x)
 
 #define	DVA_GET_OFFSET(dva)	\
 	BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
 #define	DVA_SET_OFFSET(dva, x)	\
 	BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)
 
 #define	DVA_GET_GANG(dva)	BF64_GET((dva)->dva_word[1], 63, 1)
 #define	DVA_SET_GANG(dva, x)	BF64_SET((dva)->dva_word[1], 63, 1, x)
 
 #define	BP_GET_LSIZE(bp)	\
 	(BP_IS_EMBEDDED(bp) ?	\
 	(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \
 	BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
 #define	BP_SET_LSIZE(bp, x)	do { \
 	ASSERT(!BP_IS_EMBEDDED(bp)); \
 	BF64_SET_SB((bp)->blk_prop, \
 	    0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
 } while (0)
 
 #define	BP_GET_PSIZE(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1))
 #define	BP_SET_PSIZE(bp, x)	do { \
 	ASSERT(!BP_IS_EMBEDDED(bp)); \
 	BF64_SET_SB((bp)->blk_prop, \
 	    16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
 } while (0)
 
 #define	BP_GET_COMPRESS(bp)		\
 	BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS)
 #define	BP_SET_COMPRESS(bp, x)		\
 	BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x)
 
 #define	BP_IS_EMBEDDED(bp)		BF64_GET((bp)->blk_prop, 39, 1)
 #define	BP_SET_EMBEDDED(bp, x)		BF64_SET((bp)->blk_prop, 39, 1, x)
 
 #define	BP_GET_CHECKSUM(bp)		\
 	(BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \
 	BF64_GET((bp)->blk_prop, 40, 8))
 #define	BP_SET_CHECKSUM(bp, x)		do { \
 	ASSERT(!BP_IS_EMBEDDED(bp)); \
 	BF64_SET((bp)->blk_prop, 40, 8, x); \
 } while (0)
 
 #define	BP_GET_TYPE(bp)			BF64_GET((bp)->blk_prop, 48, 8)
 #define	BP_SET_TYPE(bp, x)		BF64_SET((bp)->blk_prop, 48, 8, x)
 
 #define	BP_GET_LEVEL(bp)		BF64_GET((bp)->blk_prop, 56, 5)
 #define	BP_SET_LEVEL(bp, x)		BF64_SET((bp)->blk_prop, 56, 5, x)
 
 /* encrypted, authenticated, and MAC cksum bps use the same bit */
 #define	BP_USES_CRYPT(bp)		BF64_GET((bp)->blk_prop, 61, 1)
 #define	BP_SET_CRYPT(bp, x)		BF64_SET((bp)->blk_prop, 61, 1, x)
 
 #define	BP_IS_ENCRYPTED(bp)			\
 	(BP_USES_CRYPT(bp) &&			\
 	BP_GET_LEVEL(bp) <= 0 &&		\
 	DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp)))
 
 #define	BP_IS_AUTHENTICATED(bp)			\
 	(BP_USES_CRYPT(bp) &&			\
 	BP_GET_LEVEL(bp) <= 0 &&		\
 	!DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp)))
 
 #define	BP_HAS_INDIRECT_MAC_CKSUM(bp)		\
 	(BP_USES_CRYPT(bp) && BP_GET_LEVEL(bp) > 0)
 
 #define	BP_IS_PROTECTED(bp)			\
 	(BP_IS_ENCRYPTED(bp) || BP_IS_AUTHENTICATED(bp))
 
 #define	BP_GET_DEDUP(bp)		BF64_GET((bp)->blk_prop, 62, 1)
 #define	BP_SET_DEDUP(bp, x)		BF64_SET((bp)->blk_prop, 62, 1, x)
 
 #define	BP_GET_BYTEORDER(bp)		BF64_GET((bp)->blk_prop, 63, 1)
 #define	BP_SET_BYTEORDER(bp, x)		BF64_SET((bp)->blk_prop, 63, 1, x)
 
 #define	BP_GET_FREE(bp)			BF64_GET((bp)->blk_fill, 0, 1)
 #define	BP_SET_FREE(bp, x)		BF64_SET((bp)->blk_fill, 0, 1, x)
 
-#define	BP_PHYSICAL_BIRTH(bp)		\
-	(BP_IS_EMBEDDED(bp) ? 0 : \
-	(bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
+#define	BP_GET_LOGICAL_BIRTH(bp)	(bp)->blk_birth_word[1]
+#define	BP_SET_LOGICAL_BIRTH(bp, x)	((bp)->blk_birth_word[1] = (x))
+
+#define	BP_GET_PHYSICAL_BIRTH(bp)	(bp)->blk_birth_word[0]
+#define	BP_SET_PHYSICAL_BIRTH(bp, x)	((bp)->blk_birth_word[0] = (x))
+
+#define	BP_GET_BIRTH(bp)					\
+	(BP_IS_EMBEDDED(bp) ? 0 : 				\
+	BP_GET_PHYSICAL_BIRTH(bp) ? BP_GET_PHYSICAL_BIRTH(bp) :	\
+	BP_GET_LOGICAL_BIRTH(bp))
 
 #define	BP_SET_BIRTH(bp, logical, physical)	\
 {						\
 	ASSERT(!BP_IS_EMBEDDED(bp));		\
-	(bp)->blk_birth = (logical);		\
-	(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
+	BP_SET_LOGICAL_BIRTH(bp, logical);	\
+	BP_SET_PHYSICAL_BIRTH(bp, 		\
+	    ((logical) == (physical) ? 0 : (physical))); \
 }
 
 #define	BP_GET_FILL(bp)				\
 	((BP_IS_ENCRYPTED(bp)) ? BF64_GET((bp)->blk_fill, 0, 32) : \
 	((BP_IS_EMBEDDED(bp)) ? 1 : (bp)->blk_fill))
 
 #define	BP_SET_FILL(bp, fill)			\
 {						\
 	if (BP_IS_ENCRYPTED(bp))			\
 		BF64_SET((bp)->blk_fill, 0, 32, fill); \
 	else					\
 		(bp)->blk_fill = fill;		\
 }
 
 #define	BP_GET_IV2(bp)				\
 	(ASSERT(BP_IS_ENCRYPTED(bp)),		\
 	BF64_GET((bp)->blk_fill, 32, 32))
 #define	BP_SET_IV2(bp, iv2)			\
 {						\
 	ASSERT(BP_IS_ENCRYPTED(bp));		\
 	BF64_SET((bp)->blk_fill, 32, 32, iv2);	\
 }
 
 #define	BP_IS_METADATA(bp)	\
 	(BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
 
 #define	BP_GET_ASIZE(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
 	DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
 	(DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp)))
 
 #define	BP_GET_UCSIZE(bp)	\
 	(BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
 
 #define	BP_GET_NDVAS(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
 	!!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
 	(!!DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp)))
 
 #define	BP_COUNT_GANG(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	(DVA_GET_GANG(&(bp)->blk_dva[0]) + \
 	DVA_GET_GANG(&(bp)->blk_dva[1]) + \
 	(DVA_GET_GANG(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))))
 
 #define	DVA_EQUAL(dva1, dva2)	\
 	((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
 	(dva1)->dva_word[0] == (dva2)->dva_word[0])
 
 #define	BP_EQUAL(bp1, bp2)	\
-	(BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) &&	\
-	(bp1)->blk_birth == (bp2)->blk_birth &&			\
+	(BP_GET_BIRTH(bp1) == BP_GET_BIRTH(bp2) &&	\
+	BP_GET_LOGICAL_BIRTH(bp1) == BP_GET_LOGICAL_BIRTH(bp2) &&	\
 	DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) &&	\
 	DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) &&	\
 	DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
 
 
 #define	DVA_IS_VALID(dva)	(DVA_GET_ASIZE(dva) != 0)
 
 #define	BP_IDENTITY(bp)		(ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0])
 #define	BP_IS_GANG(bp)		\
 	(BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp)))
 #define	DVA_IS_EMPTY(dva)	((dva)->dva_word[0] == 0ULL &&	\
 				(dva)->dva_word[1] == 0ULL)
 #define	BP_IS_HOLE(bp) \
 	(!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp)))
 
 #define	BP_SET_REDACTED(bp) \
 {							\
 	BP_SET_EMBEDDED(bp, B_TRUE);			\
 	BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_REDACTED);	\
 }
 #define	BP_IS_REDACTED(bp) \
 	(BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_REDACTED)
 
 /* BP_IS_RAIDZ(bp) assumes no block compression */
 #define	BP_IS_RAIDZ(bp)		(DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
 				BP_GET_PSIZE(bp))
 
 #define	BP_ZERO(bp)				\
 {						\
 	(bp)->blk_dva[0].dva_word[0] = 0;	\
 	(bp)->blk_dva[0].dva_word[1] = 0;	\
 	(bp)->blk_dva[1].dva_word[0] = 0;	\
 	(bp)->blk_dva[1].dva_word[1] = 0;	\
 	(bp)->blk_dva[2].dva_word[0] = 0;	\
 	(bp)->blk_dva[2].dva_word[1] = 0;	\
 	(bp)->blk_prop = 0;			\
 	(bp)->blk_pad[0] = 0;			\
 	(bp)->blk_pad[1] = 0;			\
-	(bp)->blk_phys_birth = 0;		\
-	(bp)->blk_birth = 0;			\
+	(bp)->blk_birth_word[0] = 0;		\
+	(bp)->blk_birth_word[1] = 0;		\
 	(bp)->blk_fill = 0;			\
 	ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0);	\
 }
 
 #ifdef _ZFS_BIG_ENDIAN
 #define	ZFS_HOST_BYTEORDER	(0ULL)
 #else
 #define	ZFS_HOST_BYTEORDER	(1ULL)
 #endif
 
 #define	BP_SHOULD_BYTESWAP(bp)	(BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
 
 #define	BP_SPRINTF_LEN	400
 
 /*
  * This macro allows code sharing between zfs, libzpool, and mdb.
  * 'func' is either kmem_scnprintf() or mdb_snprintf().
  * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
  */
 
 #define	SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \
 {									\
 	static const char *const copyname[] =				\
 	    { "zero", "single", "double", "triple" };			\
 	int len = 0;							\
 	int copies = 0;							\
 	const char *crypt_type;						\
 	if (bp != NULL) {						\
 		if (BP_IS_ENCRYPTED(bp)) {				\
 			crypt_type = "encrypted";			\
 			/* LINTED E_SUSPICIOUS_COMPARISON */		\
 		} else if (BP_IS_AUTHENTICATED(bp)) {			\
 			crypt_type = "authenticated";			\
 		} else if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {		\
 			crypt_type = "indirect-MAC";			\
 		} else {						\
 			crypt_type = "unencrypted";			\
 		}							\
 	}								\
 	if (bp == NULL) {						\
 		len += func(buf + len, size - len, "<NULL>");		\
 	} else if (BP_IS_HOLE(bp)) {					\
 		len += func(buf + len, size - len,			\
 		    "HOLE [L%llu %s] "					\
 		    "size=%llxL birth=%lluL",				\
 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
 		    type,						\
 		    (u_longlong_t)BP_GET_LSIZE(bp),			\
-		    (u_longlong_t)bp->blk_birth);			\
+		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));		\
 	} else if (BP_IS_EMBEDDED(bp)) {				\
 		len = func(buf + len, size - len,			\
 		    "EMBEDDED [L%llu %s] et=%u %s "			\
 		    "size=%llxL/%llxP birth=%lluL",			\
 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
 		    type,						\
 		    (int)BPE_GET_ETYPE(bp),				\
 		    compress,						\
 		    (u_longlong_t)BPE_GET_LSIZE(bp),			\
 		    (u_longlong_t)BPE_GET_PSIZE(bp),			\
-		    (u_longlong_t)bp->blk_birth);			\
+		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));		\
 	} else if (BP_IS_REDACTED(bp)) {				\
 		len += func(buf + len, size - len,			\
 		    "REDACTED [L%llu %s] size=%llxL birth=%lluL",	\
 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
 		    type,						\
 		    (u_longlong_t)BP_GET_LSIZE(bp),			\
-		    (u_longlong_t)bp->blk_birth);			\
+		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));		\
 	} else {							\
 		for (int d = 0; d < BP_GET_NDVAS(bp); d++) {		\
 			const dva_t *dva = &bp->blk_dva[d];		\
 			if (DVA_IS_VALID(dva))				\
 				copies++;				\
 			len += func(buf + len, size - len,		\
 			    "DVA[%d]=<%llu:%llx:%llx>%c", d,		\
 			    (u_longlong_t)DVA_GET_VDEV(dva),		\
 			    (u_longlong_t)DVA_GET_OFFSET(dva),		\
 			    (u_longlong_t)DVA_GET_ASIZE(dva),		\
 			    ws);					\
 		}							\
 		ASSERT3S(copies, >, 0);					\
 		if (BP_IS_ENCRYPTED(bp)) {				\
 			len += func(buf + len, size - len,		\
 			    "salt=%llx iv=%llx:%llx%c",			\
 			    (u_longlong_t)bp->blk_dva[2].dva_word[0],	\
 			    (u_longlong_t)bp->blk_dva[2].dva_word[1],	\
 			    (u_longlong_t)BP_GET_IV2(bp),		\
 			    ws);					\
 		}							\
 		if (BP_IS_GANG(bp) &&					\
 		    DVA_GET_ASIZE(&bp->blk_dva[2]) <=			\
 		    DVA_GET_ASIZE(&bp->blk_dva[1]) / 2)			\
 			copies--;					\
 		len += func(buf + len, size - len,			\
 		    "[L%llu %s] %s %s %s %s %s %s %s%c"			\
 		    "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c"	\
 		    "cksum=%016llx:%016llx:%016llx:%016llx",		\
 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
 		    type,						\
 		    checksum,						\
 		    compress,						\
 		    crypt_type,						\
 		    BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",		\
 		    BP_IS_GANG(bp) ? "gang" : "contiguous",		\
 		    BP_GET_DEDUP(bp) ? "dedup" : "unique",		\
 		    copyname[copies],					\
 		    ws,							\
 		    (u_longlong_t)BP_GET_LSIZE(bp),			\
 		    (u_longlong_t)BP_GET_PSIZE(bp),			\
-		    (u_longlong_t)bp->blk_birth,			\
-		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp),		\
+		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp),		\
+		    (u_longlong_t)BP_GET_BIRTH(bp),			\
 		    (u_longlong_t)BP_GET_FILL(bp),			\
 		    ws,							\
 		    (u_longlong_t)bp->blk_cksum.zc_word[0],		\
 		    (u_longlong_t)bp->blk_cksum.zc_word[1],		\
 		    (u_longlong_t)bp->blk_cksum.zc_word[2],		\
 		    (u_longlong_t)bp->blk_cksum.zc_word[3]);		\
 	}								\
 	ASSERT(len < size);						\
 }
 
 #define	BP_GET_BUFC_TYPE(bp)						\
 	(BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
 
 typedef enum spa_import_type {
 	SPA_IMPORT_EXISTING,
 	SPA_IMPORT_ASSEMBLE
 } spa_import_type_t;
 
 typedef enum spa_mode {
 	SPA_MODE_UNINIT = 0,
 	SPA_MODE_READ = 1,
 	SPA_MODE_WRITE = 2,
 } spa_mode_t;
 
 /*
  * Send TRIM commands in-line during normal pool operation while deleting.
  *	OFF: no
  *	ON: yes
  */
 typedef enum {
 	SPA_AUTOTRIM_OFF = 0,	/* default */
 	SPA_AUTOTRIM_ON,
 } spa_autotrim_t;
 
 /*
  * Reason TRIM command was issued, used internally for accounting purposes.
  */
 typedef enum trim_type {
 	TRIM_TYPE_MANUAL = 0,
 	TRIM_TYPE_AUTO = 1,
 	TRIM_TYPE_SIMPLE = 2
 } trim_type_t;
 
 /* state manipulation functions */
 extern int spa_open(const char *pool, spa_t **, const void *tag);
 extern int spa_open_rewind(const char *pool, spa_t **, const void *tag,
     nvlist_t *policy, nvlist_t **config);
 extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot,
     size_t buflen);
 extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
     nvlist_t *zplprops, struct dsl_crypto_params *dcp);
 extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props,
     uint64_t flags);
 extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
 extern int spa_destroy(const char *pool);
 extern int spa_checkpoint(const char *pool);
 extern int spa_checkpoint_discard(const char *pool);
 extern int spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force,
     boolean_t hardforce);
 extern int spa_reset(const char *pool);
 extern void spa_async_request(spa_t *spa, int flag);
 extern void spa_async_unrequest(spa_t *spa, int flag);
 extern void spa_async_suspend(spa_t *spa);
 extern void spa_async_resume(spa_t *spa);
 extern int spa_async_tasks(spa_t *spa);
 extern spa_t *spa_inject_addref(char *pool);
 extern void spa_inject_delref(spa_t *spa);
 extern void spa_scan_stat_init(spa_t *spa);
 extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
 extern int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
 extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
 
 #define	SPA_ASYNC_CONFIG_UPDATE			0x01
 #define	SPA_ASYNC_REMOVE			0x02
 #define	SPA_ASYNC_PROBE				0x04
 #define	SPA_ASYNC_RESILVER_DONE			0x08
 #define	SPA_ASYNC_RESILVER			0x10
 #define	SPA_ASYNC_AUTOEXPAND			0x20
 #define	SPA_ASYNC_REMOVE_DONE			0x40
 #define	SPA_ASYNC_REMOVE_STOP			0x80
 #define	SPA_ASYNC_INITIALIZE_RESTART		0x100
 #define	SPA_ASYNC_TRIM_RESTART			0x200
 #define	SPA_ASYNC_AUTOTRIM_RESTART		0x400
 #define	SPA_ASYNC_L2CACHE_REBUILD		0x800
 #define	SPA_ASYNC_L2CACHE_TRIM			0x1000
 #define	SPA_ASYNC_REBUILD_DONE			0x2000
 #define	SPA_ASYNC_DETACH_SPARE			0x4000
 
 /* device manipulation */
 extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
 extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
     int replacing, int rebuild);
 extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
     int replace_done);
 extern int spa_vdev_alloc(spa_t *spa, uint64_t guid);
 extern int spa_vdev_noalloc(spa_t *spa, uint64_t guid);
 extern boolean_t spa_vdev_remove_active(spa_t *spa);
 extern int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
     nvlist_t *vdev_errlist);
 extern int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
     uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist);
 extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
 extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
 extern int spa_vdev_split_mirror(spa_t *spa, const char *newname,
     nvlist_t *config, nvlist_t *props, boolean_t exp);
 
 /* spare state (which is global across all pools) */
 extern void spa_spare_add(vdev_t *vd);
 extern void spa_spare_remove(vdev_t *vd);
 extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt);
 extern void spa_spare_activate(vdev_t *vd);
 
 /* L2ARC state (which is global across all pools) */
 extern void spa_l2cache_add(vdev_t *vd);
 extern void spa_l2cache_remove(vdev_t *vd);
 extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool);
 extern void spa_l2cache_activate(vdev_t *vd);
 extern void spa_l2cache_drop(spa_t *spa);
 
 /* scanning */
 extern int spa_scan(spa_t *spa, pool_scan_func_t func);
 extern int spa_scan_stop(spa_t *spa);
 extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag);
 
 /* spa syncing */
 extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
 extern void spa_sync_allpools(void);
 
 extern uint_t zfs_sync_pass_deferred_free;
 
 /* spa sync taskqueues */
 taskq_t *spa_sync_tq_create(spa_t *spa, const char *name);
 void spa_sync_tq_destroy(spa_t *spa);
 void spa_select_allocator(zio_t *zio);
 
 /* spa namespace global mutex */
 extern kmutex_t spa_namespace_lock;
 
 /*
  * SPA configuration functions in spa_config.c
  */
 
 #define	SPA_CONFIG_UPDATE_POOL	0
 #define	SPA_CONFIG_UPDATE_VDEVS	1
 
 extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t, boolean_t);
 extern void spa_config_load(void);
 extern int spa_all_configs(uint64_t *generation, nvlist_t **pools);
 extern void spa_config_set(spa_t *spa, nvlist_t *config);
 extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
     int getstats);
 extern void spa_config_update(spa_t *spa, int what);
 extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv,
     vdev_t *parent, uint_t id, int atype);
 
 
 /*
  * Miscellaneous SPA routines in spa_misc.c
  */
 
 /* Namespace manipulation */
 extern spa_t *spa_lookup(const char *name);
 extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot);
 extern void spa_remove(spa_t *spa);
 extern spa_t *spa_next(spa_t *prev);
 
 /* Refcount functions */
 extern void spa_open_ref(spa_t *spa, const void *tag);
 extern void spa_close(spa_t *spa, const void *tag);
 extern void spa_async_close(spa_t *spa, const void *tag);
 extern boolean_t spa_refcount_zero(spa_t *spa);
 
 #define	SCL_NONE	0x00
 #define	SCL_CONFIG	0x01
 #define	SCL_STATE	0x02
 #define	SCL_L2ARC	0x04		/* hack until L2ARC 2.0 */
 #define	SCL_ALLOC	0x08
 #define	SCL_ZIO		0x10
 #define	SCL_FREE	0x20
 #define	SCL_VDEV	0x40
 #define	SCL_LOCKS	7
 #define	SCL_ALL		((1 << SCL_LOCKS) - 1)
 #define	SCL_STATE_ALL	(SCL_STATE | SCL_L2ARC | SCL_ZIO)
 
 /* Historical pool statistics */
 typedef struct spa_history_kstat {
 	kmutex_t		lock;
 	uint64_t		count;
 	uint64_t		size;
 	kstat_t			*kstat;
 	void			*priv;
 	list_t			list;
 } spa_history_kstat_t;
 
 typedef struct spa_history_list {
 	uint64_t		size;
 	procfs_list_t		procfs_list;
 } spa_history_list_t;
 
 typedef struct spa_stats {
 	spa_history_list_t	read_history;
 	spa_history_list_t	txg_history;
 	spa_history_kstat_t	tx_assign_histogram;
 	spa_history_list_t	mmp_history;
 	spa_history_kstat_t	state;		/* pool state */
 	spa_history_kstat_t	guid;		/* pool guid */
 	spa_history_kstat_t	iostats;
 } spa_stats_t;
 
 typedef enum txg_state {
 	TXG_STATE_BIRTH		= 0,
 	TXG_STATE_OPEN		= 1,
 	TXG_STATE_QUIESCED	= 2,
 	TXG_STATE_WAIT_FOR_SYNC	= 3,
 	TXG_STATE_SYNCED	= 4,
 	TXG_STATE_COMMITTED	= 5,
 } txg_state_t;
 
 typedef struct txg_stat {
 	vdev_stat_t		vs1;
 	vdev_stat_t		vs2;
 	uint64_t		txg;
 	uint64_t		ndirty;
 } txg_stat_t;
 
 /* Assorted pool IO kstats */
 typedef struct spa_iostats {
 	kstat_named_t	trim_extents_written;
 	kstat_named_t	trim_bytes_written;
 	kstat_named_t	trim_extents_skipped;
 	kstat_named_t	trim_bytes_skipped;
 	kstat_named_t	trim_extents_failed;
 	kstat_named_t	trim_bytes_failed;
 	kstat_named_t	autotrim_extents_written;
 	kstat_named_t	autotrim_bytes_written;
 	kstat_named_t	autotrim_extents_skipped;
 	kstat_named_t	autotrim_bytes_skipped;
 	kstat_named_t	autotrim_extents_failed;
 	kstat_named_t	autotrim_bytes_failed;
 	kstat_named_t	simple_trim_extents_written;
 	kstat_named_t	simple_trim_bytes_written;
 	kstat_named_t	simple_trim_extents_skipped;
 	kstat_named_t	simple_trim_bytes_skipped;
 	kstat_named_t	simple_trim_extents_failed;
 	kstat_named_t	simple_trim_bytes_failed;
 } spa_iostats_t;
 
 extern void spa_stats_init(spa_t *spa);
 extern void spa_stats_destroy(spa_t *spa);
 extern void spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb,
     uint32_t aflags);
 extern void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time);
 extern int spa_txg_history_set(spa_t *spa,  uint64_t txg,
     txg_state_t completed_state, hrtime_t completed_time);
 extern txg_stat_t *spa_txg_history_init_io(spa_t *, uint64_t,
     struct dsl_pool *);
 extern void spa_txg_history_fini_io(spa_t *, txg_stat_t *);
 extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs);
 extern int spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id);
 extern int spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error,
     hrtime_t duration);
 extern void spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
     uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id,
     int error);
 extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type,
     uint64_t extents_written, uint64_t bytes_written,
     uint64_t extents_skipped, uint64_t bytes_skipped,
     uint64_t extents_failed, uint64_t bytes_failed);
 extern void spa_import_progress_add(spa_t *spa);
 extern void spa_import_progress_remove(uint64_t spa_guid);
 extern int spa_import_progress_set_mmp_check(uint64_t pool_guid,
     uint64_t mmp_sec_remaining);
 extern int spa_import_progress_set_max_txg(uint64_t pool_guid,
     uint64_t max_txg);
 extern int spa_import_progress_set_state(uint64_t pool_guid,
     spa_load_state_t spa_load_state);
 extern void spa_import_progress_set_notes(spa_t *spa,
     const char *fmt, ...) __printflike(2, 3);
 extern void spa_import_progress_set_notes_nolog(spa_t *spa,
     const char *fmt, ...) __printflike(2, 3);
 
 /* Pool configuration locks */
 extern int spa_config_tryenter(spa_t *spa, int locks, const void *tag,
     krw_t rw);
 extern void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw);
 extern void spa_config_enter_mmp(spa_t *spa, int locks, const void *tag,
     krw_t rw);
 extern void spa_config_exit(spa_t *spa, int locks, const void *tag);
 extern int spa_config_held(spa_t *spa, int locks, krw_t rw);
 
 /* Pool vdev add/remove lock */
 extern uint64_t spa_vdev_enter(spa_t *spa);
 extern uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid);
 extern uint64_t spa_vdev_config_enter(spa_t *spa);
 extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg,
     int error, const char *tag);
 extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
 
 /* Pool vdev state change lock */
 extern void spa_vdev_state_enter(spa_t *spa, int oplock);
 extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error);
 
 /* Log state */
 typedef enum spa_log_state {
 	SPA_LOG_UNKNOWN = 0,	/* unknown log state */
 	SPA_LOG_MISSING,	/* missing log(s) */
 	SPA_LOG_CLEAR,		/* clear the log(s) */
 	SPA_LOG_GOOD,		/* log(s) are good */
 } spa_log_state_t;
 
 extern spa_log_state_t spa_get_log_state(spa_t *spa);
 extern void spa_set_log_state(spa_t *spa, spa_log_state_t state);
 extern int spa_reset_logs(spa_t *spa);
 
 /* Log claim callback */
 extern void spa_claim_notify(zio_t *zio);
 extern void spa_deadman(void *);
 
 /* Accessor functions */
 extern boolean_t spa_shutting_down(spa_t *spa);
 extern struct dsl_pool *spa_get_dsl(spa_t *spa);
 extern boolean_t spa_is_initializing(spa_t *spa);
 extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa);
 extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
 extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
 extern void spa_altroot(spa_t *, char *, size_t);
 extern uint32_t spa_sync_pass(spa_t *spa);
 extern char *spa_name(spa_t *spa);
 extern uint64_t spa_guid(spa_t *spa);
 extern uint64_t spa_load_guid(spa_t *spa);
 extern uint64_t spa_last_synced_txg(spa_t *spa);
 extern uint64_t spa_first_txg(spa_t *spa);
 extern uint64_t spa_syncing_txg(spa_t *spa);
 extern uint64_t spa_final_dirty_txg(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
 extern pool_state_t spa_state(spa_t *spa);
 extern spa_load_state_t spa_load_state(spa_t *spa);
 extern uint64_t spa_freeze_txg(spa_t *spa);
 extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize);
 extern uint64_t spa_get_dspace(spa_t *spa);
 extern uint64_t spa_get_checkpoint_space(spa_t *spa);
 extern uint64_t spa_get_slop_space(spa_t *spa);
 extern void spa_update_dspace(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
 extern boolean_t spa_deflate(spa_t *spa);
 extern metaslab_class_t *spa_normal_class(spa_t *spa);
 extern metaslab_class_t *spa_log_class(spa_t *spa);
 extern metaslab_class_t *spa_embedded_log_class(spa_t *spa);
 extern metaslab_class_t *spa_special_class(spa_t *spa);
 extern metaslab_class_t *spa_dedup_class(spa_t *spa);
 extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size,
     dmu_object_type_t objtype, uint_t level, uint_t special_smallblk);
 
 extern void spa_evicting_os_register(spa_t *, objset_t *os);
 extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
 extern void spa_evicting_os_wait(spa_t *spa);
 extern int spa_max_replication(spa_t *spa);
 extern int spa_prev_software_version(spa_t *spa);
 extern uint64_t spa_get_failmode(spa_t *spa);
 extern uint64_t spa_get_deadman_failmode(spa_t *spa);
 extern void spa_set_deadman_failmode(spa_t *spa, const char *failmode);
 extern boolean_t spa_suspended(spa_t *spa);
 extern uint64_t spa_bootfs(spa_t *spa);
 extern uint64_t spa_delegation(spa_t *spa);
 extern objset_t *spa_meta_objset(spa_t *spa);
 extern space_map_t *spa_syncing_log_sm(spa_t *spa);
 extern uint64_t spa_deadman_synctime(spa_t *spa);
 extern uint64_t spa_deadman_ziotime(spa_t *spa);
 extern uint64_t spa_dirty_data(spa_t *spa);
 extern spa_autotrim_t spa_get_autotrim(spa_t *spa);
 extern int spa_get_allocator(spa_t *spa);
 extern void spa_set_allocator(spa_t *spa, const char *allocator);
 
 /* Miscellaneous support routines */
 extern void spa_load_failed(spa_t *spa, const char *fmt, ...)
     __attribute__((format(printf, 2, 3)));
 extern void spa_load_note(spa_t *spa, const char *fmt, ...)
     __attribute__((format(printf, 2, 3)));
 extern void spa_activate_mos_feature(spa_t *spa, const char *feature,
     dmu_tx_t *tx);
 extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature);
 extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
 extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
 extern char *spa_strdup(const char *);
 extern void spa_strfree(char *);
 extern uint64_t spa_generate_guid(spa_t *spa);
 extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);
 extern void spa_freeze(spa_t *spa);
 extern int spa_change_guid(spa_t *spa);
 extern void spa_upgrade(spa_t *spa, uint64_t version);
 extern void spa_evict_all(void);
 extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,
     boolean_t l2cache);
 extern boolean_t spa_has_l2cache(spa_t *, uint64_t guid);
 extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
 extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva);
 extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp);
 extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
 extern boolean_t spa_has_slogs(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
 extern boolean_t spa_writeable(spa_t *spa);
 extern boolean_t spa_has_pending_synctask(spa_t *spa);
 extern int spa_maxblocksize(spa_t *spa);
 extern int spa_maxdnodesize(spa_t *spa);
 extern boolean_t spa_has_checkpoint(spa_t *spa);
 extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa);
 extern boolean_t spa_suspend_async_destroy(spa_t *spa);
 extern uint64_t spa_min_claim_txg(spa_t *spa);
 extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva,
     const blkptr_t *bp);
 typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size,
     void *arg);
 extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp,
     spa_remap_cb_t callback, void *arg);
 extern uint64_t spa_get_last_removal_txg(spa_t *spa);
 extern boolean_t spa_trust_config(spa_t *spa);
 extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
 extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
 extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa);
 extern uint64_t spa_total_metaslabs(spa_t *spa);
 extern boolean_t spa_multihost(spa_t *spa);
 extern uint32_t spa_get_hostid(spa_t *spa);
 extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
 extern boolean_t spa_livelist_delete_check(spa_t *spa);
 
 extern spa_mode_t spa_mode(spa_t *spa);
 extern uint64_t zfs_strtonum(const char *str, char **nptr);
 
 extern char *spa_his_ievent_table[];
 
 extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx);
 extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
     char *his_buf);
 extern int spa_history_log(spa_t *spa, const char *his_buf);
 extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl);
 extern void spa_history_log_version(spa_t *spa, const char *operation,
     dmu_tx_t *tx);
 extern void spa_history_log_internal(spa_t *spa, const char *operation,
     dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5);
 extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op,
     dmu_tx_t *tx, const char *fmt, ...)  __printflike(4, 5);
 extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
     dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5);
 
 extern const char *spa_state_to_name(spa_t *spa);
 
 /* error handling */
 struct zbookmark_phys;
 extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb,
-    const uint64_t *birth);
+    const uint64_t birth);
 extern void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb,
-    const uint64_t *birth);
+    uint64_t birth);
 extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, zio_t *zio, uint64_t state);
 extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd,
     zio_t *zio);
 extern void zfs_ereport_taskq_fini(void);
 extern void zfs_ereport_clear(spa_t *spa, vdev_t *vd);
 extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type,
     const char *name, nvlist_t *aux);
 extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
 extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate);
 extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
 extern uint64_t spa_approx_errlog_size(spa_t *spa);
 extern int spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count);
 extern uint64_t spa_get_last_errlog_size(spa_t *spa);
 extern void spa_errlog_rotate(spa_t *spa);
 extern void spa_errlog_drain(spa_t *spa);
 extern void spa_errlog_sync(spa_t *spa, uint64_t txg);
 extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub);
 extern void spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx);
 extern void spa_swap_errlog(spa_t *spa, uint64_t new_head_ds,
     uint64_t old_head_ds, dmu_tx_t *tx);
 extern void sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj,
     dmu_tx_t *tx);
 extern void spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx);
 extern int find_top_affected_fs(spa_t *spa, uint64_t head_ds,
     zbookmark_err_phys_t *zep, uint64_t *top_affected_fs);
 extern int find_birth_txg(struct dsl_dataset *ds, zbookmark_err_phys_t *zep,
     uint64_t *birth_txg);
 extern void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep,
     zbookmark_phys_t *zb);
 extern void name_to_errphys(char *buf, zbookmark_err_phys_t *zep);
 
 /* vdev mirror */
 extern void vdev_mirror_stat_init(void);
 extern void vdev_mirror_stat_fini(void);
 
 /* Initialization and termination */
 extern void spa_init(spa_mode_t mode);
 extern void spa_fini(void);
 extern void spa_boot_init(void);
 
 /* properties */
 extern int spa_prop_set(spa_t *spa, nvlist_t *nvp);
 extern int spa_prop_get(spa_t *spa, nvlist_t **nvp);
 extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx);
 extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t);
 
 /* asynchronous event notification */
 extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl,
     const char *name);
 extern void zfs_ereport_zvol_post(const char *subclass, const char *name,
     const char *device_name, const char *raw_name);
 
 /* waiting for pool activities to complete */
 extern int spa_wait(const char *pool, zpool_wait_activity_t activity,
     boolean_t *waited);
 extern int spa_wait_tag(const char *name, zpool_wait_activity_t activity,
     uint64_t tag, boolean_t *waited);
 extern void spa_notify_waiters(spa_t *spa);
 extern void spa_wake_waiters(spa_t *spa);
 
 extern void spa_import_os(spa_t *spa);
 extern void spa_export_os(spa_t *spa);
 extern void spa_activate_os(spa_t *spa);
 extern void spa_deactivate_os(spa_t *spa);
 
 /* module param call functions */
 int param_set_deadman_ziotime(ZFS_MODULE_PARAM_ARGS);
 int param_set_deadman_synctime(ZFS_MODULE_PARAM_ARGS);
 int param_set_slop_shift(ZFS_MODULE_PARAM_ARGS);
 int param_set_deadman_failmode(ZFS_MODULE_PARAM_ARGS);
 int param_set_active_allocator(ZFS_MODULE_PARAM_ARGS);
 
 #ifdef ZFS_DEBUG
 #define	dprintf_bp(bp, fmt, ...) do {				\
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) {			\
 	char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);	\
 	snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp));	\
 	dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf);		\
 	kmem_free(__blkbuf, BP_SPRINTF_LEN);			\
 	} \
 } while (0)
 #else
 #define	dprintf_bp(bp, fmt, ...)
 #endif
 
 extern spa_mode_t spa_mode_global;
 extern int zfs_deadman_enabled;
 extern uint64_t zfs_deadman_synctime_ms;
 extern uint64_t zfs_deadman_ziotime_ms;
 extern uint64_t zfs_deadman_checktime_ms;
 
 extern kmem_cache_t *zio_buf_cache[];
 extern kmem_cache_t *zio_data_buf_cache[];
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_SPA_H */
diff --git a/include/sys/uberblock_impl.h b/include/sys/uberblock_impl.h
index d3a71cc8f84b..1736b32cd3c6 100644
--- a/include/sys/uberblock_impl.h
+++ b/include/sys/uberblock_impl.h
@@ -1,180 +1,180 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_UBERBLOCK_IMPL_H
 #define	_SYS_UBERBLOCK_IMPL_H
 
 #include <sys/uberblock.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * The uberblock version is incremented whenever an incompatible on-disk
  * format change is made to the SPA, DMU, or ZAP.
  *
  * Note: the first two fields should never be moved.  When a storage pool
  * is opened, the uberblock must be read off the disk before the version
  * can be checked.  If the ub_version field is moved, we may not detect
  * version mismatch.  If the ub_magic field is moved, applications that
  * expect the magic number in the first word won't work.
  */
 #define	UBERBLOCK_MAGIC		0x00bab10c		/* oo-ba-bloc!	*/
 #define	UBERBLOCK_SHIFT		10			/* up to 1K	*/
 #define	MMP_MAGIC		0xa11cea11		/* all-see-all	*/
 
 #define	MMP_INTERVAL_VALID_BIT	0x01
 #define	MMP_SEQ_VALID_BIT	0x02
 #define	MMP_FAIL_INT_VALID_BIT	0x04
 
 #define	MMP_VALID(ubp)		(ubp->ub_magic == UBERBLOCK_MAGIC && \
 				    ubp->ub_mmp_magic == MMP_MAGIC)
 #define	MMP_INTERVAL_VALID(ubp)	(MMP_VALID(ubp) && (ubp->ub_mmp_config & \
 				    MMP_INTERVAL_VALID_BIT))
 #define	MMP_SEQ_VALID(ubp)	(MMP_VALID(ubp) && (ubp->ub_mmp_config & \
 				    MMP_SEQ_VALID_BIT))
 #define	MMP_FAIL_INT_VALID(ubp)	(MMP_VALID(ubp) && (ubp->ub_mmp_config & \
 				    MMP_FAIL_INT_VALID_BIT))
 
 #define	MMP_INTERVAL(ubp)	((ubp->ub_mmp_config & 0x00000000FFFFFF00) \
 				    >> 8)
 #define	MMP_SEQ(ubp)		((ubp->ub_mmp_config & 0x0000FFFF00000000) \
 				    >> 32)
 #define	MMP_FAIL_INT(ubp)	((ubp->ub_mmp_config & 0xFFFF000000000000) \
 				    >> 48)
 
 #define	MMP_INTERVAL_SET(write) \
 	    (((uint64_t)(write & 0xFFFFFF) << 8) | MMP_INTERVAL_VALID_BIT)
 
 #define	MMP_SEQ_SET(seq) \
 	    (((uint64_t)(seq & 0xFFFF) << 32) | MMP_SEQ_VALID_BIT)
 
 #define	MMP_FAIL_INT_SET(fail) \
 	    (((uint64_t)(fail & 0xFFFF) << 48) | MMP_FAIL_INT_VALID_BIT)
 
 /*
  * RAIDZ expansion reflow information.
  *
  *	64      56      48      40      32      24      16      8       0
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  *	|Scratch |                    Reflow                            |
  *	| State  |                    Offset                            |
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  */
 typedef enum raidz_reflow_scratch_state {
 	RRSS_SCRATCH_NOT_IN_USE = 0,
 	RRSS_SCRATCH_VALID,
 	RRSS_SCRATCH_INVALID_SYNCED,
 	RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT,
 	RRSS_SCRATCH_INVALID_SYNCED_REFLOW
 } raidz_reflow_scratch_state_t;
 
 #define	RRSS_GET_OFFSET(ub) \
 	BF64_GET_SB((ub)->ub_raidz_reflow_info, 0, 55, SPA_MINBLOCKSHIFT, 0)
 #define	RRSS_SET_OFFSET(ub, x) \
 	BF64_SET_SB((ub)->ub_raidz_reflow_info, 0, 55, SPA_MINBLOCKSHIFT, 0, x)
 
 #define	RRSS_GET_STATE(ub) \
 	BF64_GET((ub)->ub_raidz_reflow_info, 55, 9)
 #define	RRSS_SET_STATE(ub, x) \
 	BF64_SET((ub)->ub_raidz_reflow_info, 55, 9, x)
 
 #define	RAIDZ_REFLOW_SET(ub, state, offset) do { \
 	(ub)->ub_raidz_reflow_info = 0; \
 	RRSS_SET_OFFSET(ub, offset); \
 	RRSS_SET_STATE(ub, state); \
 } while (0)
 
 struct uberblock {
 	uint64_t	ub_magic;	/* UBERBLOCK_MAGIC		*/
 	uint64_t	ub_version;	/* SPA_VERSION			*/
 	uint64_t	ub_txg;		/* txg of last sync		*/
 	uint64_t	ub_guid_sum;	/* sum of all vdev guids	*/
 	uint64_t	ub_timestamp;	/* UTC time of last sync	*/
 	blkptr_t	ub_rootbp;	/* MOS objset_phys_t		*/
 
 	/* highest SPA_VERSION supported by software that wrote this txg */
 	uint64_t	ub_software_version;
 
 	/* Maybe missing in uberblocks we read, but always written */
 	uint64_t	ub_mmp_magic;	/* MMP_MAGIC			*/
 	/*
 	 * If ub_mmp_delay == 0 and ub_mmp_magic is valid, MMP is off.
 	 * Otherwise, nanosec since last MMP write.
 	 */
 	uint64_t	ub_mmp_delay;
 
 	/*
 	 * The ub_mmp_config contains the multihost write interval, multihost
 	 * fail intervals, sequence number for sub-second granularity, and
 	 * valid bit mask.  This layout is as follows:
 	 *
 	 *   64      56      48      40      32      24      16      8       0
 	 *   +-------+-------+-------+-------+-------+-------+-------+-------+
 	 * 0 | Fail Intervals|      Seq      |   Write Interval (ms) | VALID |
 	 *   +-------+-------+-------+-------+-------+-------+-------+-------+
 	 *
 	 * This allows a write_interval of (2^24/1000)s, over 4.5 hours
 	 *
 	 * VALID Bits:
 	 * - 0x01 - Write Interval (ms)
 	 * - 0x02 - Sequence number exists
 	 * - 0x04 - Fail Intervals
 	 * - 0xf8 - Reserved
 	 */
 	uint64_t	ub_mmp_config;
 
 	/*
 	 * ub_checkpoint_txg indicates two things about the current uberblock:
 	 *
 	 * 1] If it is not zero then this uberblock is a checkpoint. If it is
 	 *    zero, then this uberblock is not a checkpoint.
 	 *
 	 * 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is
 	 *    the ub_txg that the uberblock had at the time we moved it to
 	 *    the MOS config.
 	 *
 	 * The field is set when we checkpoint the uberblock and continues to
 	 * hold that value even after we've rewound (unlike the ub_txg that
 	 * is reset to a higher value).
 	 *
 	 * Besides checks used to determine whether we are reopening the
 	 * pool from a checkpointed uberblock [see spa_ld_select_uberblock()],
 	 * the value of the field is used to determine which ZIL blocks have
 	 * been allocated according to the ms_sm when we are rewinding to a
-	 * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then
+	 * checkpoint. Specifically, if logical birth > ub_checkpoint_txg,then
 	 * the ZIL block is not allocated [see uses of spa_min_claim_txg()].
 	 */
 	uint64_t	ub_checkpoint_txg;
 
 	uint64_t	ub_raidz_reflow_info;
 };
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_UBERBLOCK_IMPL_H */
diff --git a/lib/libzdb/libzdb.c b/lib/libzdb/libzdb.c
index 9989fa1eb80f..12144dc65e75 100644
--- a/lib/libzdb/libzdb.c
+++ b/lib/libzdb/libzdb.c
@@ -1,102 +1,102 @@
 #include <stdio.h>
 #include <unistd.h>
 #include <stdlib.h>
 #include <ctype.h>
 #include <getopt.h>
 #include <openssl/evp.h>
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_sa.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab_impl.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_bookmark.h>
 #include <sys/dbuf.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
 #include <sys/stat.h>
 #include <sys/resource.h>
 #include <sys/dmu_send.h>
 #include <sys/dmu_traverse.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/zfs_fuid.h>
 #include <sys/arc.h>
 #include <sys/arc_impl.h>
 #include <sys/ddt.h>
 #include <sys/zfeature.h>
 #include <sys/abd.h>
 #include <sys/blkptr.h>
 #include <sys/dsl_crypt.h>
 #include <sys/dsl_scan.h>
 #include <sys/btree.h>
 #include <sys/brt.h>
 #include <sys/brt_impl.h>
 #include <zfs_comutil.h>
 #include <sys/zstd/zstd.h>
 
 #include <libnvpair.h>
 #include <libzutil.h>
 
 #include <libzdb.h>
 
 const char *
 zdb_ot_name(dmu_object_type_t type)
 {
 	if (type < DMU_OT_NUMTYPES)
 		return (dmu_ot[type].ot_name);
 	else if ((type & DMU_OT_NEWTYPE) &&
 	    ((type & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS))
 		return (dmu_ot_byteswap[type & DMU_OT_BYTESWAP_MASK].ob_name);
 	else
 		return ("UNKNOWN");
 }
 
 int
 livelist_compare(const void *larg, const void *rarg)
 {
 	const blkptr_t *l = larg;
 	const blkptr_t *r = rarg;
 
 	/* Sort them according to dva[0] */
 	uint64_t l_dva0_vdev, r_dva0_vdev;
 	l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
 	r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);
 	if (l_dva0_vdev < r_dva0_vdev)
 		return (-1);
 	else if (l_dva0_vdev > r_dva0_vdev)
 		return (+1);
 
 	/* if vdevs are equal, sort by offsets. */
 	uint64_t l_dva0_offset;
 	uint64_t r_dva0_offset;
 	l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
 	r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
 	if (l_dva0_offset < r_dva0_offset) {
 		return (-1);
 	} else if (l_dva0_offset > r_dva0_offset) {
 		return (+1);
 	}
 
 	/*
 	 * Since we're storing blkptrs without cancelling FREE/ALLOC pairs,
 	 * it's possible the offsets are equal. In that case, sort by txg
 	 */
-	if (l->blk_birth < r->blk_birth) {
+	if (BP_GET_LOGICAL_BIRTH(l) < BP_GET_LOGICAL_BIRTH(r)) {
 		return (-1);
-	} else if (l->blk_birth > r->blk_birth) {
+	} else if (BP_GET_LOGICAL_BIRTH(l) > BP_GET_LOGICAL_BIRTH(r)) {
 		return (+1);
 	}
 	return (0);
 }
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 3bcffb3c7ede..b1bcac6c44bc 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -1,10762 +1,10762 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2018, Joyent, Inc.
  * Copyright (c) 2011, 2020, Delphix. All rights reserved.
  * Copyright (c) 2014, Saso Kiselkov. All rights reserved.
  * Copyright (c) 2017, Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
  * Copyright (c) 2020, George Amanakis. All rights reserved.
  * Copyright (c) 2019, Klara Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2020, The FreeBSD Foundation [1]
  *
  * [1] Portions of this software were developed by Allan Jude
  *     under sponsorship from the FreeBSD Foundation.
  */
 
 /*
  * DVA-based Adjustable Replacement Cache
  *
  * While much of the theory of operation used here is
  * based on the self-tuning, low overhead replacement cache
  * presented by Megiddo and Modha at FAST 2003, there are some
  * significant differences:
  *
  * 1. The Megiddo and Modha model assumes any page is evictable.
  * Pages in its cache cannot be "locked" into memory.  This makes
  * the eviction algorithm simple: evict the last page in the list.
  * This also make the performance characteristics easy to reason
  * about.  Our cache is not so simple.  At any given moment, some
  * subset of the blocks in the cache are un-evictable because we
  * have handed out a reference to them.  Blocks are only evictable
  * when there are no external references active.  This makes
  * eviction far more problematic:  we choose to evict the evictable
  * blocks that are the "lowest" in the list.
  *
  * There are times when it is not possible to evict the requested
  * space.  In these circumstances we are unable to adjust the cache
  * size.  To prevent the cache growing unbounded at these times we
  * implement a "cache throttle" that slows the flow of new data
  * into the cache until we can make space available.
  *
  * 2. The Megiddo and Modha model assumes a fixed cache size.
  * Pages are evicted when the cache is full and there is a cache
  * miss.  Our model has a variable sized cache.  It grows with
  * high use, but also tries to react to memory pressure from the
  * operating system: decreasing its size when system memory is
  * tight.
  *
  * 3. The Megiddo and Modha model assumes a fixed page size. All
  * elements of the cache are therefore exactly the same size.  So
  * when adjusting the cache size following a cache miss, its simply
  * a matter of choosing a single page to evict.  In our model, we
  * have variable sized cache blocks (ranging from 512 bytes to
  * 128K bytes).  We therefore choose a set of blocks to evict to make
  * space for a cache miss that approximates as closely as possible
  * the space used by the new block.
  *
  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  * by N. Megiddo & D. Modha, FAST 2003
  */
 
 /*
  * The locking model:
  *
  * A new reference to a cache buffer can be obtained in two
  * ways: 1) via a hash table lookup using the DVA as a key,
  * or 2) via one of the ARC lists.  The arc_read() interface
  * uses method 1, while the internal ARC algorithms for
  * adjusting the cache use method 2.  We therefore provide two
  * types of locks: 1) the hash table lock array, and 2) the
  * ARC list locks.
  *
  * Buffers do not have their own mutexes, rather they rely on the
  * hash table mutexes for the bulk of their protection (i.e. most
  * fields in the arc_buf_hdr_t are protected by these mutexes).
  *
  * buf_hash_find() returns the appropriate mutex (held) when it
  * locates the requested buffer in the hash table.  It returns
  * NULL for the mutex if the buffer was not in the table.
  *
  * buf_hash_remove() expects the appropriate hash mutex to be
  * already held before it is invoked.
  *
  * Each ARC state also has a mutex which is used to protect the
  * buffer list associated with the state.  When attempting to
  * obtain a hash table lock while holding an ARC list lock you
  * must use: mutex_tryenter() to avoid deadlock.  Also note that
  * the active state mutex must be held before the ghost state mutex.
  *
  * It as also possible to register a callback which is run when the
  * metadata limit is reached and no buffers can be safely evicted.  In
  * this case the arc user should drop a reference on some arc buffers so
  * they can be reclaimed.  For example, when using the ZPL each dentry
  * holds a references on a znode.  These dentries must be pruned before
  * the arc buffer holding the znode can be safely evicted.
  *
  * Note that the majority of the performance stats are manipulated
  * with atomic operations.
  *
  * The L2ARC uses the l2ad_mtx on each vdev for the following:
  *
  *	- L2ARC buflist creation
  *	- L2ARC buflist eviction
  *	- L2ARC write completion, which walks L2ARC buflists
  *	- ARC header destruction, as it removes from L2ARC buflists
  *	- ARC header release, as it removes from L2ARC buflists
  */
 
 /*
  * ARC operation:
  *
  * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
  * This structure can point either to a block that is still in the cache or to
  * one that is only accessible in an L2 ARC device, or it can provide
  * information about a block that was recently evicted. If a block is
  * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
  * information to retrieve it from the L2ARC device. This information is
  * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
  * that is in this state cannot access the data directly.
  *
  * Blocks that are actively being referenced or have not been evicted
  * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
  * the arc_buf_hdr_t that will point to the data block in memory. A block can
  * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
  * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
  * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
  *
  * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
  * ability to store the physical data (b_pabd) associated with the DVA of the
  * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
  * it will match its on-disk compression characteristics. This behavior can be
  * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
  * compressed ARC functionality is disabled, the b_pabd will point to an
  * uncompressed version of the on-disk data.
  *
  * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
  * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
  * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
  * consumer. The ARC will provide references to this data and will keep it
  * cached until it is no longer in use. The ARC caches only the L1ARC's physical
  * data block and will evict any arc_buf_t that is no longer referenced. The
  * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
  * "overhead_size" kstat.
  *
  * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
  * compressed form. The typical case is that consumers will want uncompressed
  * data, and when that happens a new data buffer is allocated where the data is
  * decompressed for them to use. Currently the only consumer who wants
  * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
  * exists on disk. When this happens, the arc_buf_t's data buffer is shared
  * with the arc_buf_hdr_t.
  *
  * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
  * first one is owned by a compressed send consumer (and therefore references
  * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
  * used by any other consumer (and has its own uncompressed copy of the data
  * buffer).
  *
  *   arc_buf_hdr_t
  *   +-----------+
  *   | fields    |
  *   | common to |
  *   | L1- and   |
  *   | L2ARC     |
  *   +-----------+
  *   | l2arc_buf_hdr_t
  *   |           |
  *   +-----------+
  *   | l1arc_buf_hdr_t
  *   |           |              arc_buf_t
  *   | b_buf     +------------>+-----------+      arc_buf_t
  *   | b_pabd    +-+           |b_next     +---->+-----------+
  *   +-----------+ |           |-----------|     |b_next     +-->NULL
  *                 |           |b_comp = T |     +-----------+
  *                 |           |b_data     +-+   |b_comp = F |
  *                 |           +-----------+ |   |b_data     +-+
  *                 +->+------+               |   +-----------+ |
  *        compressed  |      |               |                 |
  *           data     |      |<--------------+                 | uncompressed
  *                    +------+          compressed,            |     data
  *                                        shared               +-->+------+
  *                                         data                    |      |
  *                                                                 |      |
  *                                                                 +------+
  *
  * When a consumer reads a block, the ARC must first look to see if the
  * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
  * arc_buf_t and either copies uncompressed data into a new data buffer from an
  * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
  * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
  * hdr is compressed and the desired compression characteristics of the
  * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
  * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
  * the last buffer in the hdr's b_buf list, however a shared compressed buf can
  * be anywhere in the hdr's list.
  *
  * The diagram below shows an example of an uncompressed ARC hdr that is
  * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
  * the last element in the buf list):
  *
  *                arc_buf_hdr_t
  *                +-----------+
  *                |           |
  *                |           |
  *                |           |
  *                +-----------+
  * l2arc_buf_hdr_t|           |
  *                |           |
  *                +-----------+
  * l1arc_buf_hdr_t|           |
  *                |           |                 arc_buf_t    (shared)
  *                |    b_buf  +------------>+---------+      arc_buf_t
  *                |           |             |b_next   +---->+---------+
  *                |  b_pabd   +-+           |---------|     |b_next   +-->NULL
  *                +-----------+ |           |         |     +---------+
  *                              |           |b_data   +-+   |         |
  *                              |           +---------+ |   |b_data   +-+
  *                              +->+------+             |   +---------+ |
  *                                 |      |             |               |
  *                   uncompressed  |      |             |               |
  *                        data     +------+             |               |
  *                                    ^                 +->+------+     |
  *                                    |       uncompressed |      |     |
  *                                    |           data     |      |     |
  *                                    |                    +------+     |
  *                                    +---------------------------------+
  *
  * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
  * since the physical block is about to be rewritten. The new data contents
  * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
  * it may compress the data before writing it to disk. The ARC will be called
  * with the transformed data and will memcpy the transformed on-disk block into
  * a newly allocated b_pabd. Writes are always done into buffers which have
  * either been loaned (and hence are new and don't have other readers) or
  * buffers which have been released (and hence have their own hdr, if there
  * were originally other readers of the buf's original hdr). This ensures that
  * the ARC only needs to update a single buf and its hdr after a write occurs.
  *
  * When the L2ARC is in use, it will also take advantage of the b_pabd. The
  * L2ARC will always write the contents of b_pabd to the L2ARC. This means
  * that when compressed ARC is enabled that the L2ARC blocks are identical
  * to the on-disk block in the main data pool. This provides a significant
  * advantage since the ARC can leverage the bp's checksum when reading from the
  * L2ARC to determine if the contents are valid. However, if the compressed
  * ARC is disabled, then the L2ARC's block must be transformed to look
  * like the physical block in the main data pool before comparing the
  * checksum and determining its validity.
  *
  * The L1ARC has a slightly different system for storing encrypted data.
  * Raw (encrypted + possibly compressed) data has a few subtle differences from
  * data that is just compressed. The biggest difference is that it is not
  * possible to decrypt encrypted data (or vice-versa) if the keys aren't loaded.
  * The other difference is that encryption cannot be treated as a suggestion.
  * If a caller would prefer compressed data, but they actually wind up with
  * uncompressed data the worst thing that could happen is there might be a
  * performance hit. If the caller requests encrypted data, however, we must be
  * sure they actually get it or else secret information could be leaked. Raw
  * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore,
  * may have both an encrypted version and a decrypted version of its data at
  * once. When a caller needs a raw arc_buf_t, it is allocated and the data is
  * copied out of this header. To avoid complications with b_pabd, raw buffers
  * cannot be shared.
  */
 
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/spa_impl.h>
 #include <sys/zio_compress.h>
 #include <sys/zio_checksum.h>
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/zfs_refcount.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/dsl_pool.h>
 #include <sys/multilist.h>
 #include <sys/abd.h>
 #include <sys/zil.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/callb.h>
 #include <sys/kstat.h>
 #include <sys/zthr.h>
 #include <zfs_fletcher.h>
 #include <sys/arc_impl.h>
 #include <sys/trace_zfs.h>
 #include <sys/aggsum.h>
 #include <sys/wmsum.h>
 #include <cityhash.h>
 #include <sys/vdev_trim.h>
 #include <sys/zfs_racct.h>
 #include <sys/zstd/zstd.h>
 
 #ifndef _KERNEL
 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 boolean_t arc_watch = B_FALSE;
 #endif
 
 /*
  * This thread's job is to keep enough free memory in the system, by
  * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves
  * arc_available_memory().
  */
 static zthr_t *arc_reap_zthr;
 
 /*
  * This thread's job is to keep arc_size under arc_c, by calling
  * arc_evict(), which improves arc_is_overflowing().
  */
 static zthr_t *arc_evict_zthr;
 static arc_buf_hdr_t **arc_state_evict_markers;
 static int arc_state_evict_marker_count;
 
 static kmutex_t arc_evict_lock;
 static boolean_t arc_evict_needed = B_FALSE;
 static clock_t arc_last_uncached_flush;
 
 /*
  * Count of bytes evicted since boot.
  */
 static uint64_t arc_evict_count;
 
 /*
  * List of arc_evict_waiter_t's, representing threads waiting for the
  * arc_evict_count to reach specific values.
  */
 static list_t arc_evict_waiters;
 
 /*
  * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of
  * the requested amount of data to be evicted.  For example, by default for
  * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation.
  * Since this is above 100%, it ensures that progress is made towards getting
  * arc_size under arc_c.  Since this is finite, it ensures that allocations
  * can still happen, even during the potentially long time that arc_size is
  * more than arc_c.
  */
 static uint_t zfs_arc_eviction_pct = 200;
 
 /*
  * The number of headers to evict in arc_evict_state_impl() before
  * dropping the sublist lock and evicting from another sublist. A lower
  * value means we're more likely to evict the "correct" header (i.e. the
  * oldest header in the arc state), but comes with higher overhead
  * (i.e. more invocations of arc_evict_state_impl()).
  */
 static uint_t zfs_arc_evict_batch_limit = 10;
 
 /* number of seconds before growing cache again */
 uint_t arc_grow_retry = 5;
 
 /*
  * Minimum time between calls to arc_kmem_reap_soon().
  */
 static const int arc_kmem_cache_reap_retry_ms = 1000;
 
 /* shift of arc_c for calculating overflow limit in arc_get_data_impl */
 static int zfs_arc_overflow_shift = 8;
 
 /* log2(fraction of arc to reclaim) */
 uint_t arc_shrink_shift = 7;
 
 /* percent of pagecache to reclaim arc to */
 #ifdef _KERNEL
 uint_t zfs_arc_pc_percent = 0;
 #endif
 
 /*
  * log2(fraction of ARC which must be free to allow growing).
  * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
  * when reading a new block into the ARC, we will evict an equal-sized block
  * from the ARC.
  *
  * This must be less than arc_shrink_shift, so that when we shrink the ARC,
  * we will still not allow it to grow.
  */
 uint_t		arc_no_grow_shift = 5;
 
 
 /*
  * minimum lifespan of a prefetch block in clock ticks
  * (initialized in arc_init())
  */
 static uint_t		arc_min_prefetch_ms;
 static uint_t		arc_min_prescient_prefetch_ms;
 
 /*
  * If this percent of memory is free, don't throttle.
  */
 uint_t arc_lotsfree_percent = 10;
 
 /*
  * The arc has filled available memory and has now warmed up.
  */
 boolean_t arc_warm;
 
 /*
  * These tunables are for performance analysis.
  */
 uint64_t zfs_arc_max = 0;
 uint64_t zfs_arc_min = 0;
 static uint64_t zfs_arc_dnode_limit = 0;
 static uint_t zfs_arc_dnode_reduce_percent = 10;
 static uint_t zfs_arc_grow_retry = 0;
 static uint_t zfs_arc_shrink_shift = 0;
 uint_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
 
 /*
  * ARC dirty data constraints for arc_tempreserve_space() throttle:
  * * total dirty data limit
  * * anon block dirty limit
  * * each pool's anon allowance
  */
 static const unsigned long zfs_arc_dirty_limit_percent = 50;
 static const unsigned long zfs_arc_anon_limit_percent = 25;
 static const unsigned long zfs_arc_pool_dirty_percent = 20;
 
 /*
  * Enable or disable compressed arc buffers.
  */
 int zfs_compressed_arc_enabled = B_TRUE;
 
 /*
  * Balance between metadata and data on ghost hits.  Values above 100
  * increase metadata caching by proportionally reducing effect of ghost
  * data hits on target data/metadata rate.
  */
 static uint_t zfs_arc_meta_balance = 500;
 
 /*
  * Percentage that can be consumed by dnodes of ARC meta buffers.
  */
 static uint_t zfs_arc_dnode_limit_percent = 10;
 
 /*
  * These tunables are Linux-specific
  */
 static uint64_t zfs_arc_sys_free = 0;
 static uint_t zfs_arc_min_prefetch_ms = 0;
 static uint_t zfs_arc_min_prescient_prefetch_ms = 0;
 static uint_t zfs_arc_lotsfree_percent = 10;
 
 /*
  * Number of arc_prune threads
  */
 static int zfs_arc_prune_task_threads = 1;
 
 /* The 7 states: */
 arc_state_t ARC_anon;
 arc_state_t ARC_mru;
 arc_state_t ARC_mru_ghost;
 arc_state_t ARC_mfu;
 arc_state_t ARC_mfu_ghost;
 arc_state_t ARC_l2c_only;
 arc_state_t ARC_uncached;
 
 arc_stats_t arc_stats = {
 	{ "hits",			KSTAT_DATA_UINT64 },
 	{ "iohits",			KSTAT_DATA_UINT64 },
 	{ "misses",			KSTAT_DATA_UINT64 },
 	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
 	{ "demand_data_iohits",		KSTAT_DATA_UINT64 },
 	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
 	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
 	{ "demand_metadata_iohits",	KSTAT_DATA_UINT64 },
 	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
 	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
 	{ "prefetch_data_iohits",	KSTAT_DATA_UINT64 },
 	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
 	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
 	{ "prefetch_metadata_iohits",	KSTAT_DATA_UINT64 },
 	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
 	{ "mru_hits",			KSTAT_DATA_UINT64 },
 	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
 	{ "mfu_hits",			KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
 	{ "uncached_hits",		KSTAT_DATA_UINT64 },
 	{ "deleted",			KSTAT_DATA_UINT64 },
 	{ "mutex_miss",			KSTAT_DATA_UINT64 },
 	{ "access_skip",		KSTAT_DATA_UINT64 },
 	{ "evict_skip",			KSTAT_DATA_UINT64 },
 	{ "evict_not_enough",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_eligible_mfu",	KSTAT_DATA_UINT64 },
 	{ "evict_l2_eligible_mru",	KSTAT_DATA_UINT64 },
 	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
 	{ "evict_l2_skip",		KSTAT_DATA_UINT64 },
 	{ "hash_elements",		KSTAT_DATA_UINT64 },
 	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
 	{ "hash_collisions",		KSTAT_DATA_UINT64 },
 	{ "hash_chains",		KSTAT_DATA_UINT64 },
 	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
 	{ "meta",			KSTAT_DATA_UINT64 },
 	{ "pd",				KSTAT_DATA_UINT64 },
 	{ "pm",				KSTAT_DATA_UINT64 },
 	{ "c",				KSTAT_DATA_UINT64 },
 	{ "c_min",			KSTAT_DATA_UINT64 },
 	{ "c_max",			KSTAT_DATA_UINT64 },
 	{ "size",			KSTAT_DATA_UINT64 },
 	{ "compressed_size",		KSTAT_DATA_UINT64 },
 	{ "uncompressed_size",		KSTAT_DATA_UINT64 },
 	{ "overhead_size",		KSTAT_DATA_UINT64 },
 	{ "hdr_size",			KSTAT_DATA_UINT64 },
 	{ "data_size",			KSTAT_DATA_UINT64 },
 	{ "metadata_size",		KSTAT_DATA_UINT64 },
 	{ "dbuf_size",			KSTAT_DATA_UINT64 },
 	{ "dnode_size",			KSTAT_DATA_UINT64 },
 	{ "bonus_size",			KSTAT_DATA_UINT64 },
 #if defined(COMPAT_FREEBSD11)
 	{ "other_size",			KSTAT_DATA_UINT64 },
 #endif
 	{ "anon_size",			KSTAT_DATA_UINT64 },
 	{ "anon_data",			KSTAT_DATA_UINT64 },
 	{ "anon_metadata",		KSTAT_DATA_UINT64 },
 	{ "anon_evictable_data",	KSTAT_DATA_UINT64 },
 	{ "anon_evictable_metadata",	KSTAT_DATA_UINT64 },
 	{ "mru_size",			KSTAT_DATA_UINT64 },
 	{ "mru_data",			KSTAT_DATA_UINT64 },
 	{ "mru_metadata",		KSTAT_DATA_UINT64 },
 	{ "mru_evictable_data",		KSTAT_DATA_UINT64 },
 	{ "mru_evictable_metadata",	KSTAT_DATA_UINT64 },
 	{ "mru_ghost_size",		KSTAT_DATA_UINT64 },
 	{ "mru_ghost_data",		KSTAT_DATA_UINT64 },
 	{ "mru_ghost_metadata",		KSTAT_DATA_UINT64 },
 	{ "mru_ghost_evictable_data",	KSTAT_DATA_UINT64 },
 	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
 	{ "mfu_size",			KSTAT_DATA_UINT64 },
 	{ "mfu_data",			KSTAT_DATA_UINT64 },
 	{ "mfu_metadata",		KSTAT_DATA_UINT64 },
 	{ "mfu_evictable_data",		KSTAT_DATA_UINT64 },
 	{ "mfu_evictable_metadata",	KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_size",		KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_data",		KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_metadata",		KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_evictable_data",	KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
 	{ "uncached_size",		KSTAT_DATA_UINT64 },
 	{ "uncached_data",		KSTAT_DATA_UINT64 },
 	{ "uncached_metadata",		KSTAT_DATA_UINT64 },
 	{ "uncached_evictable_data",	KSTAT_DATA_UINT64 },
 	{ "uncached_evictable_metadata", KSTAT_DATA_UINT64 },
 	{ "l2_hits",			KSTAT_DATA_UINT64 },
 	{ "l2_misses",			KSTAT_DATA_UINT64 },
 	{ "l2_prefetch_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_mru_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_mfu_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_bufc_data_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_bufc_metadata_asize",	KSTAT_DATA_UINT64 },
 	{ "l2_feeds",			KSTAT_DATA_UINT64 },
 	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
 	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
 	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_lock_retry",	KSTAT_DATA_UINT64 },
 	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
 	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
 	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
 	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
 	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
 	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
 	{ "l2_io_error",		KSTAT_DATA_UINT64 },
 	{ "l2_size",			KSTAT_DATA_UINT64 },
 	{ "l2_asize",			KSTAT_DATA_UINT64 },
 	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
 	{ "l2_log_blk_writes",		KSTAT_DATA_UINT64 },
 	{ "l2_log_blk_avg_asize",	KSTAT_DATA_UINT64 },
 	{ "l2_log_blk_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_log_blk_count",		KSTAT_DATA_UINT64 },
 	{ "l2_data_to_meta_ratio",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_success",		KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_unsupported",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_io_errors",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_dh_errors",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_cksum_lb_errors",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_lowmem",		KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_size",		KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_bufs",		KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_bufs_precached",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_log_blks",	KSTAT_DATA_UINT64 },
 	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
 	{ "memory_direct_count",	KSTAT_DATA_UINT64 },
 	{ "memory_indirect_count",	KSTAT_DATA_UINT64 },
 	{ "memory_all_bytes",		KSTAT_DATA_UINT64 },
 	{ "memory_free_bytes",		KSTAT_DATA_UINT64 },
 	{ "memory_available_bytes",	KSTAT_DATA_INT64 },
 	{ "arc_no_grow",		KSTAT_DATA_UINT64 },
 	{ "arc_tempreserve",		KSTAT_DATA_UINT64 },
 	{ "arc_loaned_bytes",		KSTAT_DATA_UINT64 },
 	{ "arc_prune",			KSTAT_DATA_UINT64 },
 	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
 	{ "arc_dnode_limit",		KSTAT_DATA_UINT64 },
 	{ "async_upgrade_sync",		KSTAT_DATA_UINT64 },
 	{ "predictive_prefetch", KSTAT_DATA_UINT64 },
 	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
 	{ "demand_iohit_predictive_prefetch", KSTAT_DATA_UINT64 },
 	{ "prescient_prefetch", KSTAT_DATA_UINT64 },
 	{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
 	{ "demand_iohit_prescient_prefetch", KSTAT_DATA_UINT64 },
 	{ "arc_need_free",		KSTAT_DATA_UINT64 },
 	{ "arc_sys_free",		KSTAT_DATA_UINT64 },
 	{ "arc_raw_size",		KSTAT_DATA_UINT64 },
 	{ "cached_only_in_progress",	KSTAT_DATA_UINT64 },
 	{ "abd_chunk_waste_size",	KSTAT_DATA_UINT64 },
 };
 
 arc_sums_t arc_sums;
 
 #define	ARCSTAT_MAX(stat, val) {					\
 	uint64_t m;							\
 	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
 	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
 		continue;						\
 }
 
 /*
  * We define a macro to allow ARC hits/misses to be easily broken down by
  * two separate conditions, giving a total of four different subtypes for
  * each of hits and misses (so eight statistics total).
  */
 #define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 	if (cond1) {							\
 		if (cond2) {						\
 			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 		} else {						\
 			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 		}							\
 	} else {							\
 		if (cond2) {						\
 			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 		} else {						\
 			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 		}							\
 	}
 
 /*
  * This macro allows us to use kstats as floating averages. Each time we
  * update this kstat, we first factor it and the update value by
  * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall
  * average. This macro assumes that integer loads and stores are atomic, but
  * is not safe for multiple writers updating the kstat in parallel (only the
  * last writer's update will remain).
  */
 #define	ARCSTAT_F_AVG_FACTOR	3
 #define	ARCSTAT_F_AVG(stat, value) \
 	do { \
 		uint64_t x = ARCSTAT(stat); \
 		x = x - x / ARCSTAT_F_AVG_FACTOR + \
 		    (value) / ARCSTAT_F_AVG_FACTOR; \
 		ARCSTAT(stat) = x; \
 	} while (0)
 
 static kstat_t			*arc_ksp;
 
 /*
  * There are several ARC variables that are critical to export as kstats --
  * but we don't want to have to grovel around in the kstat whenever we wish to
  * manipulate them.  For these variables, we therefore define them to be in
  * terms of the statistic variable.  This assures that we are not introducing
  * the possibility of inconsistency by having shadow copies of the variables,
  * while still allowing the code to be readable.
  */
 #define	arc_tempreserve	ARCSTAT(arcstat_tempreserve)
 #define	arc_loaned_bytes	ARCSTAT(arcstat_loaned_bytes)
 #define	arc_dnode_limit	ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */
 #define	arc_need_free	ARCSTAT(arcstat_need_free) /* waiting to be evicted */
 
 hrtime_t arc_growtime;
 list_t arc_prune_list;
 kmutex_t arc_prune_mtx;
 taskq_t *arc_prune_taskq;
 
 #define	GHOST_STATE(state)	\
 	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
 	(state) == arc_l2c_only)
 
 #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
 #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
 #define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
 #define	HDR_PRESCIENT_PREFETCH(hdr)	\
 	((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
 #define	HDR_COMPRESSION_ENABLED(hdr)	\
 	((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
 
 #define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
 #define	HDR_UNCACHED(hdr)	((hdr)->b_flags & ARC_FLAG_UNCACHED)
 #define	HDR_L2_READING(hdr)	\
 	(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
 	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
 #define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
 #define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
 #define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
 #define	HDR_PROTECTED(hdr)	((hdr)->b_flags & ARC_FLAG_PROTECTED)
 #define	HDR_NOAUTH(hdr)		((hdr)->b_flags & ARC_FLAG_NOAUTH)
 #define	HDR_SHARED_DATA(hdr)	((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
 
 #define	HDR_ISTYPE_METADATA(hdr)	\
 	((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
 #define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))
 
 #define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
 #define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
 #define	HDR_HAS_RABD(hdr)	\
 	(HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) &&	\
 	(hdr)->b_crypt_hdr.b_rabd != NULL)
 #define	HDR_ENCRYPTED(hdr)	\
 	(HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
 #define	HDR_AUTHENTICATED(hdr)	\
 	(HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
 
 /* For storing compression mode in b_flags */
 #define	HDR_COMPRESS_OFFSET	(highbit64(ARC_FLAG_COMPRESS_0) - 1)
 
 #define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET((hdr)->b_flags, \
 	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
 #define	HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
 	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
 
 #define	ARC_BUF_LAST(buf)	((buf)->b_next == NULL)
 #define	ARC_BUF_SHARED(buf)	((buf)->b_flags & ARC_BUF_FLAG_SHARED)
 #define	ARC_BUF_COMPRESSED(buf)	((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
 #define	ARC_BUF_ENCRYPTED(buf)	((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED)
 
 /*
  * Other sizes
  */
 
 #define	HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 #define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
 
 /*
  * Hash table routines
  */
 
 #define	BUF_LOCKS 2048
 typedef struct buf_hash_table {
 	uint64_t ht_mask;
 	arc_buf_hdr_t **ht_table;
 	kmutex_t ht_locks[BUF_LOCKS] ____cacheline_aligned;
 } buf_hash_table_t;
 
 static buf_hash_table_t buf_hash_table;
 
 #define	BUF_HASH_INDEX(spa, dva, birth) \
 	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 #define	BUF_HASH_LOCK(idx)	(&buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 #define	HDR_LOCK(hdr) \
 	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 
 uint64_t zfs_crc64_table[256];
 
 /*
  * Level 2 ARC
  */
 
 #define	L2ARC_WRITE_SIZE	(32 * 1024 * 1024)	/* initial write max */
 #define	L2ARC_HEADROOM		8			/* num of writes */
 
 /*
  * If we discover during ARC scan any buffers to be compressed, we boost
  * our headroom for the next scanning cycle by this percentage multiple.
  */
 #define	L2ARC_HEADROOM_BOOST	200
 #define	L2ARC_FEED_SECS		1		/* caching interval secs */
 #define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
 
 /*
  * We can feed L2ARC from two states of ARC buffers, mru and mfu,
  * and each of the state has two types: data and metadata.
  */
 #define	L2ARC_FEED_TYPES	4
 
 /* L2ARC Performance Tunables */
 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* def max write size */
 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra warmup write */
 uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* # of dev writes */
 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval msecs */
 int l2arc_noprefetch = B_TRUE;			/* don't cache prefetch bufs */
 int l2arc_feed_again = B_TRUE;			/* turbo warmup */
 int l2arc_norw = B_FALSE;			/* no reads during writes */
 static uint_t l2arc_meta_percent = 33;	/* limit on headers size */
 
 /*
  * L2ARC Internals
  */
 static list_t L2ARC_dev_list;			/* device list */
 static list_t *l2arc_dev_list;			/* device list pointer */
 static kmutex_t l2arc_dev_mtx;			/* device list mutex */
 static l2arc_dev_t *l2arc_dev_last;		/* last device used */
 static list_t L2ARC_free_on_write;		/* free after write buf list */
 static list_t *l2arc_free_on_write;		/* free after write list ptr */
 static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
 static uint64_t l2arc_ndev;			/* number of devices */
 
 typedef struct l2arc_read_callback {
 	arc_buf_hdr_t		*l2rcb_hdr;		/* read header */
 	blkptr_t		l2rcb_bp;		/* original blkptr */
 	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
 	int			l2rcb_flags;		/* original flags */
 	abd_t			*l2rcb_abd;		/* temporary buffer */
 } l2arc_read_callback_t;
 
 typedef struct l2arc_data_free {
 	/* protected by l2arc_free_on_write_mtx */
 	abd_t		*l2df_abd;
 	size_t		l2df_size;
 	arc_buf_contents_t l2df_type;
 	list_node_t	l2df_list_node;
 } l2arc_data_free_t;
 
 typedef enum arc_fill_flags {
 	ARC_FILL_LOCKED		= 1 << 0, /* hdr lock is held */
 	ARC_FILL_COMPRESSED	= 1 << 1, /* fill with compressed data */
 	ARC_FILL_ENCRYPTED	= 1 << 2, /* fill with encrypted data */
 	ARC_FILL_NOAUTH		= 1 << 3, /* don't attempt to authenticate */
 	ARC_FILL_IN_PLACE	= 1 << 4  /* fill in place (special case) */
 } arc_fill_flags_t;
 
 typedef enum arc_ovf_level {
 	ARC_OVF_NONE,			/* ARC within target size. */
 	ARC_OVF_SOME,			/* ARC is slightly overflowed. */
 	ARC_OVF_SEVERE			/* ARC is severely overflowed. */
 } arc_ovf_level_t;
 
 static kmutex_t l2arc_feed_thr_lock;
 static kcondvar_t l2arc_feed_thr_cv;
 static uint8_t l2arc_thread_exit;
 
 static kmutex_t l2arc_rebuild_thr_lock;
 static kcondvar_t l2arc_rebuild_thr_cv;
 
 enum arc_hdr_alloc_flags {
 	ARC_HDR_ALLOC_RDATA = 0x1,
 	ARC_HDR_USE_RESERVE = 0x4,
 	ARC_HDR_ALLOC_LINEAR = 0x8,
 };
 
 
 static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, const void *, int);
 static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, const void *);
 static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, const void *, int);
 static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, const void *);
 static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, const void *);
 static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size,
     const void *tag);
 static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t);
 static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int);
 static void arc_hdr_destroy(arc_buf_hdr_t *);
 static void arc_access(arc_buf_hdr_t *, arc_flags_t, boolean_t);
 static void arc_buf_watch(arc_buf_t *);
 static void arc_change_state(arc_state_t *, arc_buf_hdr_t *);
 
 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
 static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
 static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
 static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
 
 static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
 static void l2arc_read_done(zio_t *);
 static void l2arc_do_free_on_write(void);
 static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
     boolean_t state_only);
 
 static void arc_prune_async(uint64_t adjust);
 
 #define	l2arc_hdr_arcstats_increment(hdr) \
 	l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE)
 #define	l2arc_hdr_arcstats_decrement(hdr) \
 	l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE)
 #define	l2arc_hdr_arcstats_increment_state(hdr) \
 	l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE)
 #define	l2arc_hdr_arcstats_decrement_state(hdr) \
 	l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE)
 
 /*
  * l2arc_exclude_special : A zfs module parameter that controls whether buffers
  * 		present on special vdevs are eligibile for caching in L2ARC. If
  * 		set to 1, exclude dbufs on special vdevs from being cached to
  * 		L2ARC.
  */
 int l2arc_exclude_special = 0;
 
 /*
  * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU
  * 		metadata and data are cached from ARC into L2ARC.
  */
 static int l2arc_mfuonly = 0;
 
 /*
  * L2ARC TRIM
  * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of
  * 		the current write size (l2arc_write_max) we should TRIM if we
  * 		have filled the device. It is defined as a percentage of the
  * 		write size. If set to 100 we trim twice the space required to
  * 		accommodate upcoming writes. A minimum of 64MB will be trimmed.
  * 		It also enables TRIM of the whole L2ARC device upon creation or
  * 		addition to an existing pool or if the header of the device is
  * 		invalid upon importing a pool or onlining a cache device. The
  * 		default is 0, which disables TRIM on L2ARC altogether as it can
  * 		put significant stress on the underlying storage devices. This
  * 		will vary depending of how well the specific device handles
  * 		these commands.
  */
 static uint64_t l2arc_trim_ahead = 0;
 
 /*
  * Performance tuning of L2ARC persistence:
  *
  * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding
  * 		an L2ARC device (either at pool import or later) will attempt
  * 		to rebuild L2ARC buffer contents.
  * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls
  * 		whether log blocks are written to the L2ARC device. If the L2ARC
  * 		device is less than 1GB, the amount of data l2arc_evict()
  * 		evicts is significant compared to the amount of restored L2ARC
  * 		data. In this case do not write log blocks in L2ARC in order
  * 		not to waste space.
  */
 static int l2arc_rebuild_enabled = B_TRUE;
 static uint64_t l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024;
 
 /* L2ARC persistence rebuild control routines. */
 void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
 static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg);
 static int l2arc_rebuild(l2arc_dev_t *dev);
 
 /* L2ARC persistence read I/O routines. */
 static int l2arc_dev_hdr_read(l2arc_dev_t *dev);
 static int l2arc_log_blk_read(l2arc_dev_t *dev,
     const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp,
     l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
     zio_t *this_io, zio_t **next_io);
 static zio_t *l2arc_log_blk_fetch(vdev_t *vd,
     const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb);
 static void l2arc_log_blk_fetch_abort(zio_t *zio);
 
 /* L2ARC persistence block restoration routines. */
 static void l2arc_log_blk_restore(l2arc_dev_t *dev,
     const l2arc_log_blk_phys_t *lb, uint64_t lb_asize);
 static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
     l2arc_dev_t *dev);
 
 /* L2ARC persistence write I/O routines. */
 static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
     l2arc_write_callback_t *cb);
 
 /* L2ARC persistence auxiliary routines. */
 boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
     const l2arc_log_blkptr_t *lbp);
 static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
     const arc_buf_hdr_t *ab);
 boolean_t l2arc_range_check_overlap(uint64_t bottom,
     uint64_t top, uint64_t check);
 static void l2arc_blk_fetch_done(zio_t *zio);
 static inline uint64_t
     l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev);
 
 /*
  * We use Cityhash for this. It's fast, and has good hash properties without
  * requiring any large static buffers.
  */
 static uint64_t
 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 {
 	return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth));
 }
 
 #define	HDR_EMPTY(hdr)						\
 	((hdr)->b_dva.dva_word[0] == 0 &&			\
 	(hdr)->b_dva.dva_word[1] == 0)
 
 #define	HDR_EMPTY_OR_LOCKED(hdr)				\
 	(HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr)))
 
 #define	HDR_EQUAL(spa, dva, birth, hdr)				\
 	((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
 	((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
 	((hdr)->b_birth == birth) && ((hdr)->b_spa == spa)
 
 static void
 buf_discard_identity(arc_buf_hdr_t *hdr)
 {
 	hdr->b_dva.dva_word[0] = 0;
 	hdr->b_dva.dva_word[1] = 0;
 	hdr->b_birth = 0;
 }
 
 static arc_buf_hdr_t *
 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
 {
 	const dva_t *dva = BP_IDENTITY(bp);
-	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
+	uint64_t birth = BP_GET_BIRTH(bp);
 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 	arc_buf_hdr_t *hdr;
 
 	mutex_enter(hash_lock);
 	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
 	    hdr = hdr->b_hash_next) {
 		if (HDR_EQUAL(spa, dva, birth, hdr)) {
 			*lockp = hash_lock;
 			return (hdr);
 		}
 	}
 	mutex_exit(hash_lock);
 	*lockp = NULL;
 	return (NULL);
 }
 
 /*
  * Insert an entry into the hash table.  If there is already an element
  * equal to elem in the hash table, then the already existing element
  * will be returned and the new element will not be inserted.
  * Otherwise returns NULL.
  * If lockp == NULL, the caller is assumed to already hold the hash lock.
  */
 static arc_buf_hdr_t *
 buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
 {
 	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 	arc_buf_hdr_t *fhdr;
 	uint32_t i;
 
 	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
 	ASSERT(hdr->b_birth != 0);
 	ASSERT(!HDR_IN_HASH_TABLE(hdr));
 
 	if (lockp != NULL) {
 		*lockp = hash_lock;
 		mutex_enter(hash_lock);
 	} else {
 		ASSERT(MUTEX_HELD(hash_lock));
 	}
 
 	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
 	    fhdr = fhdr->b_hash_next, i++) {
 		if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
 			return (fhdr);
 	}
 
 	hdr->b_hash_next = buf_hash_table.ht_table[idx];
 	buf_hash_table.ht_table[idx] = hdr;
 	arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
 
 	/* collect some hash table performance data */
 	if (i > 0) {
 		ARCSTAT_BUMP(arcstat_hash_collisions);
 		if (i == 1)
 			ARCSTAT_BUMP(arcstat_hash_chains);
 
 		ARCSTAT_MAX(arcstat_hash_chain_max, i);
 	}
 	uint64_t he = atomic_inc_64_nv(
 	    &arc_stats.arcstat_hash_elements.value.ui64);
 	ARCSTAT_MAX(arcstat_hash_elements_max, he);
 
 	return (NULL);
 }
 
 static void
 buf_hash_remove(arc_buf_hdr_t *hdr)
 {
 	arc_buf_hdr_t *fhdr, **hdrp;
 	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 
 	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 	ASSERT(HDR_IN_HASH_TABLE(hdr));
 
 	hdrp = &buf_hash_table.ht_table[idx];
 	while ((fhdr = *hdrp) != hdr) {
 		ASSERT3P(fhdr, !=, NULL);
 		hdrp = &fhdr->b_hash_next;
 	}
 	*hdrp = hdr->b_hash_next;
 	hdr->b_hash_next = NULL;
 	arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
 
 	/* collect some hash table performance data */
 	atomic_dec_64(&arc_stats.arcstat_hash_elements.value.ui64);
 
 	if (buf_hash_table.ht_table[idx] &&
 	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
 		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 }
 
 /*
  * Global data structures and functions for the buf kmem cache.
  */
 
 static kmem_cache_t *hdr_full_cache;
 static kmem_cache_t *hdr_l2only_cache;
 static kmem_cache_t *buf_cache;
 
 static void
 buf_fini(void)
 {
 #if defined(_KERNEL)
 	/*
 	 * Large allocations which do not require contiguous pages
 	 * should be using vmem_free() in the linux kernel\
 	 */
 	vmem_free(buf_hash_table.ht_table,
 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
 #else
 	kmem_free(buf_hash_table.ht_table,
 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
 #endif
 	for (int i = 0; i < BUF_LOCKS; i++)
 		mutex_destroy(BUF_HASH_LOCK(i));
 	kmem_cache_destroy(hdr_full_cache);
 	kmem_cache_destroy(hdr_l2only_cache);
 	kmem_cache_destroy(buf_cache);
 }
 
 /*
  * Constructor callback - called when the cache is empty
  * and a new buf is requested.
  */
 static int
 hdr_full_cons(void *vbuf, void *unused, int kmflag)
 {
 	(void) unused, (void) kmflag;
 	arc_buf_hdr_t *hdr = vbuf;
 
 	memset(hdr, 0, HDR_FULL_SIZE);
 	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 	zfs_refcount_create(&hdr->b_l1hdr.b_refcnt);
 #ifdef ZFS_DEBUG
 	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 #endif
 	multilist_link_init(&hdr->b_l1hdr.b_arc_node);
 	list_link_init(&hdr->b_l2hdr.b_l2node);
 	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
 
 	return (0);
 }
 
 static int
 hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
 {
 	(void) unused, (void) kmflag;
 	arc_buf_hdr_t *hdr = vbuf;
 
 	memset(hdr, 0, HDR_L2ONLY_SIZE);
 	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
 
 	return (0);
 }
 
 static int
 buf_cons(void *vbuf, void *unused, int kmflag)
 {
 	(void) unused, (void) kmflag;
 	arc_buf_t *buf = vbuf;
 
 	memset(buf, 0, sizeof (arc_buf_t));
 	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 
 	return (0);
 }
 
 /*
  * Destructor callback - called when a cached buf is
  * no longer required.
  */
 static void
 hdr_full_dest(void *vbuf, void *unused)
 {
 	(void) unused;
 	arc_buf_hdr_t *hdr = vbuf;
 
 	ASSERT(HDR_EMPTY(hdr));
 	zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt);
 #ifdef ZFS_DEBUG
 	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
 #endif
 	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
 }
 
 static void
 hdr_l2only_dest(void *vbuf, void *unused)
 {
 	(void) unused;
 	arc_buf_hdr_t *hdr = vbuf;
 
 	ASSERT(HDR_EMPTY(hdr));
 	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
 }
 
 static void
 buf_dest(void *vbuf, void *unused)
 {
 	(void) unused;
 	(void) vbuf;
 
 	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 }
 
 static void
 buf_init(void)
 {
 	uint64_t *ct = NULL;
 	uint64_t hsize = 1ULL << 12;
 	int i, j;
 
 	/*
 	 * The hash table is big enough to fill all of physical memory
 	 * with an average block size of zfs_arc_average_blocksize (default 8K).
 	 * By default, the table will take up
 	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
 	 */
 	while (hsize * zfs_arc_average_blocksize < arc_all_memory())
 		hsize <<= 1;
 retry:
 	buf_hash_table.ht_mask = hsize - 1;
 #if defined(_KERNEL)
 	/*
 	 * Large allocations which do not require contiguous pages
 	 * should be using vmem_alloc() in the linux kernel
 	 */
 	buf_hash_table.ht_table =
 	    vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
 #else
 	buf_hash_table.ht_table =
 	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
 #endif
 	if (buf_hash_table.ht_table == NULL) {
 		ASSERT(hsize > (1ULL << 8));
 		hsize >>= 1;
 		goto retry;
 	}
 
 	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
 	    0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0);
 	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
 	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL,
 	    NULL, NULL, 0);
 	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 
 	for (i = 0; i < 256; i++)
 		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 
 	for (i = 0; i < BUF_LOCKS; i++)
 		mutex_init(BUF_HASH_LOCK(i), NULL, MUTEX_DEFAULT, NULL);
 }
 
 #define	ARC_MINTIME	(hz>>4) /* 62 ms */
 
 /*
  * This is the size that the buf occupies in memory. If the buf is compressed,
  * it will correspond to the compressed size. You should use this method of
  * getting the buf size unless you explicitly need the logical size.
  */
 uint64_t
 arc_buf_size(arc_buf_t *buf)
 {
 	return (ARC_BUF_COMPRESSED(buf) ?
 	    HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr));
 }
 
 uint64_t
 arc_buf_lsize(arc_buf_t *buf)
 {
 	return (HDR_GET_LSIZE(buf->b_hdr));
 }
 
 /*
  * This function will return B_TRUE if the buffer is encrypted in memory.
  * This buffer can be decrypted by calling arc_untransform().
  */
 boolean_t
 arc_is_encrypted(arc_buf_t *buf)
 {
 	return (ARC_BUF_ENCRYPTED(buf) != 0);
 }
 
 /*
  * Returns B_TRUE if the buffer represents data that has not had its MAC
  * verified yet.
  */
 boolean_t
 arc_is_unauthenticated(arc_buf_t *buf)
 {
 	return (HDR_NOAUTH(buf->b_hdr) != 0);
 }
 
 void
 arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt,
     uint8_t *iv, uint8_t *mac)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT(HDR_PROTECTED(hdr));
 
 	memcpy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
 	memcpy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
 	memcpy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
 	*byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
 	    ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
 }
 
 /*
  * Indicates how this buffer is compressed in memory. If it is not compressed
  * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with
  * arc_untransform() as long as it is also unencrypted.
  */
 enum zio_compress
 arc_get_compression(arc_buf_t *buf)
 {
 	return (ARC_BUF_COMPRESSED(buf) ?
 	    HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
 }
 
 /*
  * Return the compression algorithm used to store this data in the ARC. If ARC
  * compression is enabled or this is an encrypted block, this will be the same
  * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF.
  */
 static inline enum zio_compress
 arc_hdr_get_compress(arc_buf_hdr_t *hdr)
 {
 	return (HDR_COMPRESSION_ENABLED(hdr) ?
 	    HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF);
 }
 
 uint8_t
 arc_get_complevel(arc_buf_t *buf)
 {
 	return (buf->b_hdr->b_complevel);
 }
 
 static inline boolean_t
 arc_buf_is_shared(arc_buf_t *buf)
 {
 	boolean_t shared = (buf->b_data != NULL &&
 	    buf->b_hdr->b_l1hdr.b_pabd != NULL &&
 	    abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
 	    buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
 	IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
 	EQUIV(shared, ARC_BUF_SHARED(buf));
 	IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
 
 	/*
 	 * It would be nice to assert arc_can_share() too, but the "hdr isn't
 	 * already being shared" requirement prevents us from doing that.
 	 */
 
 	return (shared);
 }
 
 /*
  * Free the checksum associated with this header. If there is no checksum, this
  * is a no-op.
  */
 static inline void
 arc_cksum_free(arc_buf_hdr_t *hdr)
 {
 #ifdef ZFS_DEBUG
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
 	if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
 		kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
 		hdr->b_l1hdr.b_freeze_cksum = NULL;
 	}
 	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 #endif
 }
 
 /*
  * Return true iff at least one of the bufs on hdr is not compressed.
  * Encrypted buffers count as compressed.
  */
 static boolean_t
 arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
 {
 	ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr));
 
 	for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
 		if (!ARC_BUF_COMPRESSED(b)) {
 			return (B_TRUE);
 		}
 	}
 	return (B_FALSE);
 }
 
 
 /*
  * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
  * matches the checksum that is stored in the hdr. If there is no checksum,
  * or if the buf is compressed, this is a no-op.
  */
 static void
 arc_cksum_verify(arc_buf_t *buf)
 {
 #ifdef ZFS_DEBUG
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	zio_cksum_t zc;
 
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		return;
 
 	if (ARC_BUF_COMPRESSED(buf))
 		return;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
 
 	if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
 		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 		return;
 	}
 
 	fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
 	if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
 		panic("buffer modified while frozen!");
 	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 #endif
 }
 
 /*
  * This function makes the assumption that data stored in the L2ARC
  * will be transformed exactly as it is in the main pool. Because of
  * this we can verify the checksum against the reading process's bp.
  */
 static boolean_t
 arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
 {
 	ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
 	VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
 
 	/*
 	 * Block pointers always store the checksum for the logical data.
 	 * If the block pointer has the gang bit set, then the checksum
 	 * it represents is for the reconstituted data and not for an
 	 * individual gang member. The zio pipeline, however, must be able to
 	 * determine the checksum of each of the gang constituents so it
 	 * treats the checksum comparison differently than what we need
 	 * for l2arc blocks. This prevents us from using the
 	 * zio_checksum_error() interface directly. Instead we must call the
 	 * zio_checksum_error_impl() so that we can ensure the checksum is
 	 * generated using the correct checksum algorithm and accounts for the
 	 * logical I/O size and not just a gang fragment.
 	 */
 	return (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
 	    BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
 	    zio->io_offset, NULL) == 0);
 }
 
 /*
  * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
  * checksum and attaches it to the buf's hdr so that we can ensure that the buf
  * isn't modified later on. If buf is compressed or there is already a checksum
  * on the hdr, this is a no-op (we only checksum uncompressed bufs).
  */
 static void
 arc_cksum_compute(arc_buf_t *buf)
 {
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		return;
 
 #ifdef ZFS_DEBUG
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
 	if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) {
 		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 		return;
 	}
 
 	ASSERT(!ARC_BUF_ENCRYPTED(buf));
 	ASSERT(!ARC_BUF_COMPRESSED(buf));
 	hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
 	    KM_SLEEP);
 	fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
 	    hdr->b_l1hdr.b_freeze_cksum);
 	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 #endif
 	arc_buf_watch(buf);
 }
 
 #ifndef _KERNEL
 void
 arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
 {
 	(void) sig, (void) unused;
 	panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr);
 }
 #endif
 
 static void
 arc_buf_unwatch(arc_buf_t *buf)
 {
 #ifndef _KERNEL
 	if (arc_watch) {
 		ASSERT0(mprotect(buf->b_data, arc_buf_size(buf),
 		    PROT_READ | PROT_WRITE));
 	}
 #else
 	(void) buf;
 #endif
 }
 
 static void
 arc_buf_watch(arc_buf_t *buf)
 {
 #ifndef _KERNEL
 	if (arc_watch)
 		ASSERT0(mprotect(buf->b_data, arc_buf_size(buf),
 		    PROT_READ));
 #else
 	(void) buf;
 #endif
 }
 
 static arc_buf_contents_t
 arc_buf_type(arc_buf_hdr_t *hdr)
 {
 	arc_buf_contents_t type;
 	if (HDR_ISTYPE_METADATA(hdr)) {
 		type = ARC_BUFC_METADATA;
 	} else {
 		type = ARC_BUFC_DATA;
 	}
 	VERIFY3U(hdr->b_type, ==, type);
 	return (type);
 }
 
 boolean_t
 arc_is_metadata(arc_buf_t *buf)
 {
 	return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
 }
 
 static uint32_t
 arc_bufc_to_flags(arc_buf_contents_t type)
 {
 	switch (type) {
 	case ARC_BUFC_DATA:
 		/* metadata field is 0 if buffer contains normal data */
 		return (0);
 	case ARC_BUFC_METADATA:
 		return (ARC_FLAG_BUFC_METADATA);
 	default:
 		break;
 	}
 	panic("undefined ARC buffer type!");
 	return ((uint32_t)-1);
 }
 
 void
 arc_buf_thaw(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 
 	arc_cksum_verify(buf);
 
 	/*
 	 * Compressed buffers do not manipulate the b_freeze_cksum.
 	 */
 	if (ARC_BUF_COMPRESSED(buf))
 		return;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	arc_cksum_free(hdr);
 	arc_buf_unwatch(buf);
 }
 
 void
 arc_buf_freeze(arc_buf_t *buf)
 {
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		return;
 
 	if (ARC_BUF_COMPRESSED(buf))
 		return;
 
 	ASSERT(HDR_HAS_L1HDR(buf->b_hdr));
 	arc_cksum_compute(buf);
 }
 
 /*
  * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
  * the following functions should be used to ensure that the flags are
  * updated in a thread-safe way. When manipulating the flags either
  * the hash_lock must be held or the hdr must be undiscoverable. This
  * ensures that we're not racing with any other threads when updating
  * the flags.
  */
 static inline void
 arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
 {
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 	hdr->b_flags |= flags;
 }
 
 static inline void
 arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
 {
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 	hdr->b_flags &= ~flags;
 }
 
 /*
  * Setting the compression bits in the arc_buf_hdr_t's b_flags is
  * done in a special way since we have to clear and set bits
  * at the same time. Consumers that wish to set the compression bits
  * must use this function to ensure that the flags are updated in
  * thread-safe manner.
  */
 static void
 arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
 {
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 	/*
 	 * Holes and embedded blocks will always have a psize = 0 so
 	 * we ignore the compression of the blkptr and set the
 	 * want to uncompress them. Mark them as uncompressed.
 	 */
 	if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
 		arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
 		ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
 	} else {
 		arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
 		ASSERT(HDR_COMPRESSION_ENABLED(hdr));
 	}
 
 	HDR_SET_COMPRESS(hdr, cmp);
 	ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
 }
 
 /*
  * Looks for another buf on the same hdr which has the data decompressed, copies
  * from it, and returns true. If no such buf exists, returns false.
  */
 static boolean_t
 arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	boolean_t copied = B_FALSE;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT3P(buf->b_data, !=, NULL);
 	ASSERT(!ARC_BUF_COMPRESSED(buf));
 
 	for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
 	    from = from->b_next) {
 		/* can't use our own data buffer */
 		if (from == buf) {
 			continue;
 		}
 
 		if (!ARC_BUF_COMPRESSED(from)) {
 			memcpy(buf->b_data, from->b_data, arc_buf_size(buf));
 			copied = B_TRUE;
 			break;
 		}
 	}
 
 #ifdef ZFS_DEBUG
 	/*
 	 * There were no decompressed bufs, so there should not be a
 	 * checksum on the hdr either.
 	 */
 	if (zfs_flags & ZFS_DEBUG_MODIFY)
 		EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
 #endif
 
 	return (copied);
 }
 
 /*
  * Allocates an ARC buf header that's in an evicted & L2-cached state.
  * This is used during l2arc reconstruction to make empty ARC buffers
  * which circumvent the regular disk->arc->l2arc path and instead come
  * into being in the reverse order, i.e. l2arc->arc.
  */
 static arc_buf_hdr_t *
 arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev,
     dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth,
     enum zio_compress compress, uint8_t complevel, boolean_t protected,
     boolean_t prefetch, arc_state_type_t arcs_state)
 {
 	arc_buf_hdr_t	*hdr;
 
 	ASSERT(size != 0);
 	hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP);
 	hdr->b_birth = birth;
 	hdr->b_type = type;
 	hdr->b_flags = 0;
 	arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR);
 	HDR_SET_LSIZE(hdr, size);
 	HDR_SET_PSIZE(hdr, psize);
 	arc_hdr_set_compress(hdr, compress);
 	hdr->b_complevel = complevel;
 	if (protected)
 		arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
 	if (prefetch)
 		arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
 	hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa);
 
 	hdr->b_dva = dva;
 
 	hdr->b_l2hdr.b_dev = dev;
 	hdr->b_l2hdr.b_daddr = daddr;
 	hdr->b_l2hdr.b_arcs_state = arcs_state;
 
 	return (hdr);
 }
 
 /*
  * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
  */
 static uint64_t
 arc_hdr_size(arc_buf_hdr_t *hdr)
 {
 	uint64_t size;
 
 	if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
 	    HDR_GET_PSIZE(hdr) > 0) {
 		size = HDR_GET_PSIZE(hdr);
 	} else {
 		ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
 		size = HDR_GET_LSIZE(hdr);
 	}
 	return (size);
 }
 
 static int
 arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
 {
 	int ret;
 	uint64_t csize;
 	uint64_t lsize = HDR_GET_LSIZE(hdr);
 	uint64_t psize = HDR_GET_PSIZE(hdr);
 	void *tmpbuf = NULL;
 	abd_t *abd = hdr->b_l1hdr.b_pabd;
 
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 	ASSERT(HDR_AUTHENTICATED(hdr));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 
 	/*
 	 * The MAC is calculated on the compressed data that is stored on disk.
 	 * However, if compressed arc is disabled we will only have the
 	 * decompressed data available to us now. Compress it into a temporary
 	 * abd so we can verify the MAC. The performance overhead of this will
 	 * be relatively low, since most objects in an encrypted objset will
 	 * be encrypted (instead of authenticated) anyway.
 	 */
 	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
 	    !HDR_COMPRESSION_ENABLED(hdr)) {
 
 		csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
 		    hdr->b_l1hdr.b_pabd, &tmpbuf, lsize, hdr->b_complevel);
 		ASSERT3P(tmpbuf, !=, NULL);
 		ASSERT3U(csize, <=, psize);
 		abd = abd_get_from_buf(tmpbuf, lsize);
 		abd_take_ownership_of_buf(abd, B_TRUE);
 		abd_zero_off(abd, csize, psize - csize);
 	}
 
 	/*
 	 * Authentication is best effort. We authenticate whenever the key is
 	 * available. If we succeed we clear ARC_FLAG_NOAUTH.
 	 */
 	if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) {
 		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
 		ASSERT3U(lsize, ==, psize);
 		ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd,
 		    psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
 	} else {
 		ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize,
 		    hdr->b_crypt_hdr.b_mac);
 	}
 
 	if (ret == 0)
 		arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH);
 	else if (ret != ENOENT)
 		goto error;
 
 	if (tmpbuf != NULL)
 		abd_free(abd);
 
 	return (0);
 
 error:
 	if (tmpbuf != NULL)
 		abd_free(abd);
 
 	return (ret);
 }
 
 /*
  * This function will take a header that only has raw encrypted data in
  * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in
  * b_l1hdr.b_pabd. If designated in the header flags, this function will
  * also decompress the data.
  */
 static int
 arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
 {
 	int ret;
 	abd_t *cabd = NULL;
 	void *tmp = NULL;
 	boolean_t no_crypt = B_FALSE;
 	boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
 
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 	ASSERT(HDR_ENCRYPTED(hdr));
 
 	arc_hdr_alloc_abd(hdr, 0);
 
 	ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot,
 	    B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv,
 	    hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd,
 	    hdr->b_crypt_hdr.b_rabd, &no_crypt);
 	if (ret != 0)
 		goto error;
 
 	if (no_crypt) {
 		abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd,
 		    HDR_GET_PSIZE(hdr));
 	}
 
 	/*
 	 * If this header has disabled arc compression but the b_pabd is
 	 * compressed after decrypting it, we need to decompress the newly
 	 * decrypted data.
 	 */
 	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
 	    !HDR_COMPRESSION_ENABLED(hdr)) {
 		/*
 		 * We want to make sure that we are correctly honoring the
 		 * zfs_abd_scatter_enabled setting, so we allocate an abd here
 		 * and then loan a buffer from it, rather than allocating a
 		 * linear buffer and wrapping it in an abd later.
 		 */
 		cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0);
 		tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
 
 		ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
 		    hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
 		    HDR_GET_LSIZE(hdr), &hdr->b_complevel);
 		if (ret != 0) {
 			abd_return_buf(cabd, tmp, arc_hdr_size(hdr));
 			goto error;
 		}
 
 		abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
 		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
 		    arc_hdr_size(hdr), hdr);
 		hdr->b_l1hdr.b_pabd = cabd;
 	}
 
 	return (0);
 
 error:
 	arc_hdr_free_abd(hdr, B_FALSE);
 	if (cabd != NULL)
 		arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr);
 
 	return (ret);
 }
 
 /*
  * This function is called during arc_buf_fill() to prepare the header's
  * abd plaintext pointer for use. This involves authenticated protected
  * data and decrypting encrypted data into the plaintext abd.
  */
 static int
 arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa,
     const zbookmark_phys_t *zb, boolean_t noauth)
 {
 	int ret;
 
 	ASSERT(HDR_PROTECTED(hdr));
 
 	if (hash_lock != NULL)
 		mutex_enter(hash_lock);
 
 	if (HDR_NOAUTH(hdr) && !noauth) {
 		/*
 		 * The caller requested authenticated data but our data has
 		 * not been authenticated yet. Verify the MAC now if we can.
 		 */
 		ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset);
 		if (ret != 0)
 			goto error;
 	} else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) {
 		/*
 		 * If we only have the encrypted version of the data, but the
 		 * unencrypted version was requested we take this opportunity
 		 * to store the decrypted version in the header for future use.
 		 */
 		ret = arc_hdr_decrypt(hdr, spa, zb);
 		if (ret != 0)
 			goto error;
 	}
 
 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 
 	if (hash_lock != NULL)
 		mutex_exit(hash_lock);
 
 	return (0);
 
 error:
 	if (hash_lock != NULL)
 		mutex_exit(hash_lock);
 
 	return (ret);
 }
 
 /*
  * This function is used by the dbuf code to decrypt bonus buffers in place.
  * The dbuf code itself doesn't have any locking for decrypting a shared dnode
  * block, so we use the hash lock here to protect against concurrent calls to
  * arc_buf_fill().
  */
 static void
 arc_buf_untransform_in_place(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT(HDR_ENCRYPTED(hdr));
 	ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 
 	zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data,
 	    arc_buf_size(buf));
 	buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
 	buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
 }
 
 /*
  * Given a buf that has a data buffer attached to it, this function will
  * efficiently fill the buf with data of the specified compression setting from
  * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
  * are already sharing a data buf, no copy is performed.
  *
  * If the buf is marked as compressed but uncompressed data was requested, this
  * will allocate a new data buffer for the buf, remove that flag, and fill the
  * buf with uncompressed data. You can't request a compressed buf on a hdr with
  * uncompressed data, and (since we haven't added support for it yet) if you
  * want compressed data your buf must already be marked as compressed and have
  * the correct-sized data buffer.
  */
 static int
 arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
     arc_fill_flags_t flags)
 {
 	int error = 0;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	boolean_t hdr_compressed =
 	    (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
 	boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0;
 	boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0;
 	dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
 	kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr);
 
 	ASSERT3P(buf->b_data, !=, NULL);
 	IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf));
 	IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
 	IMPLY(encrypted, HDR_ENCRYPTED(hdr));
 	IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf));
 	IMPLY(encrypted, ARC_BUF_COMPRESSED(buf));
 	IMPLY(encrypted, !arc_buf_is_shared(buf));
 
 	/*
 	 * If the caller wanted encrypted data we just need to copy it from
 	 * b_rabd and potentially byteswap it. We won't be able to do any
 	 * further transforms on it.
 	 */
 	if (encrypted) {
 		ASSERT(HDR_HAS_RABD(hdr));
 		abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd,
 		    HDR_GET_PSIZE(hdr));
 		goto byteswap;
 	}
 
 	/*
 	 * Adjust encrypted and authenticated headers to accommodate
 	 * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are
 	 * allowed to fail decryption due to keys not being loaded
 	 * without being marked as an IO error.
 	 */
 	if (HDR_PROTECTED(hdr)) {
 		error = arc_fill_hdr_crypt(hdr, hash_lock, spa,
 		    zb, !!(flags & ARC_FILL_NOAUTH));
 		if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) {
 			return (error);
 		} else if (error != 0) {
 			if (hash_lock != NULL)
 				mutex_enter(hash_lock);
 			arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
 			if (hash_lock != NULL)
 				mutex_exit(hash_lock);
 			return (error);
 		}
 	}
 
 	/*
 	 * There is a special case here for dnode blocks which are
 	 * decrypting their bonus buffers. These blocks may request to
 	 * be decrypted in-place. This is necessary because there may
 	 * be many dnodes pointing into this buffer and there is
 	 * currently no method to synchronize replacing the backing
 	 * b_data buffer and updating all of the pointers. Here we use
 	 * the hash lock to ensure there are no races. If the need
 	 * arises for other types to be decrypted in-place, they must
 	 * add handling here as well.
 	 */
 	if ((flags & ARC_FILL_IN_PLACE) != 0) {
 		ASSERT(!hdr_compressed);
 		ASSERT(!compressed);
 		ASSERT(!encrypted);
 
 		if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) {
 			ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
 
 			if (hash_lock != NULL)
 				mutex_enter(hash_lock);
 			arc_buf_untransform_in_place(buf);
 			if (hash_lock != NULL)
 				mutex_exit(hash_lock);
 
 			/* Compute the hdr's checksum if necessary */
 			arc_cksum_compute(buf);
 		}
 
 		return (0);
 	}
 
 	if (hdr_compressed == compressed) {
 		if (ARC_BUF_SHARED(buf)) {
 			ASSERT(arc_buf_is_shared(buf));
 		} else {
 			abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
 			    arc_buf_size(buf));
 		}
 	} else {
 		ASSERT(hdr_compressed);
 		ASSERT(!compressed);
 
 		/*
 		 * If the buf is sharing its data with the hdr, unlink it and
 		 * allocate a new data buffer for the buf.
 		 */
 		if (ARC_BUF_SHARED(buf)) {
 			ASSERT(ARC_BUF_COMPRESSED(buf));
 
 			/* We need to give the buf its own b_data */
 			buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
 			buf->b_data =
 			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
 			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
 
 			/* Previously overhead was 0; just add new overhead */
 			ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
 		} else if (ARC_BUF_COMPRESSED(buf)) {
 			ASSERT(!arc_buf_is_shared(buf));
 
 			/* We need to reallocate the buf's b_data */
 			arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
 			    buf);
 			buf->b_data =
 			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
 
 			/* We increased the size of b_data; update overhead */
 			ARCSTAT_INCR(arcstat_overhead_size,
 			    HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
 		}
 
 		/*
 		 * Regardless of the buf's previous compression settings, it
 		 * should not be compressed at the end of this function.
 		 */
 		buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
 
 		/*
 		 * Try copying the data from another buf which already has a
 		 * decompressed version. If that's not possible, it's time to
 		 * bite the bullet and decompress the data from the hdr.
 		 */
 		if (arc_buf_try_copy_decompressed_data(buf)) {
 			/* Skip byteswapping and checksumming (already done) */
 			return (0);
 		} else {
 			error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
 			    hdr->b_l1hdr.b_pabd, buf->b_data,
 			    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr),
 			    &hdr->b_complevel);
 
 			/*
 			 * Absent hardware errors or software bugs, this should
 			 * be impossible, but log it anyway so we can debug it.
 			 */
 			if (error != 0) {
 				zfs_dbgmsg(
 				    "hdr %px, compress %d, psize %d, lsize %d",
 				    hdr, arc_hdr_get_compress(hdr),
 				    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
 				if (hash_lock != NULL)
 					mutex_enter(hash_lock);
 				arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
 				if (hash_lock != NULL)
 					mutex_exit(hash_lock);
 				return (SET_ERROR(EIO));
 			}
 		}
 	}
 
 byteswap:
 	/* Byteswap the buf's data if necessary */
 	if (bswap != DMU_BSWAP_NUMFUNCS) {
 		ASSERT(!HDR_SHARED_DATA(hdr));
 		ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
 		dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
 	}
 
 	/* Compute the hdr's checksum if necessary */
 	arc_cksum_compute(buf);
 
 	return (0);
 }
 
 /*
  * If this function is being called to decrypt an encrypted buffer or verify an
  * authenticated one, the key must be loaded and a mapping must be made
  * available in the keystore via spa_keystore_create_mapping() or one of its
  * callers.
  */
 int
 arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
     boolean_t in_place)
 {
 	int ret;
 	arc_fill_flags_t flags = 0;
 
 	if (in_place)
 		flags |= ARC_FILL_IN_PLACE;
 
 	ret = arc_buf_fill(buf, spa, zb, flags);
 	if (ret == ECKSUM) {
 		/*
 		 * Convert authentication and decryption errors to EIO
 		 * (and generate an ereport) before leaving the ARC.
 		 */
 		ret = SET_ERROR(EIO);
-		spa_log_error(spa, zb, &buf->b_hdr->b_birth);
+		spa_log_error(spa, zb, buf->b_hdr->b_birth);
 		(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
 		    spa, NULL, zb, NULL, 0);
 	}
 
 	return (ret);
 }
 
 /*
  * Increment the amount of evictable space in the arc_state_t's refcount.
  * We account for the space used by the hdr and the arc buf individually
  * so that we can add and remove them from the refcount individually.
  */
 static void
 arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
 {
 	arc_buf_contents_t type = arc_buf_type(hdr);
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	if (GHOST_STATE(state)) {
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 		ASSERT(!HDR_HAS_RABD(hdr));
 		(void) zfs_refcount_add_many(&state->arcs_esize[type],
 		    HDR_GET_LSIZE(hdr), hdr);
 		return;
 	}
 
 	if (hdr->b_l1hdr.b_pabd != NULL) {
 		(void) zfs_refcount_add_many(&state->arcs_esize[type],
 		    arc_hdr_size(hdr), hdr);
 	}
 	if (HDR_HAS_RABD(hdr)) {
 		(void) zfs_refcount_add_many(&state->arcs_esize[type],
 		    HDR_GET_PSIZE(hdr), hdr);
 	}
 
 	for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
 	    buf = buf->b_next) {
 		if (ARC_BUF_SHARED(buf))
 			continue;
 		(void) zfs_refcount_add_many(&state->arcs_esize[type],
 		    arc_buf_size(buf), buf);
 	}
 }
 
 /*
  * Decrement the amount of evictable space in the arc_state_t's refcount.
  * We account for the space used by the hdr and the arc buf individually
  * so that we can add and remove them from the refcount individually.
  */
 static void
 arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
 {
 	arc_buf_contents_t type = arc_buf_type(hdr);
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	if (GHOST_STATE(state)) {
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 		ASSERT(!HDR_HAS_RABD(hdr));
 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
 		    HDR_GET_LSIZE(hdr), hdr);
 		return;
 	}
 
 	if (hdr->b_l1hdr.b_pabd != NULL) {
 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
 		    arc_hdr_size(hdr), hdr);
 	}
 	if (HDR_HAS_RABD(hdr)) {
 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
 		    HDR_GET_PSIZE(hdr), hdr);
 	}
 
 	for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
 	    buf = buf->b_next) {
 		if (ARC_BUF_SHARED(buf))
 			continue;
 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
 		    arc_buf_size(buf), buf);
 	}
 }
 
 /*
  * Add a reference to this hdr indicating that someone is actively
  * referencing that memory. When the refcount transitions from 0 to 1,
  * we remove it from the respective arc_state_t list to indicate that
  * it is not evictable.
  */
 static void
 add_reference(arc_buf_hdr_t *hdr, const void *tag)
 {
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) {
 		ASSERT(state == arc_anon);
 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 	}
 
 	if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
 	    state != arc_anon && state != arc_l2c_only) {
 		/* We don't use the L2-only state list. */
 		multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr);
 		arc_evictable_space_decrement(hdr, state);
 	}
 }
 
 /*
  * Remove a reference from this hdr. When the reference transitions from
  * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
  * list making it eligible for eviction.
  */
 static int
 remove_reference(arc_buf_hdr_t *hdr, const void *tag)
 {
 	int cnt;
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(state == arc_anon || MUTEX_HELD(HDR_LOCK(hdr)));
 	ASSERT(!GHOST_STATE(state));	/* arc_l2c_only counts as a ghost. */
 
 	if ((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) != 0)
 		return (cnt);
 
 	if (state == arc_anon) {
 		arc_hdr_destroy(hdr);
 		return (0);
 	}
 	if (state == arc_uncached && !HDR_PREFETCH(hdr)) {
 		arc_change_state(arc_anon, hdr);
 		arc_hdr_destroy(hdr);
 		return (0);
 	}
 	multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr);
 	arc_evictable_space_increment(hdr, state);
 	return (0);
 }
 
 /*
  * Returns detailed information about a specific arc buffer.  When the
  * state_index argument is set the function will calculate the arc header
  * list position for its arc state.  Since this requires a linear traversal
  * callers are strongly encourage not to do this.  However, it can be helpful
  * for targeted analysis so the functionality is provided.
  */
 void
 arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
 {
 	(void) state_index;
 	arc_buf_hdr_t *hdr = ab->b_hdr;
 	l1arc_buf_hdr_t *l1hdr = NULL;
 	l2arc_buf_hdr_t *l2hdr = NULL;
 	arc_state_t *state = NULL;
 
 	memset(abi, 0, sizeof (arc_buf_info_t));
 
 	if (hdr == NULL)
 		return;
 
 	abi->abi_flags = hdr->b_flags;
 
 	if (HDR_HAS_L1HDR(hdr)) {
 		l1hdr = &hdr->b_l1hdr;
 		state = l1hdr->b_state;
 	}
 	if (HDR_HAS_L2HDR(hdr))
 		l2hdr = &hdr->b_l2hdr;
 
 	if (l1hdr) {
 		abi->abi_bufcnt = 0;
 		for (arc_buf_t *buf = l1hdr->b_buf; buf; buf = buf->b_next)
 			abi->abi_bufcnt++;
 		abi->abi_access = l1hdr->b_arc_access;
 		abi->abi_mru_hits = l1hdr->b_mru_hits;
 		abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
 		abi->abi_mfu_hits = l1hdr->b_mfu_hits;
 		abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits;
 		abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt);
 	}
 
 	if (l2hdr) {
 		abi->abi_l2arc_dattr = l2hdr->b_daddr;
 		abi->abi_l2arc_hits = l2hdr->b_hits;
 	}
 
 	abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
 	abi->abi_state_contents = arc_buf_type(hdr);
 	abi->abi_size = arc_hdr_size(hdr);
 }
 
 /*
  * Move the supplied buffer to the indicated state. The hash lock
  * for the buffer must be held by the caller.
  */
 static void
 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
 {
 	arc_state_t *old_state;
 	int64_t refcnt;
 	boolean_t update_old, update_new;
 	arc_buf_contents_t type = arc_buf_type(hdr);
 
 	/*
 	 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
 	 * in arc_read() when bringing a buffer out of the L2ARC.  However, the
 	 * L1 hdr doesn't always exist when we change state to arc_anon before
 	 * destroying a header, in which case reallocating to add the L1 hdr is
 	 * pointless.
 	 */
 	if (HDR_HAS_L1HDR(hdr)) {
 		old_state = hdr->b_l1hdr.b_state;
 		refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt);
 		update_old = (hdr->b_l1hdr.b_buf != NULL ||
 		    hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
 
 		IMPLY(GHOST_STATE(old_state), hdr->b_l1hdr.b_buf == NULL);
 		IMPLY(GHOST_STATE(new_state), hdr->b_l1hdr.b_buf == NULL);
 		IMPLY(old_state == arc_anon, hdr->b_l1hdr.b_buf == NULL ||
 		    ARC_BUF_LAST(hdr->b_l1hdr.b_buf));
 	} else {
 		old_state = arc_l2c_only;
 		refcnt = 0;
 		update_old = B_FALSE;
 	}
 	update_new = update_old;
 	if (GHOST_STATE(old_state))
 		update_old = B_TRUE;
 	if (GHOST_STATE(new_state))
 		update_new = B_TRUE;
 
 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
 	ASSERT3P(new_state, !=, old_state);
 
 	/*
 	 * If this buffer is evictable, transfer it from the
 	 * old state list to the new state list.
 	 */
 	if (refcnt == 0) {
 		if (old_state != arc_anon && old_state != arc_l2c_only) {
 			ASSERT(HDR_HAS_L1HDR(hdr));
 			/* remove_reference() saves on insert. */
 			if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
 				multilist_remove(&old_state->arcs_list[type],
 				    hdr);
 				arc_evictable_space_decrement(hdr, old_state);
 			}
 		}
 		if (new_state != arc_anon && new_state != arc_l2c_only) {
 			/*
 			 * An L1 header always exists here, since if we're
 			 * moving to some L1-cached state (i.e. not l2c_only or
 			 * anonymous), we realloc the header to add an L1hdr
 			 * beforehand.
 			 */
 			ASSERT(HDR_HAS_L1HDR(hdr));
 			multilist_insert(&new_state->arcs_list[type], hdr);
 			arc_evictable_space_increment(hdr, new_state);
 		}
 	}
 
 	ASSERT(!HDR_EMPTY(hdr));
 	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
 		buf_hash_remove(hdr);
 
 	/* adjust state sizes (ignore arc_l2c_only) */
 
 	if (update_new && new_state != arc_l2c_only) {
 		ASSERT(HDR_HAS_L1HDR(hdr));
 		if (GHOST_STATE(new_state)) {
 
 			/*
 			 * When moving a header to a ghost state, we first
 			 * remove all arc buffers. Thus, we'll have no arc
 			 * buffer to use for the reference. As a result, we
 			 * use the arc header pointer for the reference.
 			 */
 			(void) zfs_refcount_add_many(
 			    &new_state->arcs_size[type],
 			    HDR_GET_LSIZE(hdr), hdr);
 			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 			ASSERT(!HDR_HAS_RABD(hdr));
 		} else {
 
 			/*
 			 * Each individual buffer holds a unique reference,
 			 * thus we must remove each of these references one
 			 * at a time.
 			 */
 			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
 			    buf = buf->b_next) {
 
 				/*
 				 * When the arc_buf_t is sharing the data
 				 * block with the hdr, the owner of the
 				 * reference belongs to the hdr. Only
 				 * add to the refcount if the arc_buf_t is
 				 * not shared.
 				 */
 				if (ARC_BUF_SHARED(buf))
 					continue;
 
 				(void) zfs_refcount_add_many(
 				    &new_state->arcs_size[type],
 				    arc_buf_size(buf), buf);
 			}
 
 			if (hdr->b_l1hdr.b_pabd != NULL) {
 				(void) zfs_refcount_add_many(
 				    &new_state->arcs_size[type],
 				    arc_hdr_size(hdr), hdr);
 			}
 
 			if (HDR_HAS_RABD(hdr)) {
 				(void) zfs_refcount_add_many(
 				    &new_state->arcs_size[type],
 				    HDR_GET_PSIZE(hdr), hdr);
 			}
 		}
 	}
 
 	if (update_old && old_state != arc_l2c_only) {
 		ASSERT(HDR_HAS_L1HDR(hdr));
 		if (GHOST_STATE(old_state)) {
 			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 			ASSERT(!HDR_HAS_RABD(hdr));
 
 			/*
 			 * When moving a header off of a ghost state,
 			 * the header will not contain any arc buffers.
 			 * We use the arc header pointer for the reference
 			 * which is exactly what we did when we put the
 			 * header on the ghost state.
 			 */
 
 			(void) zfs_refcount_remove_many(
 			    &old_state->arcs_size[type],
 			    HDR_GET_LSIZE(hdr), hdr);
 		} else {
 
 			/*
 			 * Each individual buffer holds a unique reference,
 			 * thus we must remove each of these references one
 			 * at a time.
 			 */
 			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
 			    buf = buf->b_next) {
 
 				/*
 				 * When the arc_buf_t is sharing the data
 				 * block with the hdr, the owner of the
 				 * reference belongs to the hdr. Only
 				 * add to the refcount if the arc_buf_t is
 				 * not shared.
 				 */
 				if (ARC_BUF_SHARED(buf))
 					continue;
 
 				(void) zfs_refcount_remove_many(
 				    &old_state->arcs_size[type],
 				    arc_buf_size(buf), buf);
 			}
 			ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
 			    HDR_HAS_RABD(hdr));
 
 			if (hdr->b_l1hdr.b_pabd != NULL) {
 				(void) zfs_refcount_remove_many(
 				    &old_state->arcs_size[type],
 				    arc_hdr_size(hdr), hdr);
 			}
 
 			if (HDR_HAS_RABD(hdr)) {
 				(void) zfs_refcount_remove_many(
 				    &old_state->arcs_size[type],
 				    HDR_GET_PSIZE(hdr), hdr);
 			}
 		}
 	}
 
 	if (HDR_HAS_L1HDR(hdr)) {
 		hdr->b_l1hdr.b_state = new_state;
 
 		if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) {
 			l2arc_hdr_arcstats_decrement_state(hdr);
 			hdr->b_l2hdr.b_arcs_state = new_state->arcs_state;
 			l2arc_hdr_arcstats_increment_state(hdr);
 		}
 	}
 }
 
 void
 arc_space_consume(uint64_t space, arc_space_type_t type)
 {
 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
 
 	switch (type) {
 	default:
 		break;
 	case ARC_SPACE_DATA:
 		ARCSTAT_INCR(arcstat_data_size, space);
 		break;
 	case ARC_SPACE_META:
 		ARCSTAT_INCR(arcstat_metadata_size, space);
 		break;
 	case ARC_SPACE_BONUS:
 		ARCSTAT_INCR(arcstat_bonus_size, space);
 		break;
 	case ARC_SPACE_DNODE:
 		ARCSTAT_INCR(arcstat_dnode_size, space);
 		break;
 	case ARC_SPACE_DBUF:
 		ARCSTAT_INCR(arcstat_dbuf_size, space);
 		break;
 	case ARC_SPACE_HDRS:
 		ARCSTAT_INCR(arcstat_hdr_size, space);
 		break;
 	case ARC_SPACE_L2HDRS:
 		aggsum_add(&arc_sums.arcstat_l2_hdr_size, space);
 		break;
 	case ARC_SPACE_ABD_CHUNK_WASTE:
 		/*
 		 * Note: this includes space wasted by all scatter ABD's, not
 		 * just those allocated by the ARC.  But the vast majority of
 		 * scatter ABD's come from the ARC, because other users are
 		 * very short-lived.
 		 */
 		ARCSTAT_INCR(arcstat_abd_chunk_waste_size, space);
 		break;
 	}
 
 	if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE)
 		ARCSTAT_INCR(arcstat_meta_used, space);
 
 	aggsum_add(&arc_sums.arcstat_size, space);
 }
 
 void
 arc_space_return(uint64_t space, arc_space_type_t type)
 {
 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
 
 	switch (type) {
 	default:
 		break;
 	case ARC_SPACE_DATA:
 		ARCSTAT_INCR(arcstat_data_size, -space);
 		break;
 	case ARC_SPACE_META:
 		ARCSTAT_INCR(arcstat_metadata_size, -space);
 		break;
 	case ARC_SPACE_BONUS:
 		ARCSTAT_INCR(arcstat_bonus_size, -space);
 		break;
 	case ARC_SPACE_DNODE:
 		ARCSTAT_INCR(arcstat_dnode_size, -space);
 		break;
 	case ARC_SPACE_DBUF:
 		ARCSTAT_INCR(arcstat_dbuf_size, -space);
 		break;
 	case ARC_SPACE_HDRS:
 		ARCSTAT_INCR(arcstat_hdr_size, -space);
 		break;
 	case ARC_SPACE_L2HDRS:
 		aggsum_add(&arc_sums.arcstat_l2_hdr_size, -space);
 		break;
 	case ARC_SPACE_ABD_CHUNK_WASTE:
 		ARCSTAT_INCR(arcstat_abd_chunk_waste_size, -space);
 		break;
 	}
 
 	if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE)
 		ARCSTAT_INCR(arcstat_meta_used, -space);
 
 	ASSERT(aggsum_compare(&arc_sums.arcstat_size, space) >= 0);
 	aggsum_add(&arc_sums.arcstat_size, -space);
 }
 
 /*
  * Given a hdr and a buf, returns whether that buf can share its b_data buffer
  * with the hdr's b_pabd.
  */
 static boolean_t
 arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 {
 	/*
 	 * The criteria for sharing a hdr's data are:
 	 * 1. the buffer is not encrypted
 	 * 2. the hdr's compression matches the buf's compression
 	 * 3. the hdr doesn't need to be byteswapped
 	 * 4. the hdr isn't already being shared
 	 * 5. the buf is either compressed or it is the last buf in the hdr list
 	 *
 	 * Criterion #5 maintains the invariant that shared uncompressed
 	 * bufs must be the final buf in the hdr's b_buf list. Reading this, you
 	 * might ask, "if a compressed buf is allocated first, won't that be the
 	 * last thing in the list?", but in that case it's impossible to create
 	 * a shared uncompressed buf anyway (because the hdr must be compressed
 	 * to have the compressed buf). You might also think that #3 is
 	 * sufficient to make this guarantee, however it's possible
 	 * (specifically in the rare L2ARC write race mentioned in
 	 * arc_buf_alloc_impl()) there will be an existing uncompressed buf that
 	 * is shareable, but wasn't at the time of its allocation. Rather than
 	 * allow a new shared uncompressed buf to be created and then shuffle
 	 * the list around to make it the last element, this simply disallows
 	 * sharing if the new buf isn't the first to be added.
 	 */
 	ASSERT3P(buf->b_hdr, ==, hdr);
 	boolean_t hdr_compressed =
 	    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF;
 	boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
 	return (!ARC_BUF_ENCRYPTED(buf) &&
 	    buf_compressed == hdr_compressed &&
 	    hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
 	    !HDR_SHARED_DATA(hdr) &&
 	    (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
 }
 
 /*
  * Allocate a buf for this hdr. If you care about the data that's in the hdr,
  * or if you want a compressed buffer, pass those flags in. Returns 0 if the
  * copy was made successfully, or an error code otherwise.
  */
 static int
 arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
     const void *tag, boolean_t encrypted, boolean_t compressed,
     boolean_t noauth, boolean_t fill, arc_buf_t **ret)
 {
 	arc_buf_t *buf;
 	arc_fill_flags_t flags = ARC_FILL_LOCKED;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
 	VERIFY(hdr->b_type == ARC_BUFC_DATA ||
 	    hdr->b_type == ARC_BUFC_METADATA);
 	ASSERT3P(ret, !=, NULL);
 	ASSERT3P(*ret, ==, NULL);
 	IMPLY(encrypted, compressed);
 
 	buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
 	buf->b_hdr = hdr;
 	buf->b_data = NULL;
 	buf->b_next = hdr->b_l1hdr.b_buf;
 	buf->b_flags = 0;
 
 	add_reference(hdr, tag);
 
 	/*
 	 * We're about to change the hdr's b_flags. We must either
 	 * hold the hash_lock or be undiscoverable.
 	 */
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 	/*
 	 * Only honor requests for compressed bufs if the hdr is actually
 	 * compressed. This must be overridden if the buffer is encrypted since
 	 * encrypted buffers cannot be decompressed.
 	 */
 	if (encrypted) {
 		buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
 		buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED;
 		flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED;
 	} else if (compressed &&
 	    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
 		buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
 		flags |= ARC_FILL_COMPRESSED;
 	}
 
 	if (noauth) {
 		ASSERT0(encrypted);
 		flags |= ARC_FILL_NOAUTH;
 	}
 
 	/*
 	 * If the hdr's data can be shared then we share the data buffer and
 	 * set the appropriate bit in the hdr's b_flags to indicate the hdr is
 	 * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new
 	 * buffer to store the buf's data.
 	 *
 	 * There are two additional restrictions here because we're sharing
 	 * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
 	 * actively involved in an L2ARC write, because if this buf is used by
 	 * an arc_write() then the hdr's data buffer will be released when the
 	 * write completes, even though the L2ARC write might still be using it.
 	 * Second, the hdr's ABD must be linear so that the buf's user doesn't
 	 * need to be ABD-aware.  It must be allocated via
 	 * zio_[data_]buf_alloc(), not as a page, because we need to be able
 	 * to abd_release_ownership_of_buf(), which isn't allowed on "linear
 	 * page" buffers because the ABD code needs to handle freeing them
 	 * specially.
 	 */
 	boolean_t can_share = arc_can_share(hdr, buf) &&
 	    !HDR_L2_WRITING(hdr) &&
 	    hdr->b_l1hdr.b_pabd != NULL &&
 	    abd_is_linear(hdr->b_l1hdr.b_pabd) &&
 	    !abd_is_linear_page(hdr->b_l1hdr.b_pabd);
 
 	/* Set up b_data and sharing */
 	if (can_share) {
 		buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
 		buf->b_flags |= ARC_BUF_FLAG_SHARED;
 		arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
 	} else {
 		buf->b_data =
 		    arc_get_data_buf(hdr, arc_buf_size(buf), buf);
 		ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
 	}
 	VERIFY3P(buf->b_data, !=, NULL);
 
 	hdr->b_l1hdr.b_buf = buf;
 
 	/*
 	 * If the user wants the data from the hdr, we need to either copy or
 	 * decompress the data.
 	 */
 	if (fill) {
 		ASSERT3P(zb, !=, NULL);
 		return (arc_buf_fill(buf, spa, zb, flags));
 	}
 
 	return (0);
 }
 
 static const char *arc_onloan_tag = "onloan";
 
 static inline void
 arc_loaned_bytes_update(int64_t delta)
 {
 	atomic_add_64(&arc_loaned_bytes, delta);
 
 	/* assert that it did not wrap around */
 	ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
 }
 
 /*
  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
  * flight data by arc_tempreserve_space() until they are "returned". Loaned
  * buffers must be returned to the arc before they can be used by the DMU or
  * freed.
  */
 arc_buf_t *
 arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
 {
 	arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
 	    is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
 
 	arc_loaned_bytes_update(arc_buf_size(buf));
 
 	return (buf);
 }
 
 arc_buf_t *
 arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
     enum zio_compress compression_type, uint8_t complevel)
 {
 	arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
 	    psize, lsize, compression_type, complevel);
 
 	arc_loaned_bytes_update(arc_buf_size(buf));
 
 	return (buf);
 }
 
 arc_buf_t *
 arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder,
     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
     dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
     enum zio_compress compression_type, uint8_t complevel)
 {
 	arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj,
 	    byteorder, salt, iv, mac, ot, psize, lsize, compression_type,
 	    complevel);
 
 	atomic_add_64(&arc_loaned_bytes, psize);
 	return (buf);
 }
 
 
 /*
  * Return a loaned arc buffer to the arc.
  */
 void
 arc_return_buf(arc_buf_t *buf, const void *tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT3P(buf->b_data, !=, NULL);
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	(void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
 	(void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
 
 	arc_loaned_bytes_update(-arc_buf_size(buf));
 }
 
 /* Detach an arc_buf from a dbuf (tag) */
 void
 arc_loan_inuse_buf(arc_buf_t *buf, const void *tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT3P(buf->b_data, !=, NULL);
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	(void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
 	(void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
 
 	arc_loaned_bytes_update(arc_buf_size(buf));
 }
 
 static void
 l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
 {
 	l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
 
 	df->l2df_abd = abd;
 	df->l2df_size = size;
 	df->l2df_type = type;
 	mutex_enter(&l2arc_free_on_write_mtx);
 	list_insert_head(l2arc_free_on_write, df);
 	mutex_exit(&l2arc_free_on_write_mtx);
 }
 
 static void
 arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata)
 {
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 	arc_buf_contents_t type = arc_buf_type(hdr);
 	uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
 
 	/* protected by hash lock, if in the hash table */
 	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 		ASSERT(state != arc_anon && state != arc_l2c_only);
 
 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
 		    size, hdr);
 	}
 	(void) zfs_refcount_remove_many(&state->arcs_size[type], size, hdr);
 	if (type == ARC_BUFC_METADATA) {
 		arc_space_return(size, ARC_SPACE_META);
 	} else {
 		ASSERT(type == ARC_BUFC_DATA);
 		arc_space_return(size, ARC_SPACE_DATA);
 	}
 
 	if (free_rdata) {
 		l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type);
 	} else {
 		l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
 	}
 }
 
 /*
  * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
  * data buffer, we transfer the refcount ownership to the hdr and update
  * the appropriate kstats.
  */
 static void
 arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 {
 	ASSERT(arc_can_share(hdr, buf));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 	ASSERT(!ARC_BUF_ENCRYPTED(buf));
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 	/*
 	 * Start sharing the data buffer. We transfer the
 	 * refcount ownership to the hdr since it always owns
 	 * the refcount whenever an arc_buf_t is shared.
 	 */
 	zfs_refcount_transfer_ownership_many(
 	    &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)],
 	    arc_hdr_size(hdr), buf, hdr);
 	hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
 	abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
 	    HDR_ISTYPE_METADATA(hdr));
 	arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
 	buf->b_flags |= ARC_BUF_FLAG_SHARED;
 
 	/*
 	 * Since we've transferred ownership to the hdr we need
 	 * to increment its compressed and uncompressed kstats and
 	 * decrement the overhead size.
 	 */
 	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
 	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
 	ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
 }
 
 static void
 arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 {
 	ASSERT(arc_buf_is_shared(buf));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 	/*
 	 * We are no longer sharing this buffer so we need
 	 * to transfer its ownership to the rightful owner.
 	 */
 	zfs_refcount_transfer_ownership_many(
 	    &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)],
 	    arc_hdr_size(hdr), hdr, buf);
 	arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
 	abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
 	abd_free(hdr->b_l1hdr.b_pabd);
 	hdr->b_l1hdr.b_pabd = NULL;
 	buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
 
 	/*
 	 * Since the buffer is no longer shared between
 	 * the arc buf and the hdr, count it as overhead.
 	 */
 	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
 	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
 	ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
 }
 
 /*
  * Remove an arc_buf_t from the hdr's buf list and return the last
  * arc_buf_t on the list. If no buffers remain on the list then return
  * NULL.
  */
 static arc_buf_t *
 arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 {
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 	arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
 	arc_buf_t *lastbuf = NULL;
 
 	/*
 	 * Remove the buf from the hdr list and locate the last
 	 * remaining buffer on the list.
 	 */
 	while (*bufp != NULL) {
 		if (*bufp == buf)
 			*bufp = buf->b_next;
 
 		/*
 		 * If we've removed a buffer in the middle of
 		 * the list then update the lastbuf and update
 		 * bufp.
 		 */
 		if (*bufp != NULL) {
 			lastbuf = *bufp;
 			bufp = &(*bufp)->b_next;
 		}
 	}
 	buf->b_next = NULL;
 	ASSERT3P(lastbuf, !=, buf);
 	IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
 
 	return (lastbuf);
 }
 
 /*
  * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
  * list and free it.
  */
 static void
 arc_buf_destroy_impl(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	/*
 	 * Free up the data associated with the buf but only if we're not
 	 * sharing this with the hdr. If we are sharing it with the hdr, the
 	 * hdr is responsible for doing the free.
 	 */
 	if (buf->b_data != NULL) {
 		/*
 		 * We're about to change the hdr's b_flags. We must either
 		 * hold the hash_lock or be undiscoverable.
 		 */
 		ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 		arc_cksum_verify(buf);
 		arc_buf_unwatch(buf);
 
 		if (ARC_BUF_SHARED(buf)) {
 			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
 		} else {
 			ASSERT(!arc_buf_is_shared(buf));
 			uint64_t size = arc_buf_size(buf);
 			arc_free_data_buf(hdr, buf->b_data, size, buf);
 			ARCSTAT_INCR(arcstat_overhead_size, -size);
 		}
 		buf->b_data = NULL;
 
 		/*
 		 * If we have no more encrypted buffers and we've already
 		 * gotten a copy of the decrypted data we can free b_rabd
 		 * to save some space.
 		 */
 		if (ARC_BUF_ENCRYPTED(buf) && HDR_HAS_RABD(hdr) &&
 		    hdr->b_l1hdr.b_pabd != NULL && !HDR_IO_IN_PROGRESS(hdr)) {
 			arc_buf_t *b;
 			for (b = hdr->b_l1hdr.b_buf; b; b = b->b_next) {
 				if (b != buf && ARC_BUF_ENCRYPTED(b))
 					break;
 			}
 			if (b == NULL)
 				arc_hdr_free_abd(hdr, B_TRUE);
 		}
 	}
 
 	arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
 
 	if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
 		/*
 		 * If the current arc_buf_t is sharing its data buffer with the
 		 * hdr, then reassign the hdr's b_pabd to share it with the new
 		 * buffer at the end of the list. The shared buffer is always
 		 * the last one on the hdr's buffer list.
 		 *
 		 * There is an equivalent case for compressed bufs, but since
 		 * they aren't guaranteed to be the last buf in the list and
 		 * that is an exceedingly rare case, we just allow that space be
 		 * wasted temporarily. We must also be careful not to share
 		 * encrypted buffers, since they cannot be shared.
 		 */
 		if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) {
 			/* Only one buf can be shared at once */
 			ASSERT(!arc_buf_is_shared(lastbuf));
 			/* hdr is uncompressed so can't have compressed buf */
 			ASSERT(!ARC_BUF_COMPRESSED(lastbuf));
 
 			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 			arc_hdr_free_abd(hdr, B_FALSE);
 
 			/*
 			 * We must setup a new shared block between the
 			 * last buffer and the hdr. The data would have
 			 * been allocated by the arc buf so we need to transfer
 			 * ownership to the hdr since it's now being shared.
 			 */
 			arc_share_buf(hdr, lastbuf);
 		}
 	} else if (HDR_SHARED_DATA(hdr)) {
 		/*
 		 * Uncompressed shared buffers are always at the end
 		 * of the list. Compressed buffers don't have the
 		 * same requirements. This makes it hard to
 		 * simply assert that the lastbuf is shared so
 		 * we rely on the hdr's compression flags to determine
 		 * if we have a compressed, shared buffer.
 		 */
 		ASSERT3P(lastbuf, !=, NULL);
 		ASSERT(arc_buf_is_shared(lastbuf) ||
 		    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
 	}
 
 	/*
 	 * Free the checksum if we're removing the last uncompressed buf from
 	 * this hdr.
 	 */
 	if (!arc_hdr_has_uncompressed_buf(hdr)) {
 		arc_cksum_free(hdr);
 	}
 
 	/* clean up the buf */
 	buf->b_hdr = NULL;
 	kmem_cache_free(buf_cache, buf);
 }
 
 static void
 arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags)
 {
 	uint64_t size;
 	boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0);
 
 	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata);
 	IMPLY(alloc_rdata, HDR_PROTECTED(hdr));
 
 	if (alloc_rdata) {
 		size = HDR_GET_PSIZE(hdr);
 		ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL);
 		hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr,
 		    alloc_flags);
 		ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL);
 		ARCSTAT_INCR(arcstat_raw_size, size);
 	} else {
 		size = arc_hdr_size(hdr);
 		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 		hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr,
 		    alloc_flags);
 		ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 	}
 
 	ARCSTAT_INCR(arcstat_compressed_size, size);
 	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
 }
 
 static void
 arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata)
 {
 	uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
 	IMPLY(free_rdata, HDR_HAS_RABD(hdr));
 
 	/*
 	 * If the hdr is currently being written to the l2arc then
 	 * we defer freeing the data by adding it to the l2arc_free_on_write
 	 * list. The l2arc will free the data once it's finished
 	 * writing it to the l2arc device.
 	 */
 	if (HDR_L2_WRITING(hdr)) {
 		arc_hdr_free_on_write(hdr, free_rdata);
 		ARCSTAT_BUMP(arcstat_l2_free_on_write);
 	} else if (free_rdata) {
 		arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr);
 	} else {
 		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, size, hdr);
 	}
 
 	if (free_rdata) {
 		hdr->b_crypt_hdr.b_rabd = NULL;
 		ARCSTAT_INCR(arcstat_raw_size, -size);
 	} else {
 		hdr->b_l1hdr.b_pabd = NULL;
 	}
 
 	if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr))
 		hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 
 	ARCSTAT_INCR(arcstat_compressed_size, -size);
 	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
 }
 
 /*
  * Allocate empty anonymous ARC header.  The header will get its identity
  * assigned and buffers attached later as part of read or write operations.
  *
  * In case of read arc_read() assigns header its identify (b_dva + b_birth),
  * inserts it into ARC hash to become globally visible and allocates physical
  * (b_pabd) or raw (b_rabd) ABD buffer to read into from disk.  On disk read
  * completion arc_read_done() allocates ARC buffer(s) as needed, potentially
  * sharing one of them with the physical ABD buffer.
  *
  * In case of write arc_alloc_buf() allocates ARC buffer to be filled with
  * data.  Then after compression and/or encryption arc_write_ready() allocates
  * and fills (or potentially shares) physical (b_pabd) or raw (b_rabd) ABD
  * buffer.  On disk write completion arc_write_done() assigns the header its
  * new identity (b_dva + b_birth) and inserts into ARC hash.
  *
  * In case of partial overwrite the old data is read first as described. Then
  * arc_release() either allocates new anonymous ARC header and moves the ARC
  * buffer to it, or reuses the old ARC header by discarding its identity and
  * removing it from ARC hash.  After buffer modification normal write process
  * follows as described.
  */
 static arc_buf_hdr_t *
 arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
     boolean_t protected, enum zio_compress compression_type, uint8_t complevel,
     arc_buf_contents_t type)
 {
 	arc_buf_hdr_t *hdr;
 
 	VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
 	hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
 
 	ASSERT(HDR_EMPTY(hdr));
 #ifdef ZFS_DEBUG
 	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
 #endif
 	HDR_SET_PSIZE(hdr, psize);
 	HDR_SET_LSIZE(hdr, lsize);
 	hdr->b_spa = spa;
 	hdr->b_type = type;
 	hdr->b_flags = 0;
 	arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
 	arc_hdr_set_compress(hdr, compression_type);
 	hdr->b_complevel = complevel;
 	if (protected)
 		arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
 
 	hdr->b_l1hdr.b_state = arc_anon;
 	hdr->b_l1hdr.b_arc_access = 0;
 	hdr->b_l1hdr.b_mru_hits = 0;
 	hdr->b_l1hdr.b_mru_ghost_hits = 0;
 	hdr->b_l1hdr.b_mfu_hits = 0;
 	hdr->b_l1hdr.b_mfu_ghost_hits = 0;
 	hdr->b_l1hdr.b_buf = NULL;
 
 	ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 
 	return (hdr);
 }
 
 /*
  * Transition between the two allocation states for the arc_buf_hdr struct.
  * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
  * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
  * version is used when a cache buffer is only in the L2ARC in order to reduce
  * memory usage.
  */
 static arc_buf_hdr_t *
 arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
 {
 	ASSERT(HDR_HAS_L2HDR(hdr));
 
 	arc_buf_hdr_t *nhdr;
 	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
 
 	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
 	    (old == hdr_l2only_cache && new == hdr_full_cache));
 
 	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
 
 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
 	buf_hash_remove(hdr);
 
 	memcpy(nhdr, hdr, HDR_L2ONLY_SIZE);
 
 	if (new == hdr_full_cache) {
 		arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
 		/*
 		 * arc_access and arc_change_state need to be aware that a
 		 * header has just come out of L2ARC, so we set its state to
 		 * l2c_only even though it's about to change.
 		 */
 		nhdr->b_l1hdr.b_state = arc_l2c_only;
 
 		/* Verify previous threads set to NULL before freeing */
 		ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
 		ASSERT(!HDR_HAS_RABD(hdr));
 	} else {
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 #ifdef ZFS_DEBUG
 		ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
 #endif
 
 		/*
 		 * If we've reached here, We must have been called from
 		 * arc_evict_hdr(), as such we should have already been
 		 * removed from any ghost list we were previously on
 		 * (which protects us from racing with arc_evict_state),
 		 * thus no locking is needed during this check.
 		 */
 		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 
 		/*
 		 * A buffer must not be moved into the arc_l2c_only
 		 * state if it's not finished being written out to the
 		 * l2arc device. Otherwise, the b_l1hdr.b_pabd field
 		 * might try to be accessed, even though it was removed.
 		 */
 		VERIFY(!HDR_L2_WRITING(hdr));
 		VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 		ASSERT(!HDR_HAS_RABD(hdr));
 
 		arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
 	}
 	/*
 	 * The header has been reallocated so we need to re-insert it into any
 	 * lists it was on.
 	 */
 	(void) buf_hash_insert(nhdr, NULL);
 
 	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
 
 	mutex_enter(&dev->l2ad_mtx);
 
 	/*
 	 * We must place the realloc'ed header back into the list at
 	 * the same spot. Otherwise, if it's placed earlier in the list,
 	 * l2arc_write_buffers() could find it during the function's
 	 * write phase, and try to write it out to the l2arc.
 	 */
 	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
 	list_remove(&dev->l2ad_buflist, hdr);
 
 	mutex_exit(&dev->l2ad_mtx);
 
 	/*
 	 * Since we're using the pointer address as the tag when
 	 * incrementing and decrementing the l2ad_alloc refcount, we
 	 * must remove the old pointer (that we're about to destroy) and
 	 * add the new pointer to the refcount. Otherwise we'd remove
 	 * the wrong pointer address when calling arc_hdr_destroy() later.
 	 */
 
 	(void) zfs_refcount_remove_many(&dev->l2ad_alloc,
 	    arc_hdr_size(hdr), hdr);
 	(void) zfs_refcount_add_many(&dev->l2ad_alloc,
 	    arc_hdr_size(nhdr), nhdr);
 
 	buf_discard_identity(hdr);
 	kmem_cache_free(old, hdr);
 
 	return (nhdr);
 }
 
 /*
  * This function is used by the send / receive code to convert a newly
  * allocated arc_buf_t to one that is suitable for a raw encrypted write. It
  * is also used to allow the root objset block to be updated without altering
  * its embedded MACs. Both block types will always be uncompressed so we do not
  * have to worry about compression type or psize.
  */
 void
 arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder,
     dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv,
     const uint8_t *mac)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET);
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
 
 	buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED);
 	arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
 	hdr->b_crypt_hdr.b_dsobj = dsobj;
 	hdr->b_crypt_hdr.b_ot = ot;
 	hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
 	    DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot);
 	if (!arc_hdr_has_uncompressed_buf(hdr))
 		arc_cksum_free(hdr);
 
 	if (salt != NULL)
 		memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
 	if (iv != NULL)
 		memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
 	if (mac != NULL)
 		memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
 }
 
 /*
  * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
  * The buf is returned thawed since we expect the consumer to modify it.
  */
 arc_buf_t *
 arc_alloc_buf(spa_t *spa, const void *tag, arc_buf_contents_t type,
     int32_t size)
 {
 	arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
 	    B_FALSE, ZIO_COMPRESS_OFF, 0, type);
 
 	arc_buf_t *buf = NULL;
 	VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE,
 	    B_FALSE, B_FALSE, &buf));
 	arc_buf_thaw(buf);
 
 	return (buf);
 }
 
 /*
  * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this
  * for bufs containing metadata.
  */
 arc_buf_t *
 arc_alloc_compressed_buf(spa_t *spa, const void *tag, uint64_t psize,
     uint64_t lsize, enum zio_compress compression_type, uint8_t complevel)
 {
 	ASSERT3U(lsize, >, 0);
 	ASSERT3U(lsize, >=, psize);
 	ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF);
 	ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
 
 	arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
 	    B_FALSE, compression_type, complevel, ARC_BUFC_DATA);
 
 	arc_buf_t *buf = NULL;
 	VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE,
 	    B_TRUE, B_FALSE, B_FALSE, &buf));
 	arc_buf_thaw(buf);
 
 	/*
 	 * To ensure that the hdr has the correct data in it if we call
 	 * arc_untransform() on this buf before it's been written to disk,
 	 * it's easiest if we just set up sharing between the buf and the hdr.
 	 */
 	arc_share_buf(hdr, buf);
 
 	return (buf);
 }
 
 arc_buf_t *
 arc_alloc_raw_buf(spa_t *spa, const void *tag, uint64_t dsobj,
     boolean_t byteorder, const uint8_t *salt, const uint8_t *iv,
     const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
     enum zio_compress compression_type, uint8_t complevel)
 {
 	arc_buf_hdr_t *hdr;
 	arc_buf_t *buf;
 	arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ?
 	    ARC_BUFC_METADATA : ARC_BUFC_DATA;
 
 	ASSERT3U(lsize, >, 0);
 	ASSERT3U(lsize, >=, psize);
 	ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF);
 	ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
 
 	hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE,
 	    compression_type, complevel, type);
 
 	hdr->b_crypt_hdr.b_dsobj = dsobj;
 	hdr->b_crypt_hdr.b_ot = ot;
 	hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
 	    DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot);
 	memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
 	memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
 	memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
 
 	/*
 	 * This buffer will be considered encrypted even if the ot is not an
 	 * encrypted type. It will become authenticated instead in
 	 * arc_write_ready().
 	 */
 	buf = NULL;
 	VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE,
 	    B_FALSE, B_FALSE, &buf));
 	arc_buf_thaw(buf);
 
 	return (buf);
 }
 
 static void
 l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
     boolean_t state_only)
 {
 	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
 	l2arc_dev_t *dev = l2hdr->b_dev;
 	uint64_t lsize = HDR_GET_LSIZE(hdr);
 	uint64_t psize = HDR_GET_PSIZE(hdr);
 	uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
 	arc_buf_contents_t type = hdr->b_type;
 	int64_t lsize_s;
 	int64_t psize_s;
 	int64_t asize_s;
 
 	if (incr) {
 		lsize_s = lsize;
 		psize_s = psize;
 		asize_s = asize;
 	} else {
 		lsize_s = -lsize;
 		psize_s = -psize;
 		asize_s = -asize;
 	}
 
 	/* If the buffer is a prefetch, count it as such. */
 	if (HDR_PREFETCH(hdr)) {
 		ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s);
 	} else {
 		/*
 		 * We use the value stored in the L2 header upon initial
 		 * caching in L2ARC. This value will be updated in case
 		 * an MRU/MRU_ghost buffer transitions to MFU but the L2ARC
 		 * metadata (log entry) cannot currently be updated. Having
 		 * the ARC state in the L2 header solves the problem of a
 		 * possibly absent L1 header (apparent in buffers restored
 		 * from persistent L2ARC).
 		 */
 		switch (hdr->b_l2hdr.b_arcs_state) {
 			case ARC_STATE_MRU_GHOST:
 			case ARC_STATE_MRU:
 				ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s);
 				break;
 			case ARC_STATE_MFU_GHOST:
 			case ARC_STATE_MFU:
 				ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s);
 				break;
 			default:
 				break;
 		}
 	}
 
 	if (state_only)
 		return;
 
 	ARCSTAT_INCR(arcstat_l2_psize, psize_s);
 	ARCSTAT_INCR(arcstat_l2_lsize, lsize_s);
 
 	switch (type) {
 		case ARC_BUFC_DATA:
 			ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s);
 			break;
 		case ARC_BUFC_METADATA:
 			ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s);
 			break;
 		default:
 			break;
 	}
 }
 
 
 static void
 arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
 {
 	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
 	l2arc_dev_t *dev = l2hdr->b_dev;
 	uint64_t psize = HDR_GET_PSIZE(hdr);
 	uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
 
 	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
 	ASSERT(HDR_HAS_L2HDR(hdr));
 
 	list_remove(&dev->l2ad_buflist, hdr);
 
 	l2arc_hdr_arcstats_decrement(hdr);
 	vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
 
 	(void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr),
 	    hdr);
 	arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
 }
 
 static void
 arc_hdr_destroy(arc_buf_hdr_t *hdr)
 {
 	if (HDR_HAS_L1HDR(hdr)) {
 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
 	}
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 	ASSERT(!HDR_IN_HASH_TABLE(hdr));
 
 	if (HDR_HAS_L2HDR(hdr)) {
 		l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
 		boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
 
 		if (!buflist_held)
 			mutex_enter(&dev->l2ad_mtx);
 
 		/*
 		 * Even though we checked this conditional above, we
 		 * need to check this again now that we have the
 		 * l2ad_mtx. This is because we could be racing with
 		 * another thread calling l2arc_evict() which might have
 		 * destroyed this header's L2 portion as we were waiting
 		 * to acquire the l2ad_mtx. If that happens, we don't
 		 * want to re-destroy the header's L2 portion.
 		 */
 		if (HDR_HAS_L2HDR(hdr)) {
 
 			if (!HDR_EMPTY(hdr))
 				buf_discard_identity(hdr);
 
 			arc_hdr_l2hdr_destroy(hdr);
 		}
 
 		if (!buflist_held)
 			mutex_exit(&dev->l2ad_mtx);
 	}
 
 	/*
 	 * The header's identify can only be safely discarded once it is no
 	 * longer discoverable.  This requires removing it from the hash table
 	 * and the l2arc header list.  After this point the hash lock can not
 	 * be used to protect the header.
 	 */
 	if (!HDR_EMPTY(hdr))
 		buf_discard_identity(hdr);
 
 	if (HDR_HAS_L1HDR(hdr)) {
 		arc_cksum_free(hdr);
 
 		while (hdr->b_l1hdr.b_buf != NULL)
 			arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
 
 		if (hdr->b_l1hdr.b_pabd != NULL)
 			arc_hdr_free_abd(hdr, B_FALSE);
 
 		if (HDR_HAS_RABD(hdr))
 			arc_hdr_free_abd(hdr, B_TRUE);
 	}
 
 	ASSERT3P(hdr->b_hash_next, ==, NULL);
 	if (HDR_HAS_L1HDR(hdr)) {
 		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
 #ifdef ZFS_DEBUG
 		ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
 #endif
 		kmem_cache_free(hdr_full_cache, hdr);
 	} else {
 		kmem_cache_free(hdr_l2only_cache, hdr);
 	}
 }
 
 void
 arc_buf_destroy(arc_buf_t *buf, const void *tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	if (hdr->b_l1hdr.b_state == arc_anon) {
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
 		ASSERT(ARC_BUF_LAST(buf));
 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 		VERIFY0(remove_reference(hdr, tag));
 		return;
 	}
 
 	kmutex_t *hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 
 	ASSERT3P(hdr, ==, buf->b_hdr);
 	ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 	ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
 	ASSERT3P(buf->b_data, !=, NULL);
 
 	arc_buf_destroy_impl(buf);
 	(void) remove_reference(hdr, tag);
 	mutex_exit(hash_lock);
 }
 
 /*
  * Evict the arc_buf_hdr that is provided as a parameter. The resultant
  * state of the header is dependent on its state prior to entering this
  * function. The following transitions are possible:
  *
  *    - arc_mru -> arc_mru_ghost
  *    - arc_mfu -> arc_mfu_ghost
  *    - arc_mru_ghost -> arc_l2c_only
  *    - arc_mru_ghost -> deleted
  *    - arc_mfu_ghost -> arc_l2c_only
  *    - arc_mfu_ghost -> deleted
  *    - arc_uncached -> deleted
  *
  * Return total size of evicted data buffers for eviction progress tracking.
  * When evicting from ghost states return logical buffer size to make eviction
  * progress at the same (or at least comparable) rate as from non-ghost states.
  *
  * Return *real_evicted for actual ARC size reduction to wake up threads
  * waiting for it.  For non-ghost states it includes size of evicted data
  * buffers (the headers are not freed there).  For ghost states it includes
  * only the evicted headers size.
  */
 static int64_t
 arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted)
 {
 	arc_state_t *evicted_state, *state;
 	int64_t bytes_evicted = 0;
 	uint_t min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
 	    arc_min_prescient_prefetch_ms : arc_min_prefetch_ms;
 
 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 	ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 	ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
 
 	*real_evicted = 0;
 	state = hdr->b_l1hdr.b_state;
 	if (GHOST_STATE(state)) {
 
 		/*
 		 * l2arc_write_buffers() relies on a header's L1 portion
 		 * (i.e. its b_pabd field) during it's write phase.
 		 * Thus, we cannot push a header onto the arc_l2c_only
 		 * state (removing its L1 piece) until the header is
 		 * done being written to the l2arc.
 		 */
 		if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
 			ARCSTAT_BUMP(arcstat_evict_l2_skip);
 			return (bytes_evicted);
 		}
 
 		ARCSTAT_BUMP(arcstat_deleted);
 		bytes_evicted += HDR_GET_LSIZE(hdr);
 
 		DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
 
 		if (HDR_HAS_L2HDR(hdr)) {
 			ASSERT(hdr->b_l1hdr.b_pabd == NULL);
 			ASSERT(!HDR_HAS_RABD(hdr));
 			/*
 			 * This buffer is cached on the 2nd Level ARC;
 			 * don't destroy the header.
 			 */
 			arc_change_state(arc_l2c_only, hdr);
 			/*
 			 * dropping from L1+L2 cached to L2-only,
 			 * realloc to remove the L1 header.
 			 */
 			(void) arc_hdr_realloc(hdr, hdr_full_cache,
 			    hdr_l2only_cache);
 			*real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE;
 		} else {
 			arc_change_state(arc_anon, hdr);
 			arc_hdr_destroy(hdr);
 			*real_evicted += HDR_FULL_SIZE;
 		}
 		return (bytes_evicted);
 	}
 
 	ASSERT(state == arc_mru || state == arc_mfu || state == arc_uncached);
 	evicted_state = (state == arc_uncached) ? arc_anon :
 	    ((state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost);
 
 	/* prefetch buffers have a minimum lifespan */
 	if ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
 	    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
 	    MSEC_TO_TICK(min_lifetime)) {
 		ARCSTAT_BUMP(arcstat_evict_skip);
 		return (bytes_evicted);
 	}
 
 	if (HDR_HAS_L2HDR(hdr)) {
 		ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
 	} else {
 		if (l2arc_write_eligible(hdr->b_spa, hdr)) {
 			ARCSTAT_INCR(arcstat_evict_l2_eligible,
 			    HDR_GET_LSIZE(hdr));
 
 			switch (state->arcs_state) {
 				case ARC_STATE_MRU:
 					ARCSTAT_INCR(
 					    arcstat_evict_l2_eligible_mru,
 					    HDR_GET_LSIZE(hdr));
 					break;
 				case ARC_STATE_MFU:
 					ARCSTAT_INCR(
 					    arcstat_evict_l2_eligible_mfu,
 					    HDR_GET_LSIZE(hdr));
 					break;
 				default:
 					break;
 			}
 		} else {
 			ARCSTAT_INCR(arcstat_evict_l2_ineligible,
 			    HDR_GET_LSIZE(hdr));
 		}
 	}
 
 	bytes_evicted += arc_hdr_size(hdr);
 	*real_evicted += arc_hdr_size(hdr);
 
 	/*
 	 * If this hdr is being evicted and has a compressed buffer then we
 	 * discard it here before we change states.  This ensures that the
 	 * accounting is updated correctly in arc_free_data_impl().
 	 */
 	if (hdr->b_l1hdr.b_pabd != NULL)
 		arc_hdr_free_abd(hdr, B_FALSE);
 
 	if (HDR_HAS_RABD(hdr))
 		arc_hdr_free_abd(hdr, B_TRUE);
 
 	arc_change_state(evicted_state, hdr);
 	DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
 	if (evicted_state == arc_anon) {
 		arc_hdr_destroy(hdr);
 		*real_evicted += HDR_FULL_SIZE;
 	} else {
 		ASSERT(HDR_IN_HASH_TABLE(hdr));
 	}
 
 	return (bytes_evicted);
 }
 
 static void
 arc_set_need_free(void)
 {
 	ASSERT(MUTEX_HELD(&arc_evict_lock));
 	int64_t remaining = arc_free_memory() - arc_sys_free / 2;
 	arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters);
 	if (aw == NULL) {
 		arc_need_free = MAX(-remaining, 0);
 	} else {
 		arc_need_free =
 		    MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count));
 	}
 }
 
 static uint64_t
 arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
     uint64_t spa, uint64_t bytes)
 {
 	multilist_sublist_t *mls;
 	uint64_t bytes_evicted = 0, real_evicted = 0;
 	arc_buf_hdr_t *hdr;
 	kmutex_t *hash_lock;
 	uint_t evict_count = zfs_arc_evict_batch_limit;
 
 	ASSERT3P(marker, !=, NULL);
 
 	mls = multilist_sublist_lock(ml, idx);
 
 	for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL);
 	    hdr = multilist_sublist_prev(mls, marker)) {
 		if ((evict_count == 0) || (bytes_evicted >= bytes))
 			break;
 
 		/*
 		 * To keep our iteration location, move the marker
 		 * forward. Since we're not holding hdr's hash lock, we
 		 * must be very careful and not remove 'hdr' from the
 		 * sublist. Otherwise, other consumers might mistake the
 		 * 'hdr' as not being on a sublist when they call the
 		 * multilist_link_active() function (they all rely on
 		 * the hash lock protecting concurrent insertions and
 		 * removals). multilist_sublist_move_forward() was
 		 * specifically implemented to ensure this is the case
 		 * (only 'marker' will be removed and re-inserted).
 		 */
 		multilist_sublist_move_forward(mls, marker);
 
 		/*
 		 * The only case where the b_spa field should ever be
 		 * zero, is the marker headers inserted by
 		 * arc_evict_state(). It's possible for multiple threads
 		 * to be calling arc_evict_state() concurrently (e.g.
 		 * dsl_pool_close() and zio_inject_fault()), so we must
 		 * skip any markers we see from these other threads.
 		 */
 		if (hdr->b_spa == 0)
 			continue;
 
 		/* we're only interested in evicting buffers of a certain spa */
 		if (spa != 0 && hdr->b_spa != spa) {
 			ARCSTAT_BUMP(arcstat_evict_skip);
 			continue;
 		}
 
 		hash_lock = HDR_LOCK(hdr);
 
 		/*
 		 * We aren't calling this function from any code path
 		 * that would already be holding a hash lock, so we're
 		 * asserting on this assumption to be defensive in case
 		 * this ever changes. Without this check, it would be
 		 * possible to incorrectly increment arcstat_mutex_miss
 		 * below (e.g. if the code changed such that we called
 		 * this function with a hash lock held).
 		 */
 		ASSERT(!MUTEX_HELD(hash_lock));
 
 		if (mutex_tryenter(hash_lock)) {
 			uint64_t revicted;
 			uint64_t evicted = arc_evict_hdr(hdr, &revicted);
 			mutex_exit(hash_lock);
 
 			bytes_evicted += evicted;
 			real_evicted += revicted;
 
 			/*
 			 * If evicted is zero, arc_evict_hdr() must have
 			 * decided to skip this header, don't increment
 			 * evict_count in this case.
 			 */
 			if (evicted != 0)
 				evict_count--;
 
 		} else {
 			ARCSTAT_BUMP(arcstat_mutex_miss);
 		}
 	}
 
 	multilist_sublist_unlock(mls);
 
 	/*
 	 * Increment the count of evicted bytes, and wake up any threads that
 	 * are waiting for the count to reach this value.  Since the list is
 	 * ordered by ascending aew_count, we pop off the beginning of the
 	 * list until we reach the end, or a waiter that's past the current
 	 * "count".  Doing this outside the loop reduces the number of times
 	 * we need to acquire the global arc_evict_lock.
 	 *
 	 * Only wake when there's sufficient free memory in the system
 	 * (specifically, arc_sys_free/2, which by default is a bit more than
 	 * 1/64th of RAM).  See the comments in arc_wait_for_eviction().
 	 */
 	mutex_enter(&arc_evict_lock);
 	arc_evict_count += real_evicted;
 
 	if (arc_free_memory() > arc_sys_free / 2) {
 		arc_evict_waiter_t *aw;
 		while ((aw = list_head(&arc_evict_waiters)) != NULL &&
 		    aw->aew_count <= arc_evict_count) {
 			list_remove(&arc_evict_waiters, aw);
 			cv_broadcast(&aw->aew_cv);
 		}
 	}
 	arc_set_need_free();
 	mutex_exit(&arc_evict_lock);
 
 	/*
 	 * If the ARC size is reduced from arc_c_max to arc_c_min (especially
 	 * if the average cached block is small), eviction can be on-CPU for
 	 * many seconds.  To ensure that other threads that may be bound to
 	 * this CPU are able to make progress, make a voluntary preemption
 	 * call here.
 	 */
 	kpreempt(KPREEMPT_SYNC);
 
 	return (bytes_evicted);
 }
 
 /*
  * Allocate an array of buffer headers used as placeholders during arc state
  * eviction.
  */
 static arc_buf_hdr_t **
 arc_state_alloc_markers(int count)
 {
 	arc_buf_hdr_t **markers;
 
 	markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP);
 	for (int i = 0; i < count; i++) {
 		markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
 
 		/*
 		 * A b_spa of 0 is used to indicate that this header is
 		 * a marker. This fact is used in arc_evict_state_impl().
 		 */
 		markers[i]->b_spa = 0;
 
 	}
 	return (markers);
 }
 
 static void
 arc_state_free_markers(arc_buf_hdr_t **markers, int count)
 {
 	for (int i = 0; i < count; i++)
 		kmem_cache_free(hdr_full_cache, markers[i]);
 	kmem_free(markers, sizeof (*markers) * count);
 }
 
 /*
  * Evict buffers from the given arc state, until we've removed the
  * specified number of bytes. Move the removed buffers to the
  * appropriate evict state.
  *
  * This function makes a "best effort". It skips over any buffers
  * it can't get a hash_lock on, and so, may not catch all candidates.
  * It may also return without evicting as much space as requested.
  *
  * If bytes is specified using the special value ARC_EVICT_ALL, this
  * will evict all available (i.e. unlocked and evictable) buffers from
  * the given arc state; which is used by arc_flush().
  */
 static uint64_t
 arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
     uint64_t bytes)
 {
 	uint64_t total_evicted = 0;
 	multilist_t *ml = &state->arcs_list[type];
 	int num_sublists;
 	arc_buf_hdr_t **markers;
 
 	num_sublists = multilist_get_num_sublists(ml);
 
 	/*
 	 * If we've tried to evict from each sublist, made some
 	 * progress, but still have not hit the target number of bytes
 	 * to evict, we want to keep trying. The markers allow us to
 	 * pick up where we left off for each individual sublist, rather
 	 * than starting from the tail each time.
 	 */
 	if (zthr_iscurthread(arc_evict_zthr)) {
 		markers = arc_state_evict_markers;
 		ASSERT3S(num_sublists, <=, arc_state_evict_marker_count);
 	} else {
 		markers = arc_state_alloc_markers(num_sublists);
 	}
 	for (int i = 0; i < num_sublists; i++) {
 		multilist_sublist_t *mls;
 
 		mls = multilist_sublist_lock(ml, i);
 		multilist_sublist_insert_tail(mls, markers[i]);
 		multilist_sublist_unlock(mls);
 	}
 
 	/*
 	 * While we haven't hit our target number of bytes to evict, or
 	 * we're evicting all available buffers.
 	 */
 	while (total_evicted < bytes) {
 		int sublist_idx = multilist_get_random_index(ml);
 		uint64_t scan_evicted = 0;
 
 		/*
 		 * Start eviction using a randomly selected sublist,
 		 * this is to try and evenly balance eviction across all
 		 * sublists. Always starting at the same sublist
 		 * (e.g. index 0) would cause evictions to favor certain
 		 * sublists over others.
 		 */
 		for (int i = 0; i < num_sublists; i++) {
 			uint64_t bytes_remaining;
 			uint64_t bytes_evicted;
 
 			if (total_evicted < bytes)
 				bytes_remaining = bytes - total_evicted;
 			else
 				break;
 
 			bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
 			    markers[sublist_idx], spa, bytes_remaining);
 
 			scan_evicted += bytes_evicted;
 			total_evicted += bytes_evicted;
 
 			/* we've reached the end, wrap to the beginning */
 			if (++sublist_idx >= num_sublists)
 				sublist_idx = 0;
 		}
 
 		/*
 		 * If we didn't evict anything during this scan, we have
 		 * no reason to believe we'll evict more during another
 		 * scan, so break the loop.
 		 */
 		if (scan_evicted == 0) {
 			/* This isn't possible, let's make that obvious */
 			ASSERT3S(bytes, !=, 0);
 
 			/*
 			 * When bytes is ARC_EVICT_ALL, the only way to
 			 * break the loop is when scan_evicted is zero.
 			 * In that case, we actually have evicted enough,
 			 * so we don't want to increment the kstat.
 			 */
 			if (bytes != ARC_EVICT_ALL) {
 				ASSERT3S(total_evicted, <, bytes);
 				ARCSTAT_BUMP(arcstat_evict_not_enough);
 			}
 
 			break;
 		}
 	}
 
 	for (int i = 0; i < num_sublists; i++) {
 		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
 		multilist_sublist_remove(mls, markers[i]);
 		multilist_sublist_unlock(mls);
 	}
 	if (markers != arc_state_evict_markers)
 		arc_state_free_markers(markers, num_sublists);
 
 	return (total_evicted);
 }
 
 /*
  * Flush all "evictable" data of the given type from the arc state
  * specified. This will not evict any "active" buffers (i.e. referenced).
  *
  * When 'retry' is set to B_FALSE, the function will make a single pass
  * over the state and evict any buffers that it can. Since it doesn't
  * continually retry the eviction, it might end up leaving some buffers
  * in the ARC due to lock misses.
  *
  * When 'retry' is set to B_TRUE, the function will continually retry the
  * eviction until *all* evictable buffers have been removed from the
  * state. As a result, if concurrent insertions into the state are
  * allowed (e.g. if the ARC isn't shutting down), this function might
  * wind up in an infinite loop, continually trying to evict buffers.
  */
 static uint64_t
 arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
     boolean_t retry)
 {
 	uint64_t evicted = 0;
 
 	while (zfs_refcount_count(&state->arcs_esize[type]) != 0) {
 		evicted += arc_evict_state(state, type, spa, ARC_EVICT_ALL);
 
 		if (!retry)
 			break;
 	}
 
 	return (evicted);
 }
 
 /*
  * Evict the specified number of bytes from the state specified. This
  * function prevents us from trying to evict more from a state's list
  * than is "evictable", and to skip evicting altogether when passed a
  * negative value for "bytes". In contrast, arc_evict_state() will
  * evict everything it can, when passed a negative value for "bytes".
  */
 static uint64_t
 arc_evict_impl(arc_state_t *state, arc_buf_contents_t type, int64_t bytes)
 {
 	uint64_t delta;
 
 	if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) {
 		delta = MIN(zfs_refcount_count(&state->arcs_esize[type]),
 		    bytes);
 		return (arc_evict_state(state, type, 0, delta));
 	}
 
 	return (0);
 }
 
 /*
  * Adjust specified fraction, taking into account initial ghost state(s) size,
  * ghost hit bytes towards increasing the fraction, ghost hit bytes towards
  * decreasing it, plus a balance factor, controlling the decrease rate, used
  * to balance metadata vs data.
  */
 static uint64_t
 arc_evict_adj(uint64_t frac, uint64_t total, uint64_t up, uint64_t down,
     uint_t balance)
 {
 	if (total < 8 || up + down == 0)
 		return (frac);
 
 	/*
 	 * We should not have more ghost hits than ghost size, but they
 	 * may get close.  Restrict maximum adjustment in that case.
 	 */
 	if (up + down >= total / 4) {
 		uint64_t scale = (up + down) / (total / 8);
 		up /= scale;
 		down /= scale;
 	}
 
 	/* Get maximal dynamic range by choosing optimal shifts. */
 	int s = highbit64(total);
 	s = MIN(64 - s, 32);
 
 	uint64_t ofrac = (1ULL << 32) - frac;
 
 	if (frac >= 4 * ofrac)
 		up /= frac / (2 * ofrac + 1);
 	up = (up << s) / (total >> (32 - s));
 	if (ofrac >= 4 * frac)
 		down /= ofrac / (2 * frac + 1);
 	down = (down << s) / (total >> (32 - s));
 	down = down * 100 / balance;
 
 	return (frac + up - down);
 }
 
 /*
  * Evict buffers from the cache, such that arcstat_size is capped by arc_c.
  */
 static uint64_t
 arc_evict(void)
 {
 	uint64_t asize, bytes, total_evicted = 0;
 	int64_t e, mrud, mrum, mfud, mfum, w;
 	static uint64_t ogrd, ogrm, ogfd, ogfm;
 	static uint64_t gsrd, gsrm, gsfd, gsfm;
 	uint64_t ngrd, ngrm, ngfd, ngfm;
 
 	/* Get current size of ARC states we can evict from. */
 	mrud = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_DATA]) +
 	    zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]);
 	mrum = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) +
 	    zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]);
 	mfud = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_DATA]);
 	mfum = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]);
 	uint64_t d = mrud + mfud;
 	uint64_t m = mrum + mfum;
 	uint64_t t = d + m;
 
 	/* Get ARC ghost hits since last eviction. */
 	ngrd = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]);
 	uint64_t grd = ngrd - ogrd;
 	ogrd = ngrd;
 	ngrm = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]);
 	uint64_t grm = ngrm - ogrm;
 	ogrm = ngrm;
 	ngfd = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]);
 	uint64_t gfd = ngfd - ogfd;
 	ogfd = ngfd;
 	ngfm = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]);
 	uint64_t gfm = ngfm - ogfm;
 	ogfm = ngfm;
 
 	/* Adjust ARC states balance based on ghost hits. */
 	arc_meta = arc_evict_adj(arc_meta, gsrd + gsrm + gsfd + gsfm,
 	    grm + gfm, grd + gfd, zfs_arc_meta_balance);
 	arc_pd = arc_evict_adj(arc_pd, gsrd + gsfd, grd, gfd, 100);
 	arc_pm = arc_evict_adj(arc_pm, gsrm + gsfm, grm, gfm, 100);
 
 	asize = aggsum_value(&arc_sums.arcstat_size);
 	int64_t wt = t - (asize - arc_c);
 
 	/*
 	 * Try to reduce pinned dnodes if more than 3/4 of wanted metadata
 	 * target is not evictable or if they go over arc_dnode_limit.
 	 */
 	int64_t prune = 0;
 	int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size);
 	w = wt * (int64_t)(arc_meta >> 16) >> 16;
 	if (zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) +
 	    zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) -
 	    zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) -
 	    zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]) >
 	    w * 3 / 4) {
 		prune = dn / sizeof (dnode_t) *
 		    zfs_arc_dnode_reduce_percent / 100;
 	} else if (dn > arc_dnode_limit) {
 		prune = (dn - arc_dnode_limit) / sizeof (dnode_t) *
 		    zfs_arc_dnode_reduce_percent / 100;
 	}
 	if (prune > 0)
 		arc_prune_async(prune);
 
 	/* Evict MRU metadata. */
 	w = wt * (int64_t)(arc_meta * arc_pm >> 48) >> 16;
 	e = MIN((int64_t)(asize - arc_c), (int64_t)(mrum - w));
 	bytes = arc_evict_impl(arc_mru, ARC_BUFC_METADATA, e);
 	total_evicted += bytes;
 	mrum -= bytes;
 	asize -= bytes;
 
 	/* Evict MFU metadata. */
 	w = wt * (int64_t)(arc_meta >> 16) >> 16;
 	e = MIN((int64_t)(asize - arc_c), (int64_t)(m - w));
 	bytes = arc_evict_impl(arc_mfu, ARC_BUFC_METADATA, e);
 	total_evicted += bytes;
 	mfum -= bytes;
 	asize -= bytes;
 
 	/* Evict MRU data. */
 	wt -= m - total_evicted;
 	w = wt * (int64_t)(arc_pd >> 16) >> 16;
 	e = MIN((int64_t)(asize - arc_c), (int64_t)(mrud - w));
 	bytes = arc_evict_impl(arc_mru, ARC_BUFC_DATA, e);
 	total_evicted += bytes;
 	mrud -= bytes;
 	asize -= bytes;
 
 	/* Evict MFU data. */
 	e = asize - arc_c;
 	bytes = arc_evict_impl(arc_mfu, ARC_BUFC_DATA, e);
 	mfud -= bytes;
 	total_evicted += bytes;
 
 	/*
 	 * Evict ghost lists
 	 *
 	 * Size of each state's ghost list represents how much that state
 	 * may grow by shrinking the other states.  Would it need to shrink
 	 * other states to zero (that is unlikely), its ghost size would be
 	 * equal to sum of other three state sizes.  But excessive ghost
 	 * size may result in false ghost hits (too far back), that may
 	 * never result in real cache hits if several states are competing.
 	 * So choose some arbitraty point of 1/2 of other state sizes.
 	 */
 	gsrd = (mrum + mfud + mfum) / 2;
 	e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]) -
 	    gsrd;
 	(void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_DATA, e);
 
 	gsrm = (mrud + mfud + mfum) / 2;
 	e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]) -
 	    gsrm;
 	(void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_METADATA, e);
 
 	gsfd = (mrud + mrum + mfum) / 2;
 	e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]) -
 	    gsfd;
 	(void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_DATA, e);
 
 	gsfm = (mrud + mrum + mfud) / 2;
 	e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]) -
 	    gsfm;
 	(void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_METADATA, e);
 
 	return (total_evicted);
 }
 
 void
 arc_flush(spa_t *spa, boolean_t retry)
 {
 	uint64_t guid = 0;
 
 	/*
 	 * If retry is B_TRUE, a spa must not be specified since we have
 	 * no good way to determine if all of a spa's buffers have been
 	 * evicted from an arc state.
 	 */
 	ASSERT(!retry || spa == NULL);
 
 	if (spa != NULL)
 		guid = spa_load_guid(spa);
 
 	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
 	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
 
 	(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
 	(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
 
 	(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
 	(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
 
 	(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
 	(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
 
 	(void) arc_flush_state(arc_uncached, guid, ARC_BUFC_DATA, retry);
 	(void) arc_flush_state(arc_uncached, guid, ARC_BUFC_METADATA, retry);
 }
 
 void
 arc_reduce_target_size(int64_t to_free)
 {
 	uint64_t c = arc_c;
 
 	if (c <= arc_c_min)
 		return;
 
 	/*
 	 * All callers want the ARC to actually evict (at least) this much
 	 * memory.  Therefore we reduce from the lower of the current size and
 	 * the target size.  This way, even if arc_c is much higher than
 	 * arc_size (as can be the case after many calls to arc_freed(), we will
 	 * immediately have arc_c < arc_size and therefore the arc_evict_zthr
 	 * will evict.
 	 */
 	uint64_t asize = aggsum_value(&arc_sums.arcstat_size);
 	if (asize < c)
 		to_free += c - asize;
 	arc_c = MAX((int64_t)c - to_free, (int64_t)arc_c_min);
 
 	/* See comment in arc_evict_cb_check() on why lock+flag */
 	mutex_enter(&arc_evict_lock);
 	arc_evict_needed = B_TRUE;
 	mutex_exit(&arc_evict_lock);
 	zthr_wakeup(arc_evict_zthr);
 }
 
 /*
  * Determine if the system is under memory pressure and is asking
  * to reclaim memory. A return value of B_TRUE indicates that the system
  * is under memory pressure and that the arc should adjust accordingly.
  */
 boolean_t
 arc_reclaim_needed(void)
 {
 	return (arc_available_memory() < 0);
 }
 
 void
 arc_kmem_reap_soon(void)
 {
 	size_t			i;
 	kmem_cache_t		*prev_cache = NULL;
 	kmem_cache_t		*prev_data_cache = NULL;
 
 #ifdef _KERNEL
 #if defined(_ILP32)
 	/*
 	 * Reclaim unused memory from all kmem caches.
 	 */
 	kmem_reap();
 #endif
 #endif
 
 	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
 #if defined(_ILP32)
 		/* reach upper limit of cache size on 32-bit */
 		if (zio_buf_cache[i] == NULL)
 			break;
 #endif
 		if (zio_buf_cache[i] != prev_cache) {
 			prev_cache = zio_buf_cache[i];
 			kmem_cache_reap_now(zio_buf_cache[i]);
 		}
 		if (zio_data_buf_cache[i] != prev_data_cache) {
 			prev_data_cache = zio_data_buf_cache[i];
 			kmem_cache_reap_now(zio_data_buf_cache[i]);
 		}
 	}
 	kmem_cache_reap_now(buf_cache);
 	kmem_cache_reap_now(hdr_full_cache);
 	kmem_cache_reap_now(hdr_l2only_cache);
 	kmem_cache_reap_now(zfs_btree_leaf_cache);
 	abd_cache_reap_now();
 }
 
 static boolean_t
 arc_evict_cb_check(void *arg, zthr_t *zthr)
 {
 	(void) arg, (void) zthr;
 
 #ifdef ZFS_DEBUG
 	/*
 	 * This is necessary in order to keep the kstat information
 	 * up to date for tools that display kstat data such as the
 	 * mdb ::arc dcmd and the Linux crash utility.  These tools
 	 * typically do not call kstat's update function, but simply
 	 * dump out stats from the most recent update.  Without
 	 * this call, these commands may show stale stats for the
 	 * anon, mru, mru_ghost, mfu, and mfu_ghost lists.  Even
 	 * with this call, the data might be out of date if the
 	 * evict thread hasn't been woken recently; but that should
 	 * suffice.  The arc_state_t structures can be queried
 	 * directly if more accurate information is needed.
 	 */
 	if (arc_ksp != NULL)
 		arc_ksp->ks_update(arc_ksp, KSTAT_READ);
 #endif
 
 	/*
 	 * We have to rely on arc_wait_for_eviction() to tell us when to
 	 * evict, rather than checking if we are overflowing here, so that we
 	 * are sure to not leave arc_wait_for_eviction() waiting on aew_cv.
 	 * If we have become "not overflowing" since arc_wait_for_eviction()
 	 * checked, we need to wake it up.  We could broadcast the CV here,
 	 * but arc_wait_for_eviction() may have not yet gone to sleep.  We
 	 * would need to use a mutex to ensure that this function doesn't
 	 * broadcast until arc_wait_for_eviction() has gone to sleep (e.g.
 	 * the arc_evict_lock).  However, the lock ordering of such a lock
 	 * would necessarily be incorrect with respect to the zthr_lock,
 	 * which is held before this function is called, and is held by
 	 * arc_wait_for_eviction() when it calls zthr_wakeup().
 	 */
 	if (arc_evict_needed)
 		return (B_TRUE);
 
 	/*
 	 * If we have buffers in uncached state, evict them periodically.
 	 */
 	return ((zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_DATA]) +
 	    zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]) &&
 	    ddi_get_lbolt() - arc_last_uncached_flush >
 	    MSEC_TO_TICK(arc_min_prefetch_ms / 2)));
 }
 
 /*
  * Keep arc_size under arc_c by running arc_evict which evicts data
  * from the ARC.
  */
 static void
 arc_evict_cb(void *arg, zthr_t *zthr)
 {
 	(void) arg;
 
 	uint64_t evicted = 0;
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	/* Always try to evict from uncached state. */
 	arc_last_uncached_flush = ddi_get_lbolt();
 	evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_DATA, B_FALSE);
 	evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_METADATA, B_FALSE);
 
 	/* Evict from other states only if told to. */
 	if (arc_evict_needed)
 		evicted += arc_evict();
 
 	/*
 	 * If evicted is zero, we couldn't evict anything
 	 * via arc_evict(). This could be due to hash lock
 	 * collisions, but more likely due to the majority of
 	 * arc buffers being unevictable. Therefore, even if
 	 * arc_size is above arc_c, another pass is unlikely to
 	 * be helpful and could potentially cause us to enter an
 	 * infinite loop.  Additionally, zthr_iscancelled() is
 	 * checked here so that if the arc is shutting down, the
 	 * broadcast will wake any remaining arc evict waiters.
 	 *
 	 * Note we cancel using zthr instead of arc_evict_zthr
 	 * because the latter may not yet be initializd when the
 	 * callback is first invoked.
 	 */
 	mutex_enter(&arc_evict_lock);
 	arc_evict_needed = !zthr_iscancelled(zthr) &&
 	    evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0;
 	if (!arc_evict_needed) {
 		/*
 		 * We're either no longer overflowing, or we
 		 * can't evict anything more, so we should wake
 		 * arc_get_data_impl() sooner.
 		 */
 		arc_evict_waiter_t *aw;
 		while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) {
 			cv_broadcast(&aw->aew_cv);
 		}
 		arc_set_need_free();
 	}
 	mutex_exit(&arc_evict_lock);
 	spl_fstrans_unmark(cookie);
 }
 
 static boolean_t
 arc_reap_cb_check(void *arg, zthr_t *zthr)
 {
 	(void) arg, (void) zthr;
 
 	int64_t free_memory = arc_available_memory();
 	static int reap_cb_check_counter = 0;
 
 	/*
 	 * If a kmem reap is already active, don't schedule more.  We must
 	 * check for this because kmem_cache_reap_soon() won't actually
 	 * block on the cache being reaped (this is to prevent callers from
 	 * becoming implicitly blocked by a system-wide kmem reap -- which,
 	 * on a system with many, many full magazines, can take minutes).
 	 */
 	if (!kmem_cache_reap_active() && free_memory < 0) {
 
 		arc_no_grow = B_TRUE;
 		arc_warm = B_TRUE;
 		/*
 		 * Wait at least zfs_grow_retry (default 5) seconds
 		 * before considering growing.
 		 */
 		arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
 		return (B_TRUE);
 	} else if (free_memory < arc_c >> arc_no_grow_shift) {
 		arc_no_grow = B_TRUE;
 	} else if (gethrtime() >= arc_growtime) {
 		arc_no_grow = B_FALSE;
 	}
 
 	/*
 	 * Called unconditionally every 60 seconds to reclaim unused
 	 * zstd compression and decompression context. This is done
 	 * here to avoid the need for an independent thread.
 	 */
 	if (!((reap_cb_check_counter++) % 60))
 		zfs_zstd_cache_reap_now();
 
 	return (B_FALSE);
 }
 
 /*
  * Keep enough free memory in the system by reaping the ARC's kmem
  * caches.  To cause more slabs to be reapable, we may reduce the
  * target size of the cache (arc_c), causing the arc_evict_cb()
  * to free more buffers.
  */
 static void
 arc_reap_cb(void *arg, zthr_t *zthr)
 {
 	(void) arg, (void) zthr;
 
 	int64_t free_memory;
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	/*
 	 * Kick off asynchronous kmem_reap()'s of all our caches.
 	 */
 	arc_kmem_reap_soon();
 
 	/*
 	 * Wait at least arc_kmem_cache_reap_retry_ms between
 	 * arc_kmem_reap_soon() calls. Without this check it is possible to
 	 * end up in a situation where we spend lots of time reaping
 	 * caches, while we're near arc_c_min.  Waiting here also gives the
 	 * subsequent free memory check a chance of finding that the
 	 * asynchronous reap has already freed enough memory, and we don't
 	 * need to call arc_reduce_target_size().
 	 */
 	delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000);
 
 	/*
 	 * Reduce the target size as needed to maintain the amount of free
 	 * memory in the system at a fraction of the arc_size (1/128th by
 	 * default).  If oversubscribed (free_memory < 0) then reduce the
 	 * target arc_size by the deficit amount plus the fractional
 	 * amount.  If free memory is positive but less than the fractional
 	 * amount, reduce by what is needed to hit the fractional amount.
 	 */
 	free_memory = arc_available_memory();
 
 	int64_t can_free = arc_c - arc_c_min;
 	if (can_free > 0) {
 		int64_t to_free = (can_free >> arc_shrink_shift) - free_memory;
 		if (to_free > 0)
 			arc_reduce_target_size(to_free);
 	}
 	spl_fstrans_unmark(cookie);
 }
 
 #ifdef _KERNEL
 /*
  * Determine the amount of memory eligible for eviction contained in the
  * ARC. All clean data reported by the ghost lists can always be safely
  * evicted. Due to arc_c_min, the same does not hold for all clean data
  * contained by the regular mru and mfu lists.
  *
  * In the case of the regular mru and mfu lists, we need to report as
  * much clean data as possible, such that evicting that same reported
  * data will not bring arc_size below arc_c_min. Thus, in certain
  * circumstances, the total amount of clean data in the mru and mfu
  * lists might not actually be evictable.
  *
  * The following two distinct cases are accounted for:
  *
  * 1. The sum of the amount of dirty data contained by both the mru and
  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
  *    is greater than or equal to arc_c_min.
  *    (i.e. amount of dirty data >= arc_c_min)
  *
  *    This is the easy case; all clean data contained by the mru and mfu
  *    lists is evictable. Evicting all clean data can only drop arc_size
  *    to the amount of dirty data, which is greater than arc_c_min.
  *
  * 2. The sum of the amount of dirty data contained by both the mru and
  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
  *    is less than arc_c_min.
  *    (i.e. arc_c_min > amount of dirty data)
  *
  *    2.1. arc_size is greater than or equal arc_c_min.
  *         (i.e. arc_size >= arc_c_min > amount of dirty data)
  *
  *         In this case, not all clean data from the regular mru and mfu
  *         lists is actually evictable; we must leave enough clean data
  *         to keep arc_size above arc_c_min. Thus, the maximum amount of
  *         evictable data from the two lists combined, is exactly the
  *         difference between arc_size and arc_c_min.
  *
  *    2.2. arc_size is less than arc_c_min
  *         (i.e. arc_c_min > arc_size > amount of dirty data)
  *
  *         In this case, none of the data contained in the mru and mfu
  *         lists is evictable, even if it's clean. Since arc_size is
  *         already below arc_c_min, evicting any more would only
  *         increase this negative difference.
  */
 
 #endif /* _KERNEL */
 
 /*
  * Adapt arc info given the number of bytes we are trying to add and
  * the state that we are coming from.  This function is only called
  * when we are adding new content to the cache.
  */
 static void
 arc_adapt(uint64_t bytes)
 {
 	/*
 	 * Wake reap thread if we do not have any available memory
 	 */
 	if (arc_reclaim_needed()) {
 		zthr_wakeup(arc_reap_zthr);
 		return;
 	}
 
 	if (arc_no_grow)
 		return;
 
 	if (arc_c >= arc_c_max)
 		return;
 
 	/*
 	 * If we're within (2 * maxblocksize) bytes of the target
 	 * cache size, increment the target cache size
 	 */
 	if (aggsum_upper_bound(&arc_sums.arcstat_size) +
 	    2 * SPA_MAXBLOCKSIZE >= arc_c) {
 		uint64_t dc = MAX(bytes, SPA_OLD_MAXBLOCKSIZE);
 		if (atomic_add_64_nv(&arc_c, dc) > arc_c_max)
 			arc_c = arc_c_max;
 	}
 }
 
 /*
  * Check if arc_size has grown past our upper threshold, determined by
  * zfs_arc_overflow_shift.
  */
 static arc_ovf_level_t
 arc_is_overflowing(boolean_t use_reserve)
 {
 	/* Always allow at least one block of overflow */
 	int64_t overflow = MAX(SPA_MAXBLOCKSIZE,
 	    arc_c >> zfs_arc_overflow_shift);
 
 	/*
 	 * We just compare the lower bound here for performance reasons. Our
 	 * primary goals are to make sure that the arc never grows without
 	 * bound, and that it can reach its maximum size. This check
 	 * accomplishes both goals. The maximum amount we could run over by is
 	 * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block
 	 * in the ARC. In practice, that's in the tens of MB, which is low
 	 * enough to be safe.
 	 */
 	int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) -
 	    arc_c - overflow / 2;
 	if (!use_reserve)
 		overflow /= 2;
 	return (over < 0 ? ARC_OVF_NONE :
 	    over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
 }
 
 static abd_t *
 arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, const void *tag,
     int alloc_flags)
 {
 	arc_buf_contents_t type = arc_buf_type(hdr);
 
 	arc_get_data_impl(hdr, size, tag, alloc_flags);
 	if (alloc_flags & ARC_HDR_ALLOC_LINEAR)
 		return (abd_alloc_linear(size, type == ARC_BUFC_METADATA));
 	else
 		return (abd_alloc(size, type == ARC_BUFC_METADATA));
 }
 
 static void *
 arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, const void *tag)
 {
 	arc_buf_contents_t type = arc_buf_type(hdr);
 
 	arc_get_data_impl(hdr, size, tag, 0);
 	if (type == ARC_BUFC_METADATA) {
 		return (zio_buf_alloc(size));
 	} else {
 		ASSERT(type == ARC_BUFC_DATA);
 		return (zio_data_buf_alloc(size));
 	}
 }
 
 /*
  * Wait for the specified amount of data (in bytes) to be evicted from the
  * ARC, and for there to be sufficient free memory in the system.  Waiting for
  * eviction ensures that the memory used by the ARC decreases.  Waiting for
  * free memory ensures that the system won't run out of free pages, regardless
  * of ARC behavior and settings.  See arc_lowmem_init().
  */
 void
 arc_wait_for_eviction(uint64_t amount, boolean_t use_reserve)
 {
 	switch (arc_is_overflowing(use_reserve)) {
 	case ARC_OVF_NONE:
 		return;
 	case ARC_OVF_SOME:
 		/*
 		 * This is a bit racy without taking arc_evict_lock, but the
 		 * worst that can happen is we either call zthr_wakeup() extra
 		 * time due to race with other thread here, or the set flag
 		 * get cleared by arc_evict_cb(), which is unlikely due to
 		 * big hysteresis, but also not important since at this level
 		 * of overflow the eviction is purely advisory.  Same time
 		 * taking the global lock here every time without waiting for
 		 * the actual eviction creates a significant lock contention.
 		 */
 		if (!arc_evict_needed) {
 			arc_evict_needed = B_TRUE;
 			zthr_wakeup(arc_evict_zthr);
 		}
 		return;
 	case ARC_OVF_SEVERE:
 	default:
 	{
 		arc_evict_waiter_t aw;
 		list_link_init(&aw.aew_node);
 		cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);
 
 		uint64_t last_count = 0;
 		mutex_enter(&arc_evict_lock);
 		if (!list_is_empty(&arc_evict_waiters)) {
 			arc_evict_waiter_t *last =
 			    list_tail(&arc_evict_waiters);
 			last_count = last->aew_count;
 		} else if (!arc_evict_needed) {
 			arc_evict_needed = B_TRUE;
 			zthr_wakeup(arc_evict_zthr);
 		}
 		/*
 		 * Note, the last waiter's count may be less than
 		 * arc_evict_count if we are low on memory in which
 		 * case arc_evict_state_impl() may have deferred
 		 * wakeups (but still incremented arc_evict_count).
 		 */
 		aw.aew_count = MAX(last_count, arc_evict_count) + amount;
 
 		list_insert_tail(&arc_evict_waiters, &aw);
 
 		arc_set_need_free();
 
 		DTRACE_PROBE3(arc__wait__for__eviction,
 		    uint64_t, amount,
 		    uint64_t, arc_evict_count,
 		    uint64_t, aw.aew_count);
 
 		/*
 		 * We will be woken up either when arc_evict_count reaches
 		 * aew_count, or when the ARC is no longer overflowing and
 		 * eviction completes.
 		 * In case of "false" wakeup, we will still be on the list.
 		 */
 		do {
 			cv_wait(&aw.aew_cv, &arc_evict_lock);
 		} while (list_link_active(&aw.aew_node));
 		mutex_exit(&arc_evict_lock);
 
 		cv_destroy(&aw.aew_cv);
 	}
 	}
 }
 
 /*
  * Allocate a block and return it to the caller. If we are hitting the
  * hard limit for the cache size, we must sleep, waiting for the eviction
  * thread to catch up. If we're past the target size but below the hard
  * limit, we'll only signal the reclaim thread and continue on.
  */
 static void
 arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag,
     int alloc_flags)
 {
 	arc_adapt(size);
 
 	/*
 	 * If arc_size is currently overflowing, we must be adding data
 	 * faster than we are evicting.  To ensure we don't compound the
 	 * problem by adding more data and forcing arc_size to grow even
 	 * further past it's target size, we wait for the eviction thread to
 	 * make some progress.  We also wait for there to be sufficient free
 	 * memory in the system, as measured by arc_free_memory().
 	 *
 	 * Specifically, we wait for zfs_arc_eviction_pct percent of the
 	 * requested size to be evicted.  This should be more than 100%, to
 	 * ensure that that progress is also made towards getting arc_size
 	 * under arc_c.  See the comment above zfs_arc_eviction_pct.
 	 */
 	arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100,
 	    alloc_flags & ARC_HDR_USE_RESERVE);
 
 	arc_buf_contents_t type = arc_buf_type(hdr);
 	if (type == ARC_BUFC_METADATA) {
 		arc_space_consume(size, ARC_SPACE_META);
 	} else {
 		arc_space_consume(size, ARC_SPACE_DATA);
 	}
 
 	/*
 	 * Update the state size.  Note that ghost states have a
 	 * "ghost size" and so don't need to be updated.
 	 */
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 	if (!GHOST_STATE(state)) {
 
 		(void) zfs_refcount_add_many(&state->arcs_size[type], size,
 		    tag);
 
 		/*
 		 * If this is reached via arc_read, the link is
 		 * protected by the hash lock. If reached via
 		 * arc_buf_alloc, the header should not be accessed by
 		 * any other thread. And, if reached via arc_read_done,
 		 * the hash lock will protect it if it's found in the
 		 * hash table; otherwise no other thread should be
 		 * trying to [add|remove]_reference it.
 		 */
 		if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
 			ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 			(void) zfs_refcount_add_many(&state->arcs_esize[type],
 			    size, tag);
 		}
 	}
 }
 
 static void
 arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size,
     const void *tag)
 {
 	arc_free_data_impl(hdr, size, tag);
 	abd_free(abd);
 }
 
 static void
 arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, const void *tag)
 {
 	arc_buf_contents_t type = arc_buf_type(hdr);
 
 	arc_free_data_impl(hdr, size, tag);
 	if (type == ARC_BUFC_METADATA) {
 		zio_buf_free(buf, size);
 	} else {
 		ASSERT(type == ARC_BUFC_DATA);
 		zio_data_buf_free(buf, size);
 	}
 }
 
 /*
  * Free the arc data buffer.
  */
 static void
 arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag)
 {
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 	arc_buf_contents_t type = arc_buf_type(hdr);
 
 	/* protected by hash lock, if in the hash table */
 	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 		ASSERT(state != arc_anon && state != arc_l2c_only);
 
 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
 		    size, tag);
 	}
 	(void) zfs_refcount_remove_many(&state->arcs_size[type], size, tag);
 
 	VERIFY3U(hdr->b_type, ==, type);
 	if (type == ARC_BUFC_METADATA) {
 		arc_space_return(size, ARC_SPACE_META);
 	} else {
 		ASSERT(type == ARC_BUFC_DATA);
 		arc_space_return(size, ARC_SPACE_DATA);
 	}
 }
 
 /*
  * This routine is called whenever a buffer is accessed.
  */
 static void
 arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit)
 {
 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	/*
 	 * Update buffer prefetch status.
 	 */
 	boolean_t was_prefetch = HDR_PREFETCH(hdr);
 	boolean_t now_prefetch = arc_flags & ARC_FLAG_PREFETCH;
 	if (was_prefetch != now_prefetch) {
 		if (was_prefetch) {
 			ARCSTAT_CONDSTAT(hit, demand_hit, demand_iohit,
 			    HDR_PRESCIENT_PREFETCH(hdr), prescient, predictive,
 			    prefetch);
 		}
 		if (HDR_HAS_L2HDR(hdr))
 			l2arc_hdr_arcstats_decrement_state(hdr);
 		if (was_prefetch) {
 			arc_hdr_clear_flags(hdr,
 			    ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH);
 		} else {
 			arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
 		}
 		if (HDR_HAS_L2HDR(hdr))
 			l2arc_hdr_arcstats_increment_state(hdr);
 	}
 	if (now_prefetch) {
 		if (arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
 			arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
 			ARCSTAT_BUMP(arcstat_prescient_prefetch);
 		} else {
 			ARCSTAT_BUMP(arcstat_predictive_prefetch);
 		}
 	}
 	if (arc_flags & ARC_FLAG_L2CACHE)
 		arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
 
 	clock_t now = ddi_get_lbolt();
 	if (hdr->b_l1hdr.b_state == arc_anon) {
 		arc_state_t	*new_state;
 		/*
 		 * This buffer is not in the cache, and does not appear in
 		 * our "ghost" lists.  Add it to the MRU or uncached state.
 		 */
 		ASSERT0(hdr->b_l1hdr.b_arc_access);
 		hdr->b_l1hdr.b_arc_access = now;
 		if (HDR_UNCACHED(hdr)) {
 			new_state = arc_uncached;
 			DTRACE_PROBE1(new_state__uncached, arc_buf_hdr_t *,
 			    hdr);
 		} else {
 			new_state = arc_mru;
 			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
 		}
 		arc_change_state(new_state, hdr);
 	} else if (hdr->b_l1hdr.b_state == arc_mru) {
 		/*
 		 * This buffer has been accessed once recently and either
 		 * its read is still in progress or it is in the cache.
 		 */
 		if (HDR_IO_IN_PROGRESS(hdr)) {
 			hdr->b_l1hdr.b_arc_access = now;
 			return;
 		}
 		hdr->b_l1hdr.b_mru_hits++;
 		ARCSTAT_BUMP(arcstat_mru_hits);
 
 		/*
 		 * If the previous access was a prefetch, then it already
 		 * handled possible promotion, so nothing more to do for now.
 		 */
 		if (was_prefetch) {
 			hdr->b_l1hdr.b_arc_access = now;
 			return;
 		}
 
 		/*
 		 * If more than ARC_MINTIME have passed from the previous
 		 * hit, promote the buffer to the MFU state.
 		 */
 		if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access +
 		    ARC_MINTIME)) {
 			hdr->b_l1hdr.b_arc_access = now;
 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
 			arc_change_state(arc_mfu, hdr);
 		}
 	} else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
 		arc_state_t	*new_state;
 		/*
 		 * This buffer has been accessed once recently, but was
 		 * evicted from the cache.  Would we have bigger MRU, it
 		 * would be an MRU hit, so handle it the same way, except
 		 * we don't need to check the previous access time.
 		 */
 		hdr->b_l1hdr.b_mru_ghost_hits++;
 		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
 		hdr->b_l1hdr.b_arc_access = now;
 		wmsum_add(&arc_mru_ghost->arcs_hits[arc_buf_type(hdr)],
 		    arc_hdr_size(hdr));
 		if (was_prefetch) {
 			new_state = arc_mru;
 			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
 		} else {
 			new_state = arc_mfu;
 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
 		}
 		arc_change_state(new_state, hdr);
 	} else if (hdr->b_l1hdr.b_state == arc_mfu) {
 		/*
 		 * This buffer has been accessed more than once and either
 		 * still in the cache or being restored from one of ghosts.
 		 */
 		if (!HDR_IO_IN_PROGRESS(hdr)) {
 			hdr->b_l1hdr.b_mfu_hits++;
 			ARCSTAT_BUMP(arcstat_mfu_hits);
 		}
 		hdr->b_l1hdr.b_arc_access = now;
 	} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
 		/*
 		 * This buffer has been accessed more than once recently, but
 		 * has been evicted from the cache.  Would we have bigger MFU
 		 * it would stay in cache, so move it back to MFU state.
 		 */
 		hdr->b_l1hdr.b_mfu_ghost_hits++;
 		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
 		hdr->b_l1hdr.b_arc_access = now;
 		wmsum_add(&arc_mfu_ghost->arcs_hits[arc_buf_type(hdr)],
 		    arc_hdr_size(hdr));
 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
 		arc_change_state(arc_mfu, hdr);
 	} else if (hdr->b_l1hdr.b_state == arc_uncached) {
 		/*
 		 * This buffer is uncacheable, but we got a hit.  Probably
 		 * a demand read after prefetch.  Nothing more to do here.
 		 */
 		if (!HDR_IO_IN_PROGRESS(hdr))
 			ARCSTAT_BUMP(arcstat_uncached_hits);
 		hdr->b_l1hdr.b_arc_access = now;
 	} else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
 		/*
 		 * This buffer is on the 2nd Level ARC and was not accessed
 		 * for a long time, so treat it as new and put into MRU.
 		 */
 		hdr->b_l1hdr.b_arc_access = now;
 		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
 		arc_change_state(arc_mru, hdr);
 	} else {
 		cmn_err(CE_PANIC, "invalid arc state 0x%p",
 		    hdr->b_l1hdr.b_state);
 	}
 }
 
 /*
  * This routine is called by dbuf_hold() to update the arc_access() state
  * which otherwise would be skipped for entries in the dbuf cache.
  */
 void
 arc_buf_access(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	/*
 	 * Avoid taking the hash_lock when possible as an optimization.
 	 * The header must be checked again under the hash_lock in order
 	 * to handle the case where it is concurrently being released.
 	 */
 	if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr))
 		return;
 
 	kmutex_t *hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 
 	if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
 		mutex_exit(hash_lock);
 		ARCSTAT_BUMP(arcstat_access_skip);
 		return;
 	}
 
 	ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
 	    hdr->b_l1hdr.b_state == arc_mfu ||
 	    hdr->b_l1hdr.b_state == arc_uncached);
 
 	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
 	arc_access(hdr, 0, B_TRUE);
 	mutex_exit(hash_lock);
 
 	ARCSTAT_BUMP(arcstat_hits);
 	ARCSTAT_CONDSTAT(B_TRUE /* demand */, demand, prefetch,
 	    !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
 }
 
 /* a generic arc_read_done_func_t which you can use */
 void
 arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
     arc_buf_t *buf, void *arg)
 {
 	(void) zio, (void) zb, (void) bp;
 
 	if (buf == NULL)
 		return;
 
 	memcpy(arg, buf->b_data, arc_buf_size(buf));
 	arc_buf_destroy(buf, arg);
 }
 
 /* a generic arc_read_done_func_t */
 void
 arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
     arc_buf_t *buf, void *arg)
 {
 	(void) zb, (void) bp;
 	arc_buf_t **bufp = arg;
 
 	if (buf == NULL) {
 		ASSERT(zio == NULL || zio->io_error != 0);
 		*bufp = NULL;
 	} else {
 		ASSERT(zio == NULL || zio->io_error == 0);
 		*bufp = buf;
 		ASSERT(buf->b_data != NULL);
 	}
 }
 
 static void
 arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp)
 {
 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
 		ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0);
 		ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF);
 	} else {
 		if (HDR_COMPRESSION_ENABLED(hdr)) {
 			ASSERT3U(arc_hdr_get_compress(hdr), ==,
 			    BP_GET_COMPRESS(bp));
 		}
 		ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
 		ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
 		ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp));
 	}
 }
 
 static void
 arc_read_done(zio_t *zio)
 {
 	blkptr_t 	*bp = zio->io_bp;
 	arc_buf_hdr_t	*hdr = zio->io_private;
 	kmutex_t	*hash_lock = NULL;
 	arc_callback_t	*callback_list;
 	arc_callback_t	*acb;
 
 	/*
 	 * The hdr was inserted into hash-table and removed from lists
 	 * prior to starting I/O.  We should find this header, since
 	 * it's in the hash table, and it should be legit since it's
 	 * not possible to evict it during the I/O.  The only possible
 	 * reason for it not to be found is if we were freed during the
 	 * read.
 	 */
 	if (HDR_IN_HASH_TABLE(hdr)) {
 		arc_buf_hdr_t *found;
 
-		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
+		ASSERT3U(hdr->b_birth, ==, BP_GET_BIRTH(zio->io_bp));
 		ASSERT3U(hdr->b_dva.dva_word[0], ==,
 		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
 		ASSERT3U(hdr->b_dva.dva_word[1], ==,
 		    BP_IDENTITY(zio->io_bp)->dva_word[1]);
 
 		found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock);
 
 		ASSERT((found == hdr &&
 		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
 		    (found == hdr && HDR_L2_READING(hdr)));
 		ASSERT3P(hash_lock, !=, NULL);
 	}
 
 	if (BP_IS_PROTECTED(bp)) {
 		hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
 		hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
 		zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
 		    hdr->b_crypt_hdr.b_iv);
 
 		if (zio->io_error == 0) {
 			if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) {
 				void *tmpbuf;
 
 				tmpbuf = abd_borrow_buf_copy(zio->io_abd,
 				    sizeof (zil_chain_t));
 				zio_crypt_decode_mac_zil(tmpbuf,
 				    hdr->b_crypt_hdr.b_mac);
 				abd_return_buf(zio->io_abd, tmpbuf,
 				    sizeof (zil_chain_t));
 			} else {
 				zio_crypt_decode_mac_bp(bp,
 				    hdr->b_crypt_hdr.b_mac);
 			}
 		}
 	}
 
 	if (zio->io_error == 0) {
 		/* byteswap if necessary */
 		if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
 			if (BP_GET_LEVEL(zio->io_bp) > 0) {
 				hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
 			} else {
 				hdr->b_l1hdr.b_byteswap =
 				    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
 			}
 		} else {
 			hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 		}
 		if (!HDR_L2_READING(hdr)) {
 			hdr->b_complevel = zio->io_prop.zp_complevel;
 		}
 	}
 
 	arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
 	if (l2arc_noprefetch && HDR_PREFETCH(hdr))
 		arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);
 
 	callback_list = hdr->b_l1hdr.b_acb;
 	ASSERT3P(callback_list, !=, NULL);
 	hdr->b_l1hdr.b_acb = NULL;
 
 	/*
 	 * If a read request has a callback (i.e. acb_done is not NULL), then we
 	 * make a buf containing the data according to the parameters which were
 	 * passed in. The implementation of arc_buf_alloc_impl() ensures that we
 	 * aren't needlessly decompressing the data multiple times.
 	 */
 	int callback_cnt = 0;
 	for (acb = callback_list; acb != NULL; acb = acb->acb_next) {
 
 		/* We need the last one to call below in original order. */
 		callback_list = acb;
 
 		if (!acb->acb_done || acb->acb_nobuf)
 			continue;
 
 		callback_cnt++;
 
 		if (zio->io_error != 0)
 			continue;
 
 		int error = arc_buf_alloc_impl(hdr, zio->io_spa,
 		    &acb->acb_zb, acb->acb_private, acb->acb_encrypted,
 		    acb->acb_compressed, acb->acb_noauth, B_TRUE,
 		    &acb->acb_buf);
 
 		/*
 		 * Assert non-speculative zios didn't fail because an
 		 * encryption key wasn't loaded
 		 */
 		ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) ||
 		    error != EACCES);
 
 		/*
 		 * If we failed to decrypt, report an error now (as the zio
 		 * layer would have done if it had done the transforms).
 		 */
 		if (error == ECKSUM) {
 			ASSERT(BP_IS_PROTECTED(bp));
 			error = SET_ERROR(EIO);
 			if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
 				spa_log_error(zio->io_spa, &acb->acb_zb,
-				    &zio->io_bp->blk_birth);
+				    BP_GET_LOGICAL_BIRTH(zio->io_bp));
 				(void) zfs_ereport_post(
 				    FM_EREPORT_ZFS_AUTHENTICATION,
 				    zio->io_spa, NULL, &acb->acb_zb, zio, 0);
 			}
 		}
 
 		if (error != 0) {
 			/*
 			 * Decompression or decryption failed.  Set
 			 * io_error so that when we call acb_done
 			 * (below), we will indicate that the read
 			 * failed. Note that in the unusual case
 			 * where one callback is compressed and another
 			 * uncompressed, we will mark all of them
 			 * as failed, even though the uncompressed
 			 * one can't actually fail.  In this case,
 			 * the hdr will not be anonymous, because
 			 * if there are multiple callbacks, it's
 			 * because multiple threads found the same
 			 * arc buf in the hash table.
 			 */
 			zio->io_error = error;
 		}
 	}
 
 	/*
 	 * If there are multiple callbacks, we must have the hash lock,
 	 * because the only way for multiple threads to find this hdr is
 	 * in the hash table.  This ensures that if there are multiple
 	 * callbacks, the hdr is not anonymous.  If it were anonymous,
 	 * we couldn't use arc_buf_destroy() in the error case below.
 	 */
 	ASSERT(callback_cnt < 2 || hash_lock != NULL);
 
 	if (zio->io_error == 0) {
 		arc_hdr_verify(hdr, zio->io_bp);
 	} else {
 		arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
 		if (hdr->b_l1hdr.b_state != arc_anon)
 			arc_change_state(arc_anon, hdr);
 		if (HDR_IN_HASH_TABLE(hdr))
 			buf_hash_remove(hdr);
 	}
 
 	arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 	(void) remove_reference(hdr, hdr);
 
 	if (hash_lock != NULL)
 		mutex_exit(hash_lock);
 
 	/* execute each callback and free its structure */
 	while ((acb = callback_list) != NULL) {
 		if (acb->acb_done != NULL) {
 			if (zio->io_error != 0 && acb->acb_buf != NULL) {
 				/*
 				 * If arc_buf_alloc_impl() fails during
 				 * decompression, the buf will still be
 				 * allocated, and needs to be freed here.
 				 */
 				arc_buf_destroy(acb->acb_buf,
 				    acb->acb_private);
 				acb->acb_buf = NULL;
 			}
 			acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
 			    acb->acb_buf, acb->acb_private);
 		}
 
 		if (acb->acb_zio_dummy != NULL) {
 			acb->acb_zio_dummy->io_error = zio->io_error;
 			zio_nowait(acb->acb_zio_dummy);
 		}
 
 		callback_list = acb->acb_prev;
 		if (acb->acb_wait) {
 			mutex_enter(&acb->acb_wait_lock);
 			acb->acb_wait_error = zio->io_error;
 			acb->acb_wait = B_FALSE;
 			cv_signal(&acb->acb_wait_cv);
 			mutex_exit(&acb->acb_wait_lock);
 			/* acb will be freed by the waiting thread. */
 		} else {
 			kmem_free(acb, sizeof (arc_callback_t));
 		}
 	}
 }
 
 /*
  * "Read" the block at the specified DVA (in bp) via the
  * cache.  If the block is found in the cache, invoke the provided
  * callback immediately and return.  Note that the `zio' parameter
  * in the callback will be NULL in this case, since no IO was
  * required.  If the block is not in the cache pass the read request
  * on to the spa with a substitute callback function, so that the
  * requested block will be added to the cache.
  *
  * If a read request arrives for a block that has a read in-progress,
  * either wait for the in-progress read to complete (and return the
  * results); or, if this is a read with a "done" func, add a record
  * to the read to invoke the "done" func when the read completes,
  * and return; or just return.
  *
  * arc_read_done() will invoke all the requested "done" functions
  * for readers of this block.
  */
 int
 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     arc_read_done_func_t *done, void *private, zio_priority_t priority,
     int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
 {
 	arc_buf_hdr_t *hdr = NULL;
 	kmutex_t *hash_lock = NULL;
 	zio_t *rzio;
 	uint64_t guid = spa_load_guid(spa);
 	boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0;
 	boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) &&
 	    (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
 	boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) &&
 	    (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
 	boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp);
 	boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF;
 	arc_buf_t *buf = NULL;
 	int rc = 0;
 
 	ASSERT(!embedded_bp ||
 	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(!BP_IS_REDACTED(bp));
 
 	/*
 	 * Normally SPL_FSTRANS will already be set since kernel threads which
 	 * expect to call the DMU interfaces will set it when created.  System
 	 * calls are similarly handled by setting/cleaning the bit in the
 	 * registered callback (module/os/.../zfs/zpl_*).
 	 *
 	 * External consumers such as Lustre which call the exported DMU
 	 * interfaces may not have set SPL_FSTRANS.  To avoid a deadlock
 	 * on the hash_lock always set and clear the bit.
 	 */
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 top:
 	/*
 	 * Verify the block pointer contents are reasonable.  This should
 	 * always be the case since the blkptr is protected by a checksum.
 	 * However, if there is damage it's desirable to detect this early
 	 * and treat it as a checksum error.  This allows an alternate blkptr
 	 * to be tried when one is available (e.g. ditto blocks).
 	 */
 	if (!zfs_blkptr_verify(spa, bp, (zio_flags & ZIO_FLAG_CONFIG_WRITER) ?
 	    BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
 		rc = SET_ERROR(ECKSUM);
 		goto done;
 	}
 
 	if (!embedded_bp) {
 		/*
 		 * Embedded BP's have no DVA and require no I/O to "read".
 		 * Create an anonymous arc buf to back it.
 		 */
 		hdr = buf_hash_find(guid, bp, &hash_lock);
 	}
 
 	/*
 	 * Determine if we have an L1 cache hit or a cache miss. For simplicity
 	 * we maintain encrypted data separately from compressed / uncompressed
 	 * data. If the user is requesting raw encrypted data and we don't have
 	 * that in the header we will read from disk to guarantee that we can
 	 * get it even if the encryption keys aren't loaded.
 	 */
 	if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) ||
 	    (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) {
 		boolean_t is_data = !HDR_ISTYPE_METADATA(hdr);
 
 		if (HDR_IO_IN_PROGRESS(hdr)) {
 			if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
 				mutex_exit(hash_lock);
 				ARCSTAT_BUMP(arcstat_cached_only_in_progress);
 				rc = SET_ERROR(ENOENT);
 				goto done;
 			}
 
 			zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
 			ASSERT3P(head_zio, !=, NULL);
 			if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
 			    priority == ZIO_PRIORITY_SYNC_READ) {
 				/*
 				 * This is a sync read that needs to wait for
 				 * an in-flight async read. Request that the
 				 * zio have its priority upgraded.
 				 */
 				zio_change_priority(head_zio, priority);
 				DTRACE_PROBE1(arc__async__upgrade__sync,
 				    arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_async_upgrade_sync);
 			}
 
 			DTRACE_PROBE1(arc__iohit, arc_buf_hdr_t *, hdr);
 			arc_access(hdr, *arc_flags, B_FALSE);
 
 			/*
 			 * If there are multiple threads reading the same block
 			 * and that block is not yet in the ARC, then only one
 			 * thread will do the physical I/O and all other
 			 * threads will wait until that I/O completes.
 			 * Synchronous reads use the acb_wait_cv whereas nowait
 			 * reads register a callback. Both are signalled/called
 			 * in arc_read_done.
 			 *
 			 * Errors of the physical I/O may need to be propagated.
 			 * Synchronous read errors are returned here from
 			 * arc_read_done via acb_wait_error.  Nowait reads
 			 * attach the acb_zio_dummy zio to pio and
 			 * arc_read_done propagates the physical I/O's io_error
 			 * to acb_zio_dummy, and thereby to pio.
 			 */
 			arc_callback_t *acb = NULL;
 			if (done || pio || *arc_flags & ARC_FLAG_WAIT) {
 				acb = kmem_zalloc(sizeof (arc_callback_t),
 				    KM_SLEEP);
 				acb->acb_done = done;
 				acb->acb_private = private;
 				acb->acb_compressed = compressed_read;
 				acb->acb_encrypted = encrypted_read;
 				acb->acb_noauth = noauth_read;
 				acb->acb_nobuf = no_buf;
 				if (*arc_flags & ARC_FLAG_WAIT) {
 					acb->acb_wait = B_TRUE;
 					mutex_init(&acb->acb_wait_lock, NULL,
 					    MUTEX_DEFAULT, NULL);
 					cv_init(&acb->acb_wait_cv, NULL,
 					    CV_DEFAULT, NULL);
 				}
 				acb->acb_zb = *zb;
 				if (pio != NULL) {
 					acb->acb_zio_dummy = zio_null(pio,
 					    spa, NULL, NULL, NULL, zio_flags);
 				}
 				acb->acb_zio_head = head_zio;
 				acb->acb_next = hdr->b_l1hdr.b_acb;
 				hdr->b_l1hdr.b_acb->acb_prev = acb;
 				hdr->b_l1hdr.b_acb = acb;
 			}
 			mutex_exit(hash_lock);
 
 			ARCSTAT_BUMP(arcstat_iohits);
 			ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
 			    demand, prefetch, is_data, data, metadata, iohits);
 
 			if (*arc_flags & ARC_FLAG_WAIT) {
 				mutex_enter(&acb->acb_wait_lock);
 				while (acb->acb_wait) {
 					cv_wait(&acb->acb_wait_cv,
 					    &acb->acb_wait_lock);
 				}
 				rc = acb->acb_wait_error;
 				mutex_exit(&acb->acb_wait_lock);
 				mutex_destroy(&acb->acb_wait_lock);
 				cv_destroy(&acb->acb_wait_cv);
 				kmem_free(acb, sizeof (arc_callback_t));
 			}
 			goto out;
 		}
 
 		ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
 		    hdr->b_l1hdr.b_state == arc_mfu ||
 		    hdr->b_l1hdr.b_state == arc_uncached);
 
 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
 		arc_access(hdr, *arc_flags, B_TRUE);
 
 		if (done && !no_buf) {
 			ASSERT(!embedded_bp || !BP_IS_HOLE(bp));
 
 			/* Get a buf with the desired data in it. */
 			rc = arc_buf_alloc_impl(hdr, spa, zb, private,
 			    encrypted_read, compressed_read, noauth_read,
 			    B_TRUE, &buf);
 			if (rc == ECKSUM) {
 				/*
 				 * Convert authentication and decryption errors
 				 * to EIO (and generate an ereport if needed)
 				 * before leaving the ARC.
 				 */
 				rc = SET_ERROR(EIO);
 				if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) {
-					spa_log_error(spa, zb, &hdr->b_birth);
+					spa_log_error(spa, zb, hdr->b_birth);
 					(void) zfs_ereport_post(
 					    FM_EREPORT_ZFS_AUTHENTICATION,
 					    spa, NULL, zb, NULL, 0);
 				}
 			}
 			if (rc != 0) {
 				arc_buf_destroy_impl(buf);
 				buf = NULL;
 				(void) remove_reference(hdr, private);
 			}
 
 			/* assert any errors weren't due to unloaded keys */
 			ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
 			    rc != EACCES);
 		}
 		mutex_exit(hash_lock);
 		ARCSTAT_BUMP(arcstat_hits);
 		ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
 		    demand, prefetch, is_data, data, metadata, hits);
 		*arc_flags |= ARC_FLAG_CACHED;
 		goto done;
 	} else {
 		uint64_t lsize = BP_GET_LSIZE(bp);
 		uint64_t psize = BP_GET_PSIZE(bp);
 		arc_callback_t *acb;
 		vdev_t *vd = NULL;
 		uint64_t addr = 0;
 		boolean_t devw = B_FALSE;
 		uint64_t size;
 		abd_t *hdr_abd;
 		int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0;
 		arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
 
 		if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
 			if (hash_lock != NULL)
 				mutex_exit(hash_lock);
 			rc = SET_ERROR(ENOENT);
 			goto done;
 		}
 
 		if (hdr == NULL) {
 			/*
 			 * This block is not in the cache or it has
 			 * embedded data.
 			 */
 			arc_buf_hdr_t *exists = NULL;
-			hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
+			hdr = arc_hdr_alloc(guid, psize, lsize,
 			    BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type);
 
 			if (!embedded_bp) {
 				hdr->b_dva = *BP_IDENTITY(bp);
-				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
+				hdr->b_birth = BP_GET_BIRTH(bp);
 				exists = buf_hash_insert(hdr, &hash_lock);
 			}
 			if (exists != NULL) {
 				/* somebody beat us to the hash insert */
 				mutex_exit(hash_lock);
 				buf_discard_identity(hdr);
 				arc_hdr_destroy(hdr);
 				goto top; /* restart the IO request */
 			}
 		} else {
 			/*
 			 * This block is in the ghost cache or encrypted data
 			 * was requested and we didn't have it. If it was
 			 * L2-only (and thus didn't have an L1 hdr),
 			 * we realloc the header to add an L1 hdr.
 			 */
 			if (!HDR_HAS_L1HDR(hdr)) {
 				hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
 				    hdr_full_cache);
 			}
 
 			if (GHOST_STATE(hdr->b_l1hdr.b_state)) {
 				ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 				ASSERT(!HDR_HAS_RABD(hdr));
 				ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 				ASSERT0(zfs_refcount_count(
 				    &hdr->b_l1hdr.b_refcnt));
 				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 #ifdef ZFS_DEBUG
 				ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
 #endif
 			} else if (HDR_IO_IN_PROGRESS(hdr)) {
 				/*
 				 * If this header already had an IO in progress
 				 * and we are performing another IO to fetch
 				 * encrypted data we must wait until the first
 				 * IO completes so as not to confuse
 				 * arc_read_done(). This should be very rare
 				 * and so the performance impact shouldn't
 				 * matter.
 				 */
 				arc_callback_t *acb = kmem_zalloc(
 				    sizeof (arc_callback_t), KM_SLEEP);
 				acb->acb_wait = B_TRUE;
 				mutex_init(&acb->acb_wait_lock, NULL,
 				    MUTEX_DEFAULT, NULL);
 				cv_init(&acb->acb_wait_cv, NULL, CV_DEFAULT,
 				    NULL);
 				acb->acb_zio_head =
 				    hdr->b_l1hdr.b_acb->acb_zio_head;
 				acb->acb_next = hdr->b_l1hdr.b_acb;
 				hdr->b_l1hdr.b_acb->acb_prev = acb;
 				hdr->b_l1hdr.b_acb = acb;
 				mutex_exit(hash_lock);
 				mutex_enter(&acb->acb_wait_lock);
 				while (acb->acb_wait) {
 					cv_wait(&acb->acb_wait_cv,
 					    &acb->acb_wait_lock);
 				}
 				mutex_exit(&acb->acb_wait_lock);
 				mutex_destroy(&acb->acb_wait_lock);
 				cv_destroy(&acb->acb_wait_cv);
 				kmem_free(acb, sizeof (arc_callback_t));
 				goto top;
 			}
 		}
 		if (*arc_flags & ARC_FLAG_UNCACHED) {
 			arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED);
 			if (!encrypted_read)
 				alloc_flags |= ARC_HDR_ALLOC_LINEAR;
 		}
 
 		/*
 		 * Take additional reference for IO_IN_PROGRESS.  It stops
 		 * arc_access() from putting this header without any buffers
 		 * and so other references but obviously nonevictable onto
 		 * the evictable list of MRU or MFU state.
 		 */
 		add_reference(hdr, hdr);
 		if (!embedded_bp)
 			arc_access(hdr, *arc_flags, B_FALSE);
 		arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 		arc_hdr_alloc_abd(hdr, alloc_flags);
 		if (encrypted_read) {
 			ASSERT(HDR_HAS_RABD(hdr));
 			size = HDR_GET_PSIZE(hdr);
 			hdr_abd = hdr->b_crypt_hdr.b_rabd;
 			zio_flags |= ZIO_FLAG_RAW;
 		} else {
 			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 			size = arc_hdr_size(hdr);
 			hdr_abd = hdr->b_l1hdr.b_pabd;
 
 			if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
 				zio_flags |= ZIO_FLAG_RAW_COMPRESS;
 			}
 
 			/*
 			 * For authenticated bp's, we do not ask the ZIO layer
 			 * to authenticate them since this will cause the entire
 			 * IO to fail if the key isn't loaded. Instead, we
 			 * defer authentication until arc_buf_fill(), which will
 			 * verify the data when the key is available.
 			 */
 			if (BP_IS_AUTHENTICATED(bp))
 				zio_flags |= ZIO_FLAG_RAW_ENCRYPT;
 		}
 
 		if (BP_IS_AUTHENTICATED(bp))
 			arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
 		if (BP_GET_LEVEL(bp) > 0)
 			arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
 		ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
 
 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
 		acb->acb_done = done;
 		acb->acb_private = private;
 		acb->acb_compressed = compressed_read;
 		acb->acb_encrypted = encrypted_read;
 		acb->acb_noauth = noauth_read;
 		acb->acb_zb = *zb;
 
 		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
 		hdr->b_l1hdr.b_acb = acb;
 
 		if (HDR_HAS_L2HDR(hdr) &&
 		    (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
 			devw = hdr->b_l2hdr.b_dev->l2ad_writing;
 			addr = hdr->b_l2hdr.b_daddr;
 			/*
 			 * Lock out L2ARC device removal.
 			 */
 			if (vdev_is_dead(vd) ||
 			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
 				vd = NULL;
 		}
 
 		/*
 		 * We count both async reads and scrub IOs as asynchronous so
 		 * that both can be upgraded in the event of a cache hit while
 		 * the read IO is still in-flight.
 		 */
 		if (priority == ZIO_PRIORITY_ASYNC_READ ||
 		    priority == ZIO_PRIORITY_SCRUB)
 			arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
 		else
 			arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
 
 		/*
 		 * At this point, we have a level 1 cache miss or a blkptr
 		 * with embedded data.  Try again in L2ARC if possible.
 		 */
 		ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
 
 		/*
 		 * Skip ARC stat bump for block pointers with embedded
 		 * data. The data are read from the blkptr itself via
 		 * decode_embedded_bp_compressed().
 		 */
 		if (!embedded_bp) {
 			DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr,
 			    blkptr_t *, bp, uint64_t, lsize,
 			    zbookmark_phys_t *, zb);
 			ARCSTAT_BUMP(arcstat_misses);
 			ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
 			    demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data,
 			    metadata, misses);
 			zfs_racct_read(size, 1);
 		}
 
 		/* Check if the spa even has l2 configured */
 		const boolean_t spa_has_l2 = l2arc_ndev != 0 &&
 		    spa->spa_l2cache.sav_count > 0;
 
 		if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) {
 			/*
 			 * Read from the L2ARC if the following are true:
 			 * 1. The L2ARC vdev was previously cached.
 			 * 2. This buffer still has L2ARC metadata.
 			 * 3. This buffer isn't currently writing to the L2ARC.
 			 * 4. The L2ARC entry wasn't evicted, which may
 			 *    also have invalidated the vdev.
 			 */
 			if (HDR_HAS_L2HDR(hdr) &&
 			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) {
 				l2arc_read_callback_t *cb;
 				abd_t *abd;
 				uint64_t asize;
 
 				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_l2_hits);
 				hdr->b_l2hdr.b_hits++;
 
 				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
 				    KM_SLEEP);
 				cb->l2rcb_hdr = hdr;
 				cb->l2rcb_bp = *bp;
 				cb->l2rcb_zb = *zb;
 				cb->l2rcb_flags = zio_flags;
 
 				/*
 				 * When Compressed ARC is disabled, but the
 				 * L2ARC block is compressed, arc_hdr_size()
 				 * will have returned LSIZE rather than PSIZE.
 				 */
 				if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
 				    !HDR_COMPRESSION_ENABLED(hdr) &&
 				    HDR_GET_PSIZE(hdr) != 0) {
 					size = HDR_GET_PSIZE(hdr);
 				}
 
 				asize = vdev_psize_to_asize(vd, size);
 				if (asize != size) {
 					abd = abd_alloc_for_io(asize,
 					    HDR_ISTYPE_METADATA(hdr));
 					cb->l2rcb_abd = abd;
 				} else {
 					abd = hdr_abd;
 				}
 
 				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
 				    addr + asize <= vd->vdev_psize -
 				    VDEV_LABEL_END_SIZE);
 
 				/*
 				 * l2arc read.  The SCL_L2ARC lock will be
 				 * released by l2arc_read_done().
 				 * Issue a null zio if the underlying buffer
 				 * was squashed to zero size by compression.
 				 */
 				ASSERT3U(arc_hdr_get_compress(hdr), !=,
 				    ZIO_COMPRESS_EMPTY);
 				rzio = zio_read_phys(pio, vd, addr,
 				    asize, abd,
 				    ZIO_CHECKSUM_OFF,
 				    l2arc_read_done, cb, priority,
 				    zio_flags | ZIO_FLAG_CANFAIL |
 				    ZIO_FLAG_DONT_PROPAGATE |
 				    ZIO_FLAG_DONT_RETRY, B_FALSE);
 				acb->acb_zio_head = rzio;
 
 				if (hash_lock != NULL)
 					mutex_exit(hash_lock);
 
 				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
 				    zio_t *, rzio);
 				ARCSTAT_INCR(arcstat_l2_read_bytes,
 				    HDR_GET_PSIZE(hdr));
 
 				if (*arc_flags & ARC_FLAG_NOWAIT) {
 					zio_nowait(rzio);
 					goto out;
 				}
 
 				ASSERT(*arc_flags & ARC_FLAG_WAIT);
 				if (zio_wait(rzio) == 0)
 					goto out;
 
 				/* l2arc read error; goto zio_read() */
 				if (hash_lock != NULL)
 					mutex_enter(hash_lock);
 			} else {
 				DTRACE_PROBE1(l2arc__miss,
 				    arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_l2_misses);
 				if (HDR_L2_WRITING(hdr))
 					ARCSTAT_BUMP(arcstat_l2_rw_clash);
 				spa_config_exit(spa, SCL_L2ARC, vd);
 			}
 		} else {
 			if (vd != NULL)
 				spa_config_exit(spa, SCL_L2ARC, vd);
 
 			/*
 			 * Only a spa with l2 should contribute to l2
 			 * miss stats.  (Including the case of having a
 			 * faulted cache device - that's also a miss.)
 			 */
 			if (spa_has_l2) {
 				/*
 				 * Skip ARC stat bump for block pointers with
 				 * embedded data. The data are read from the
 				 * blkptr itself via
 				 * decode_embedded_bp_compressed().
 				 */
 				if (!embedded_bp) {
 					DTRACE_PROBE1(l2arc__miss,
 					    arc_buf_hdr_t *, hdr);
 					ARCSTAT_BUMP(arcstat_l2_misses);
 				}
 			}
 		}
 
 		rzio = zio_read(pio, spa, bp, hdr_abd, size,
 		    arc_read_done, hdr, priority, zio_flags, zb);
 		acb->acb_zio_head = rzio;
 
 		if (hash_lock != NULL)
 			mutex_exit(hash_lock);
 
 		if (*arc_flags & ARC_FLAG_WAIT) {
 			rc = zio_wait(rzio);
 			goto out;
 		}
 
 		ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
 		zio_nowait(rzio);
 	}
 
 out:
 	/* embedded bps don't actually go to disk */
 	if (!embedded_bp)
 		spa_read_history_add(spa, zb, *arc_flags);
 	spl_fstrans_unmark(cookie);
 	return (rc);
 
 done:
 	if (done)
 		done(NULL, zb, bp, buf, private);
 	if (pio && rc != 0) {
 		zio_t *zio = zio_null(pio, spa, NULL, NULL, NULL, zio_flags);
 		zio->io_error = rc;
 		zio_nowait(zio);
 	}
 	goto out;
 }
 
 arc_prune_t *
 arc_add_prune_callback(arc_prune_func_t *func, void *private)
 {
 	arc_prune_t *p;
 
 	p = kmem_alloc(sizeof (*p), KM_SLEEP);
 	p->p_pfunc = func;
 	p->p_private = private;
 	list_link_init(&p->p_node);
 	zfs_refcount_create(&p->p_refcnt);
 
 	mutex_enter(&arc_prune_mtx);
 	zfs_refcount_add(&p->p_refcnt, &arc_prune_list);
 	list_insert_head(&arc_prune_list, p);
 	mutex_exit(&arc_prune_mtx);
 
 	return (p);
 }
 
 void
 arc_remove_prune_callback(arc_prune_t *p)
 {
 	boolean_t wait = B_FALSE;
 	mutex_enter(&arc_prune_mtx);
 	list_remove(&arc_prune_list, p);
 	if (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0)
 		wait = B_TRUE;
 	mutex_exit(&arc_prune_mtx);
 
 	/* wait for arc_prune_task to finish */
 	if (wait)
 		taskq_wait_outstanding(arc_prune_taskq, 0);
 	ASSERT0(zfs_refcount_count(&p->p_refcnt));
 	zfs_refcount_destroy(&p->p_refcnt);
 	kmem_free(p, sizeof (*p));
 }
 
 /*
  * Helper function for arc_prune_async() it is responsible for safely
  * handling the execution of a registered arc_prune_func_t.
  */
 static void
 arc_prune_task(void *ptr)
 {
 	arc_prune_t *ap = (arc_prune_t *)ptr;
 	arc_prune_func_t *func = ap->p_pfunc;
 
 	if (func != NULL)
 		func(ap->p_adjust, ap->p_private);
 
 	(void) zfs_refcount_remove(&ap->p_refcnt, func);
 }
 
 /*
  * Notify registered consumers they must drop holds on a portion of the ARC
  * buffers they reference.  This provides a mechanism to ensure the ARC can
  * honor the metadata limit and reclaim otherwise pinned ARC buffers.
  *
  * This operation is performed asynchronously so it may be safely called
  * in the context of the arc_reclaim_thread().  A reference is taken here
  * for each registered arc_prune_t and the arc_prune_task() is responsible
  * for releasing it once the registered arc_prune_func_t has completed.
  */
 static void
 arc_prune_async(uint64_t adjust)
 {
 	arc_prune_t *ap;
 
 	mutex_enter(&arc_prune_mtx);
 	for (ap = list_head(&arc_prune_list); ap != NULL;
 	    ap = list_next(&arc_prune_list, ap)) {
 
 		if (zfs_refcount_count(&ap->p_refcnt) >= 2)
 			continue;
 
 		zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc);
 		ap->p_adjust = adjust;
 		if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
 		    ap, TQ_SLEEP) == TASKQID_INVALID) {
 			(void) zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc);
 			continue;
 		}
 		ARCSTAT_BUMP(arcstat_prune);
 	}
 	mutex_exit(&arc_prune_mtx);
 }
 
 /*
  * Notify the arc that a block was freed, and thus will never be used again.
  */
 void
 arc_freed(spa_t *spa, const blkptr_t *bp)
 {
 	arc_buf_hdr_t *hdr;
 	kmutex_t *hash_lock;
 	uint64_t guid = spa_load_guid(spa);
 
 	ASSERT(!BP_IS_EMBEDDED(bp));
 
 	hdr = buf_hash_find(guid, bp, &hash_lock);
 	if (hdr == NULL)
 		return;
 
 	/*
 	 * We might be trying to free a block that is still doing I/O
 	 * (i.e. prefetch) or has some other reference (i.e. a dedup-ed,
 	 * dmu_sync-ed block). A block may also have a reference if it is
 	 * part of a dedup-ed, dmu_synced write. The dmu_sync() function would
 	 * have written the new block to its final resting place on disk but
 	 * without the dedup flag set. This would have left the hdr in the MRU
 	 * state and discoverable. When the txg finally syncs it detects that
 	 * the block was overridden in open context and issues an override I/O.
 	 * Since this is a dedup block, the override I/O will determine if the
 	 * block is already in the DDT. If so, then it will replace the io_bp
 	 * with the bp from the DDT and allow the I/O to finish. When the I/O
 	 * reaches the done callback, dbuf_write_override_done, it will
 	 * check to see if the io_bp and io_bp_override are identical.
 	 * If they are not, then it indicates that the bp was replaced with
 	 * the bp in the DDT and the override bp is freed. This allows
 	 * us to arrive here with a reference on a block that is being
 	 * freed. So if we have an I/O in progress, or a reference to
 	 * this hdr, then we don't destroy the hdr.
 	 */
 	if (!HDR_HAS_L1HDR(hdr) ||
 	    zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
 		arc_change_state(arc_anon, hdr);
 		arc_hdr_destroy(hdr);
 		mutex_exit(hash_lock);
 	} else {
 		mutex_exit(hash_lock);
 	}
 
 }
 
 /*
  * Release this buffer from the cache, making it an anonymous buffer.  This
  * must be done after a read and prior to modifying the buffer contents.
  * If the buffer has more than one reference, we must make
  * a new hdr for the buffer.
  */
 void
 arc_release(arc_buf_t *buf, const void *tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	/*
 	 * It would be nice to assert that if its DMU metadata (level >
 	 * 0 || it's the dnode file), then it must be syncing context.
 	 * But we don't know that information at this level.
 	 */
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	/*
 	 * We don't grab the hash lock prior to this check, because if
 	 * the buffer's header is in the arc_anon state, it won't be
 	 * linked into the hash table.
 	 */
 	if (hdr->b_l1hdr.b_state == arc_anon) {
 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 		ASSERT(!HDR_IN_HASH_TABLE(hdr));
 		ASSERT(!HDR_HAS_L2HDR(hdr));
 
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
 		ASSERT(ARC_BUF_LAST(buf));
 		ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
 		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 
 		hdr->b_l1hdr.b_arc_access = 0;
 
 		/*
 		 * If the buf is being overridden then it may already
 		 * have a hdr that is not empty.
 		 */
 		buf_discard_identity(hdr);
 		arc_buf_thaw(buf);
 
 		return;
 	}
 
 	kmutex_t *hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 
 	/*
 	 * This assignment is only valid as long as the hash_lock is
 	 * held, we must be careful not to reference state or the
 	 * b_state field after dropping the lock.
 	 */
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 	ASSERT3P(state, !=, arc_anon);
 
 	/* this buffer is not on any list */
 	ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);
 
 	if (HDR_HAS_L2HDR(hdr)) {
 		mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
 
 		/*
 		 * We have to recheck this conditional again now that
 		 * we're holding the l2ad_mtx to prevent a race with
 		 * another thread which might be concurrently calling
 		 * l2arc_evict(). In that case, l2arc_evict() might have
 		 * destroyed the header's L2 portion as we were waiting
 		 * to acquire the l2ad_mtx.
 		 */
 		if (HDR_HAS_L2HDR(hdr))
 			arc_hdr_l2hdr_destroy(hdr);
 
 		mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
 	}
 
 	/*
 	 * Do we have more than one buf?
 	 */
 	if (hdr->b_l1hdr.b_buf != buf || !ARC_BUF_LAST(buf)) {
 		arc_buf_hdr_t *nhdr;
 		uint64_t spa = hdr->b_spa;
 		uint64_t psize = HDR_GET_PSIZE(hdr);
 		uint64_t lsize = HDR_GET_LSIZE(hdr);
 		boolean_t protected = HDR_PROTECTED(hdr);
 		enum zio_compress compress = arc_hdr_get_compress(hdr);
 		arc_buf_contents_t type = arc_buf_type(hdr);
 		VERIFY3U(hdr->b_type, ==, type);
 
 		ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
 		VERIFY3S(remove_reference(hdr, tag), >, 0);
 
 		if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
 			ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
 			ASSERT(ARC_BUF_LAST(buf));
 		}
 
 		/*
 		 * Pull the data off of this hdr and attach it to
 		 * a new anonymous hdr. Also find the last buffer
 		 * in the hdr's buffer list.
 		 */
 		arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
 		ASSERT3P(lastbuf, !=, NULL);
 
 		/*
 		 * If the current arc_buf_t and the hdr are sharing their data
 		 * buffer, then we must stop sharing that block.
 		 */
 		if (ARC_BUF_SHARED(buf)) {
 			ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
 			ASSERT(!arc_buf_is_shared(lastbuf));
 
 			/*
 			 * First, sever the block sharing relationship between
 			 * buf and the arc_buf_hdr_t.
 			 */
 			arc_unshare_buf(hdr, buf);
 
 			/*
 			 * Now we need to recreate the hdr's b_pabd. Since we
 			 * have lastbuf handy, we try to share with it, but if
 			 * we can't then we allocate a new b_pabd and copy the
 			 * data from buf into it.
 			 */
 			if (arc_can_share(hdr, lastbuf)) {
 				arc_share_buf(hdr, lastbuf);
 			} else {
 				arc_hdr_alloc_abd(hdr, 0);
 				abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
 				    buf->b_data, psize);
 			}
 			VERIFY3P(lastbuf->b_data, !=, NULL);
 		} else if (HDR_SHARED_DATA(hdr)) {
 			/*
 			 * Uncompressed shared buffers are always at the end
 			 * of the list. Compressed buffers don't have the
 			 * same requirements. This makes it hard to
 			 * simply assert that the lastbuf is shared so
 			 * we rely on the hdr's compression flags to determine
 			 * if we have a compressed, shared buffer.
 			 */
 			ASSERT(arc_buf_is_shared(lastbuf) ||
 			    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
 			ASSERT(!arc_buf_is_shared(buf));
 		}
 
 		ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
 		ASSERT3P(state, !=, arc_l2c_only);
 
 		(void) zfs_refcount_remove_many(&state->arcs_size[type],
 		    arc_buf_size(buf), buf);
 
 		if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
 			ASSERT3P(state, !=, arc_l2c_only);
 			(void) zfs_refcount_remove_many(
 			    &state->arcs_esize[type],
 			    arc_buf_size(buf), buf);
 		}
 
 		arc_cksum_verify(buf);
 		arc_buf_unwatch(buf);
 
 		/* if this is the last uncompressed buf free the checksum */
 		if (!arc_hdr_has_uncompressed_buf(hdr))
 			arc_cksum_free(hdr);
 
 		mutex_exit(hash_lock);
 
 		nhdr = arc_hdr_alloc(spa, psize, lsize, protected,
 		    compress, hdr->b_complevel, type);
 		ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
 		ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt));
 		VERIFY3U(nhdr->b_type, ==, type);
 		ASSERT(!HDR_SHARED_DATA(nhdr));
 
 		nhdr->b_l1hdr.b_buf = buf;
 		(void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
 		buf->b_hdr = nhdr;
 
 		(void) zfs_refcount_add_many(&arc_anon->arcs_size[type],
 		    arc_buf_size(buf), buf);
 	} else {
 		ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
 		/* protected by hash lock, or hdr is on arc_anon */
 		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 		hdr->b_l1hdr.b_mru_hits = 0;
 		hdr->b_l1hdr.b_mru_ghost_hits = 0;
 		hdr->b_l1hdr.b_mfu_hits = 0;
 		hdr->b_l1hdr.b_mfu_ghost_hits = 0;
 		arc_change_state(arc_anon, hdr);
 		hdr->b_l1hdr.b_arc_access = 0;
 
 		mutex_exit(hash_lock);
 		buf_discard_identity(hdr);
 		arc_buf_thaw(buf);
 	}
 }
 
 int
 arc_released(arc_buf_t *buf)
 {
 	return (buf->b_data != NULL &&
 	    buf->b_hdr->b_l1hdr.b_state == arc_anon);
 }
 
 #ifdef ZFS_DEBUG
 int
 arc_referenced(arc_buf_t *buf)
 {
 	return (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
 }
 #endif
 
 static void
 arc_write_ready(zio_t *zio)
 {
 	arc_write_callback_t *callback = zio->io_private;
 	arc_buf_t *buf = callback->awcb_buf;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp);
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
 	ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
 
 	/*
 	 * If we're reexecuting this zio because the pool suspended, then
 	 * cleanup any state that was previously set the first time the
 	 * callback was invoked.
 	 */
 	if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
 		arc_cksum_free(hdr);
 		arc_buf_unwatch(buf);
 		if (hdr->b_l1hdr.b_pabd != NULL) {
 			if (ARC_BUF_SHARED(buf)) {
 				arc_unshare_buf(hdr, buf);
 			} else {
 				ASSERT(!arc_buf_is_shared(buf));
 				arc_hdr_free_abd(hdr, B_FALSE);
 			}
 		}
 
 		if (HDR_HAS_RABD(hdr))
 			arc_hdr_free_abd(hdr, B_TRUE);
 	}
 	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 	ASSERT(!HDR_HAS_RABD(hdr));
 	ASSERT(!HDR_SHARED_DATA(hdr));
 	ASSERT(!arc_buf_is_shared(buf));
 
 	callback->awcb_ready(zio, buf, callback->awcb_private);
 
 	if (HDR_IO_IN_PROGRESS(hdr)) {
 		ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
 	} else {
 		arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 		add_reference(hdr, hdr); /* For IO_IN_PROGRESS. */
 	}
 
 	if (BP_IS_PROTECTED(bp)) {
 		/* ZIL blocks are written through zio_rewrite */
 		ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
 
 		if (BP_SHOULD_BYTESWAP(bp)) {
 			if (BP_GET_LEVEL(bp) > 0) {
 				hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
 			} else {
 				hdr->b_l1hdr.b_byteswap =
 				    DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
 			}
 		} else {
 			hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 		}
 
 		arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
 		hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
 		hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
 		zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
 		    hdr->b_crypt_hdr.b_iv);
 		zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
 	} else {
 		arc_hdr_clear_flags(hdr, ARC_FLAG_PROTECTED);
 	}
 
 	/*
 	 * If this block was written for raw encryption but the zio layer
 	 * ended up only authenticating it, adjust the buffer flags now.
 	 */
 	if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) {
 		arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
 		buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
 		if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF)
 			buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
 	} else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) {
 		buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
 		buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
 	}
 
 	/* this must be done after the buffer flags are adjusted */
 	arc_cksum_compute(buf);
 
 	enum zio_compress compress;
 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
 		compress = ZIO_COMPRESS_OFF;
 	} else {
 		ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
 		compress = BP_GET_COMPRESS(bp);
 	}
 	HDR_SET_PSIZE(hdr, psize);
 	arc_hdr_set_compress(hdr, compress);
 	hdr->b_complevel = zio->io_prop.zp_complevel;
 
 	if (zio->io_error != 0 || psize == 0)
 		goto out;
 
 	/*
 	 * Fill the hdr with data. If the buffer is encrypted we have no choice
 	 * but to copy the data into b_radb. If the hdr is compressed, the data
 	 * we want is available from the zio, otherwise we can take it from
 	 * the buf.
 	 *
 	 * We might be able to share the buf's data with the hdr here. However,
 	 * doing so would cause the ARC to be full of linear ABDs if we write a
 	 * lot of shareable data. As a compromise, we check whether scattered
 	 * ABDs are allowed, and assume that if they are then the user wants
 	 * the ARC to be primarily filled with them regardless of the data being
 	 * written. Therefore, if they're allowed then we allocate one and copy
 	 * the data into it; otherwise, we share the data directly if we can.
 	 */
 	if (ARC_BUF_ENCRYPTED(buf)) {
 		ASSERT3U(psize, >, 0);
 		ASSERT(ARC_BUF_COMPRESSED(buf));
 		arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA |
 		    ARC_HDR_USE_RESERVE);
 		abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
 	} else if (!(HDR_UNCACHED(hdr) ||
 	    abd_size_alloc_linear(arc_buf_size(buf))) ||
 	    !arc_can_share(hdr, buf)) {
 		/*
 		 * Ideally, we would always copy the io_abd into b_pabd, but the
 		 * user may have disabled compressed ARC, thus we must check the
 		 * hdr's compression setting rather than the io_bp's.
 		 */
 		if (BP_IS_ENCRYPTED(bp)) {
 			ASSERT3U(psize, >, 0);
 			arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA |
 			    ARC_HDR_USE_RESERVE);
 			abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
 		} else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
 		    !ARC_BUF_COMPRESSED(buf)) {
 			ASSERT3U(psize, >, 0);
 			arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE);
 			abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
 		} else {
 			ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
 			arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE);
 			abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
 			    arc_buf_size(buf));
 		}
 	} else {
 		ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
 		ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
 		ASSERT(ARC_BUF_LAST(buf));
 
 		arc_share_buf(hdr, buf);
 	}
 
 out:
 	arc_hdr_verify(hdr, bp);
 	spl_fstrans_unmark(cookie);
 }
 
 static void
 arc_write_children_ready(zio_t *zio)
 {
 	arc_write_callback_t *callback = zio->io_private;
 	arc_buf_t *buf = callback->awcb_buf;
 
 	callback->awcb_children_ready(zio, buf, callback->awcb_private);
 }
 
 static void
 arc_write_done(zio_t *zio)
 {
 	arc_write_callback_t *callback = zio->io_private;
 	arc_buf_t *buf = callback->awcb_buf;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
 
 	if (zio->io_error == 0) {
 		arc_hdr_verify(hdr, zio->io_bp);
 
 		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
 			buf_discard_identity(hdr);
 		} else {
 			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
-			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
+			hdr->b_birth = BP_GET_BIRTH(zio->io_bp);
 		}
 	} else {
 		ASSERT(HDR_EMPTY(hdr));
 	}
 
 	/*
 	 * If the block to be written was all-zero or compressed enough to be
 	 * embedded in the BP, no write was performed so there will be no
 	 * dva/birth/checksum.  The buffer must therefore remain anonymous
 	 * (and uncached).
 	 */
 	if (!HDR_EMPTY(hdr)) {
 		arc_buf_hdr_t *exists;
 		kmutex_t *hash_lock;
 
 		ASSERT3U(zio->io_error, ==, 0);
 
 		arc_cksum_verify(buf);
 
 		exists = buf_hash_insert(hdr, &hash_lock);
 		if (exists != NULL) {
 			/*
 			 * This can only happen if we overwrite for
 			 * sync-to-convergence, because we remove
 			 * buffers from the hash table when we arc_free().
 			 */
 			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
 				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
 					panic("bad overwrite, hdr=%p exists=%p",
 					    (void *)hdr, (void *)exists);
 				ASSERT(zfs_refcount_is_zero(
 				    &exists->b_l1hdr.b_refcnt));
 				arc_change_state(arc_anon, exists);
 				arc_hdr_destroy(exists);
 				mutex_exit(hash_lock);
 				exists = buf_hash_insert(hdr, &hash_lock);
 				ASSERT3P(exists, ==, NULL);
 			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
 				/* nopwrite */
 				ASSERT(zio->io_prop.zp_nopwrite);
 				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
 					panic("bad nopwrite, hdr=%p exists=%p",
 					    (void *)hdr, (void *)exists);
 			} else {
 				/* Dedup */
 				ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
 				ASSERT(ARC_BUF_LAST(hdr->b_l1hdr.b_buf));
 				ASSERT(hdr->b_l1hdr.b_state == arc_anon);
 				ASSERT(BP_GET_DEDUP(zio->io_bp));
 				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
 			}
 		}
 		arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 		VERIFY3S(remove_reference(hdr, hdr), >, 0);
 		/* if it's not anon, we are doing a scrub */
 		if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
 			arc_access(hdr, 0, B_FALSE);
 		mutex_exit(hash_lock);
 	} else {
 		arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 		VERIFY3S(remove_reference(hdr, hdr), >, 0);
 	}
 
 	callback->awcb_done(zio, buf, callback->awcb_private);
 
 	abd_free(zio->io_abd);
 	kmem_free(callback, sizeof (arc_write_callback_t));
 }
 
 zio_t *
 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
     blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc,
     const zio_prop_t *zp, arc_write_done_func_t *ready,
     arc_write_done_func_t *children_ready, arc_write_done_func_t *done,
     void *private, zio_priority_t priority, int zio_flags,
     const zbookmark_phys_t *zb)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	arc_write_callback_t *callback;
 	zio_t *zio;
 	zio_prop_t localprop = *zp;
 
 	ASSERT3P(ready, !=, NULL);
 	ASSERT3P(done, !=, NULL);
 	ASSERT(!HDR_IO_ERROR(hdr));
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 	ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
 	ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
 	if (uncached)
 		arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED);
 	else if (l2arc)
 		arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
 
 	if (ARC_BUF_ENCRYPTED(buf)) {
 		ASSERT(ARC_BUF_COMPRESSED(buf));
 		localprop.zp_encrypt = B_TRUE;
 		localprop.zp_compress = HDR_GET_COMPRESS(hdr);
 		localprop.zp_complevel = hdr->b_complevel;
 		localprop.zp_byteorder =
 		    (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
 		    ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
 		memcpy(localprop.zp_salt, hdr->b_crypt_hdr.b_salt,
 		    ZIO_DATA_SALT_LEN);
 		memcpy(localprop.zp_iv, hdr->b_crypt_hdr.b_iv,
 		    ZIO_DATA_IV_LEN);
 		memcpy(localprop.zp_mac, hdr->b_crypt_hdr.b_mac,
 		    ZIO_DATA_MAC_LEN);
 		if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) {
 			localprop.zp_nopwrite = B_FALSE;
 			localprop.zp_copies =
 			    MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1);
 		}
 		zio_flags |= ZIO_FLAG_RAW;
 	} else if (ARC_BUF_COMPRESSED(buf)) {
 		ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
 		localprop.zp_compress = HDR_GET_COMPRESS(hdr);
 		localprop.zp_complevel = hdr->b_complevel;
 		zio_flags |= ZIO_FLAG_RAW_COMPRESS;
 	}
 	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
 	callback->awcb_ready = ready;
 	callback->awcb_children_ready = children_ready;
 	callback->awcb_done = done;
 	callback->awcb_private = private;
 	callback->awcb_buf = buf;
 
 	/*
 	 * The hdr's b_pabd is now stale, free it now. A new data block
 	 * will be allocated when the zio pipeline calls arc_write_ready().
 	 */
 	if (hdr->b_l1hdr.b_pabd != NULL) {
 		/*
 		 * If the buf is currently sharing the data block with
 		 * the hdr then we need to break that relationship here.
 		 * The hdr will remain with a NULL data pointer and the
 		 * buf will take sole ownership of the block.
 		 */
 		if (ARC_BUF_SHARED(buf)) {
 			arc_unshare_buf(hdr, buf);
 		} else {
 			ASSERT(!arc_buf_is_shared(buf));
 			arc_hdr_free_abd(hdr, B_FALSE);
 		}
 		VERIFY3P(buf->b_data, !=, NULL);
 	}
 
 	if (HDR_HAS_RABD(hdr))
 		arc_hdr_free_abd(hdr, B_TRUE);
 
 	if (!(zio_flags & ZIO_FLAG_RAW))
 		arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
 
 	ASSERT(!arc_buf_is_shared(buf));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 
 	zio = zio_write(pio, spa, txg, bp,
 	    abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
 	    HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
 	    (children_ready != NULL) ? arc_write_children_ready : NULL,
 	    arc_write_done, callback, priority, zio_flags, zb);
 
 	return (zio);
 }
 
 void
 arc_tempreserve_clear(uint64_t reserve)
 {
 	atomic_add_64(&arc_tempreserve, -reserve);
 	ASSERT((int64_t)arc_tempreserve >= 0);
 }
 
 int
 arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
 {
 	int error;
 	uint64_t anon_size;
 
 	if (!arc_no_grow &&
 	    reserve > arc_c/4 &&
 	    reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT))
 		arc_c = MIN(arc_c_max, reserve * 4);
 
 	/*
 	 * Throttle when the calculated memory footprint for the TXG
 	 * exceeds the target ARC size.
 	 */
 	if (reserve > arc_c) {
 		DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
 		return (SET_ERROR(ERESTART));
 	}
 
 	/*
 	 * Don't count loaned bufs as in flight dirty data to prevent long
 	 * network delays from blocking transactions that are ready to be
 	 * assigned to a txg.
 	 */
 
 	/* assert that it has not wrapped around */
 	ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
 
 	anon_size = MAX((int64_t)
 	    (zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]) +
 	    zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]) -
 	    arc_loaned_bytes), 0);
 
 	/*
 	 * Writes will, almost always, require additional memory allocations
 	 * in order to compress/encrypt/etc the data.  We therefore need to
 	 * make sure that there is sufficient available memory for this.
 	 */
 	error = arc_memory_throttle(spa, reserve, txg);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Throttle writes when the amount of dirty data in the cache
 	 * gets too large.  We try to keep the cache less than half full
 	 * of dirty blocks so that our sync times don't grow too large.
 	 *
 	 * In the case of one pool being built on another pool, we want
 	 * to make sure we don't end up throttling the lower (backing)
 	 * pool when the upper pool is the majority contributor to dirty
 	 * data. To insure we make forward progress during throttling, we
 	 * also check the current pool's net dirty data and only throttle
 	 * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty
 	 * data in the cache.
 	 *
 	 * Note: if two requests come in concurrently, we might let them
 	 * both succeed, when one of them should fail.  Not a huge deal.
 	 */
 	uint64_t total_dirty = reserve + arc_tempreserve + anon_size;
 	uint64_t spa_dirty_anon = spa_dirty_data(spa);
 	uint64_t rarc_c = arc_warm ? arc_c : arc_c_max;
 	if (total_dirty > rarc_c * zfs_arc_dirty_limit_percent / 100 &&
 	    anon_size > rarc_c * zfs_arc_anon_limit_percent / 100 &&
 	    spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) {
 #ifdef ZFS_DEBUG
 		uint64_t meta_esize = zfs_refcount_count(
 		    &arc_anon->arcs_esize[ARC_BUFC_METADATA]);
 		uint64_t data_esize =
 		    zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
 		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
 		    "anon_data=%lluK tempreserve=%lluK rarc_c=%lluK\n",
 		    (u_longlong_t)arc_tempreserve >> 10,
 		    (u_longlong_t)meta_esize >> 10,
 		    (u_longlong_t)data_esize >> 10,
 		    (u_longlong_t)reserve >> 10,
 		    (u_longlong_t)rarc_c >> 10);
 #endif
 		DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
 		return (SET_ERROR(ERESTART));
 	}
 	atomic_add_64(&arc_tempreserve, reserve);
 	return (0);
 }
 
 static void
 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
     kstat_named_t *data, kstat_named_t *metadata,
     kstat_named_t *evict_data, kstat_named_t *evict_metadata)
 {
 	data->value.ui64 =
 	    zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]);
 	metadata->value.ui64 =
 	    zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]);
 	size->value.ui64 = data->value.ui64 + metadata->value.ui64;
 	evict_data->value.ui64 =
 	    zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
 	evict_metadata->value.ui64 =
 	    zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
 }
 
 static int
 arc_kstat_update(kstat_t *ksp, int rw)
 {
 	arc_stats_t *as = ksp->ks_data;
 
 	if (rw == KSTAT_WRITE)
 		return (SET_ERROR(EACCES));
 
 	as->arcstat_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_hits);
 	as->arcstat_iohits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_iohits);
 	as->arcstat_misses.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_misses);
 	as->arcstat_demand_data_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_demand_data_hits);
 	as->arcstat_demand_data_iohits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_demand_data_iohits);
 	as->arcstat_demand_data_misses.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_demand_data_misses);
 	as->arcstat_demand_metadata_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_demand_metadata_hits);
 	as->arcstat_demand_metadata_iohits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_demand_metadata_iohits);
 	as->arcstat_demand_metadata_misses.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_demand_metadata_misses);
 	as->arcstat_prefetch_data_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_prefetch_data_hits);
 	as->arcstat_prefetch_data_iohits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_prefetch_data_iohits);
 	as->arcstat_prefetch_data_misses.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_prefetch_data_misses);
 	as->arcstat_prefetch_metadata_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_prefetch_metadata_hits);
 	as->arcstat_prefetch_metadata_iohits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_prefetch_metadata_iohits);
 	as->arcstat_prefetch_metadata_misses.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_prefetch_metadata_misses);
 	as->arcstat_mru_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_mru_hits);
 	as->arcstat_mru_ghost_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_mru_ghost_hits);
 	as->arcstat_mfu_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_mfu_hits);
 	as->arcstat_mfu_ghost_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_mfu_ghost_hits);
 	as->arcstat_uncached_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_uncached_hits);
 	as->arcstat_deleted.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_deleted);
 	as->arcstat_mutex_miss.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_mutex_miss);
 	as->arcstat_access_skip.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_access_skip);
 	as->arcstat_evict_skip.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_evict_skip);
 	as->arcstat_evict_not_enough.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_evict_not_enough);
 	as->arcstat_evict_l2_cached.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_evict_l2_cached);
 	as->arcstat_evict_l2_eligible.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_evict_l2_eligible);
 	as->arcstat_evict_l2_eligible_mfu.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mfu);
 	as->arcstat_evict_l2_eligible_mru.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mru);
 	as->arcstat_evict_l2_ineligible.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_evict_l2_ineligible);
 	as->arcstat_evict_l2_skip.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_evict_l2_skip);
 	as->arcstat_hash_collisions.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_hash_collisions);
 	as->arcstat_hash_chains.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_hash_chains);
 	as->arcstat_size.value.ui64 =
 	    aggsum_value(&arc_sums.arcstat_size);
 	as->arcstat_compressed_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_compressed_size);
 	as->arcstat_uncompressed_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_uncompressed_size);
 	as->arcstat_overhead_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_overhead_size);
 	as->arcstat_hdr_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_hdr_size);
 	as->arcstat_data_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_data_size);
 	as->arcstat_metadata_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_metadata_size);
 	as->arcstat_dbuf_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_dbuf_size);
 #if defined(COMPAT_FREEBSD11)
 	as->arcstat_other_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_bonus_size) +
 	    wmsum_value(&arc_sums.arcstat_dnode_size) +
 	    wmsum_value(&arc_sums.arcstat_dbuf_size);
 #endif
 
 	arc_kstat_update_state(arc_anon,
 	    &as->arcstat_anon_size,
 	    &as->arcstat_anon_data,
 	    &as->arcstat_anon_metadata,
 	    &as->arcstat_anon_evictable_data,
 	    &as->arcstat_anon_evictable_metadata);
 	arc_kstat_update_state(arc_mru,
 	    &as->arcstat_mru_size,
 	    &as->arcstat_mru_data,
 	    &as->arcstat_mru_metadata,
 	    &as->arcstat_mru_evictable_data,
 	    &as->arcstat_mru_evictable_metadata);
 	arc_kstat_update_state(arc_mru_ghost,
 	    &as->arcstat_mru_ghost_size,
 	    &as->arcstat_mru_ghost_data,
 	    &as->arcstat_mru_ghost_metadata,
 	    &as->arcstat_mru_ghost_evictable_data,
 	    &as->arcstat_mru_ghost_evictable_metadata);
 	arc_kstat_update_state(arc_mfu,
 	    &as->arcstat_mfu_size,
 	    &as->arcstat_mfu_data,
 	    &as->arcstat_mfu_metadata,
 	    &as->arcstat_mfu_evictable_data,
 	    &as->arcstat_mfu_evictable_metadata);
 	arc_kstat_update_state(arc_mfu_ghost,
 	    &as->arcstat_mfu_ghost_size,
 	    &as->arcstat_mfu_ghost_data,
 	    &as->arcstat_mfu_ghost_metadata,
 	    &as->arcstat_mfu_ghost_evictable_data,
 	    &as->arcstat_mfu_ghost_evictable_metadata);
 	arc_kstat_update_state(arc_uncached,
 	    &as->arcstat_uncached_size,
 	    &as->arcstat_uncached_data,
 	    &as->arcstat_uncached_metadata,
 	    &as->arcstat_uncached_evictable_data,
 	    &as->arcstat_uncached_evictable_metadata);
 
 	as->arcstat_dnode_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_dnode_size);
 	as->arcstat_bonus_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_bonus_size);
 	as->arcstat_l2_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_hits);
 	as->arcstat_l2_misses.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_misses);
 	as->arcstat_l2_prefetch_asize.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_prefetch_asize);
 	as->arcstat_l2_mru_asize.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_mru_asize);
 	as->arcstat_l2_mfu_asize.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_mfu_asize);
 	as->arcstat_l2_bufc_data_asize.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_bufc_data_asize);
 	as->arcstat_l2_bufc_metadata_asize.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_bufc_metadata_asize);
 	as->arcstat_l2_feeds.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_feeds);
 	as->arcstat_l2_rw_clash.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rw_clash);
 	as->arcstat_l2_read_bytes.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_read_bytes);
 	as->arcstat_l2_write_bytes.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_write_bytes);
 	as->arcstat_l2_writes_sent.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_writes_sent);
 	as->arcstat_l2_writes_done.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_writes_done);
 	as->arcstat_l2_writes_error.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_writes_error);
 	as->arcstat_l2_writes_lock_retry.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_writes_lock_retry);
 	as->arcstat_l2_evict_lock_retry.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_evict_lock_retry);
 	as->arcstat_l2_evict_reading.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_evict_reading);
 	as->arcstat_l2_evict_l1cached.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_evict_l1cached);
 	as->arcstat_l2_free_on_write.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_free_on_write);
 	as->arcstat_l2_abort_lowmem.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_abort_lowmem);
 	as->arcstat_l2_cksum_bad.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_cksum_bad);
 	as->arcstat_l2_io_error.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_io_error);
 	as->arcstat_l2_lsize.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_lsize);
 	as->arcstat_l2_psize.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_psize);
 	as->arcstat_l2_hdr_size.value.ui64 =
 	    aggsum_value(&arc_sums.arcstat_l2_hdr_size);
 	as->arcstat_l2_log_blk_writes.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_log_blk_writes);
 	as->arcstat_l2_log_blk_asize.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_log_blk_asize);
 	as->arcstat_l2_log_blk_count.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_log_blk_count);
 	as->arcstat_l2_rebuild_success.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_success);
 	as->arcstat_l2_rebuild_abort_unsupported.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_unsupported);
 	as->arcstat_l2_rebuild_abort_io_errors.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_io_errors);
 	as->arcstat_l2_rebuild_abort_dh_errors.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_dh_errors);
 	as->arcstat_l2_rebuild_abort_cksum_lb_errors.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors);
 	as->arcstat_l2_rebuild_abort_lowmem.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_lowmem);
 	as->arcstat_l2_rebuild_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_size);
 	as->arcstat_l2_rebuild_asize.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_asize);
 	as->arcstat_l2_rebuild_bufs.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs);
 	as->arcstat_l2_rebuild_bufs_precached.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs_precached);
 	as->arcstat_l2_rebuild_log_blks.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_log_blks);
 	as->arcstat_memory_throttle_count.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_memory_throttle_count);
 	as->arcstat_memory_direct_count.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_memory_direct_count);
 	as->arcstat_memory_indirect_count.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_memory_indirect_count);
 
 	as->arcstat_memory_all_bytes.value.ui64 =
 	    arc_all_memory();
 	as->arcstat_memory_free_bytes.value.ui64 =
 	    arc_free_memory();
 	as->arcstat_memory_available_bytes.value.i64 =
 	    arc_available_memory();
 
 	as->arcstat_prune.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_prune);
 	as->arcstat_meta_used.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_meta_used);
 	as->arcstat_async_upgrade_sync.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_async_upgrade_sync);
 	as->arcstat_predictive_prefetch.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_predictive_prefetch);
 	as->arcstat_demand_hit_predictive_prefetch.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_demand_hit_predictive_prefetch);
 	as->arcstat_demand_iohit_predictive_prefetch.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_demand_iohit_predictive_prefetch);
 	as->arcstat_prescient_prefetch.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_prescient_prefetch);
 	as->arcstat_demand_hit_prescient_prefetch.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_demand_hit_prescient_prefetch);
 	as->arcstat_demand_iohit_prescient_prefetch.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_demand_iohit_prescient_prefetch);
 	as->arcstat_raw_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_raw_size);
 	as->arcstat_cached_only_in_progress.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_cached_only_in_progress);
 	as->arcstat_abd_chunk_waste_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_abd_chunk_waste_size);
 
 	return (0);
 }
 
 /*
  * This function *must* return indices evenly distributed between all
  * sublists of the multilist. This is needed due to how the ARC eviction
  * code is laid out; arc_evict_state() assumes ARC buffers are evenly
  * distributed between all sublists and uses this assumption when
  * deciding which sublist to evict from and how much to evict from it.
  */
 static unsigned int
 arc_state_multilist_index_func(multilist_t *ml, void *obj)
 {
 	arc_buf_hdr_t *hdr = obj;
 
 	/*
 	 * We rely on b_dva to generate evenly distributed index
 	 * numbers using buf_hash below. So, as an added precaution,
 	 * let's make sure we never add empty buffers to the arc lists.
 	 */
 	ASSERT(!HDR_EMPTY(hdr));
 
 	/*
 	 * The assumption here, is the hash value for a given
 	 * arc_buf_hdr_t will remain constant throughout its lifetime
 	 * (i.e. its b_spa, b_dva, and b_birth fields don't change).
 	 * Thus, we don't need to store the header's sublist index
 	 * on insertion, as this index can be recalculated on removal.
 	 *
 	 * Also, the low order bits of the hash value are thought to be
 	 * distributed evenly. Otherwise, in the case that the multilist
 	 * has a power of two number of sublists, each sublists' usage
 	 * would not be evenly distributed. In this context full 64bit
 	 * division would be a waste of time, so limit it to 32 bits.
 	 */
 	return ((unsigned int)buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
 	    multilist_get_num_sublists(ml));
 }
 
 static unsigned int
 arc_state_l2c_multilist_index_func(multilist_t *ml, void *obj)
 {
 	panic("Header %p insert into arc_l2c_only %p", obj, ml);
 }
 
 #define	WARN_IF_TUNING_IGNORED(tuning, value, do_warn) do {	\
 	if ((do_warn) && (tuning) && ((tuning) != (value))) {	\
 		cmn_err(CE_WARN,				\
 		    "ignoring tunable %s (using %llu instead)",	\
 		    (#tuning), (u_longlong_t)(value));	\
 	}							\
 } while (0)
 
 /*
  * Called during module initialization and periodically thereafter to
  * apply reasonable changes to the exposed performance tunings.  Can also be
  * called explicitly by param_set_arc_*() functions when ARC tunables are
  * updated manually.  Non-zero zfs_* values which differ from the currently set
  * values will be applied.
  */
 void
 arc_tuning_update(boolean_t verbose)
 {
 	uint64_t allmem = arc_all_memory();
 
 	/* Valid range: 32M - <arc_c_max> */
 	if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) &&
 	    (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) &&
 	    (zfs_arc_min <= arc_c_max)) {
 		arc_c_min = zfs_arc_min;
 		arc_c = MAX(arc_c, arc_c_min);
 	}
 	WARN_IF_TUNING_IGNORED(zfs_arc_min, arc_c_min, verbose);
 
 	/* Valid range: 64M - <all physical memory> */
 	if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) &&
 	    (zfs_arc_max >= MIN_ARC_MAX) && (zfs_arc_max < allmem) &&
 	    (zfs_arc_max > arc_c_min)) {
 		arc_c_max = zfs_arc_max;
 		arc_c = MIN(arc_c, arc_c_max);
 		if (arc_dnode_limit > arc_c_max)
 			arc_dnode_limit = arc_c_max;
 	}
 	WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose);
 
 	/* Valid range: 0 - <all physical memory> */
 	arc_dnode_limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit :
 	    MIN(zfs_arc_dnode_limit_percent, 100) * arc_c_max / 100;
 	WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_limit, verbose);
 
 	/* Valid range: 1 - N */
 	if (zfs_arc_grow_retry)
 		arc_grow_retry = zfs_arc_grow_retry;
 
 	/* Valid range: 1 - N */
 	if (zfs_arc_shrink_shift) {
 		arc_shrink_shift = zfs_arc_shrink_shift;
 		arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1);
 	}
 
 	/* Valid range: 1 - N ms */
 	if (zfs_arc_min_prefetch_ms)
 		arc_min_prefetch_ms = zfs_arc_min_prefetch_ms;
 
 	/* Valid range: 1 - N ms */
 	if (zfs_arc_min_prescient_prefetch_ms) {
 		arc_min_prescient_prefetch_ms =
 		    zfs_arc_min_prescient_prefetch_ms;
 	}
 
 	/* Valid range: 0 - 100 */
 	if (zfs_arc_lotsfree_percent <= 100)
 		arc_lotsfree_percent = zfs_arc_lotsfree_percent;
 	WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent,
 	    verbose);
 
 	/* Valid range: 0 - <all physical memory> */
 	if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free))
 		arc_sys_free = MIN(zfs_arc_sys_free, allmem);
 	WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose);
 }
 
 static void
 arc_state_multilist_init(multilist_t *ml,
     multilist_sublist_index_func_t *index_func, int *maxcountp)
 {
 	multilist_create(ml, sizeof (arc_buf_hdr_t),
 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), index_func);
 	*maxcountp = MAX(*maxcountp, multilist_get_num_sublists(ml));
 }
 
 static void
 arc_state_init(void)
 {
 	int num_sublists = 0;
 
 	arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_METADATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_DATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_DATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_METADATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_DATA],
 	    arc_state_multilist_index_func, &num_sublists);
 
 	/*
 	 * L2 headers should never be on the L2 state list since they don't
 	 * have L1 headers allocated.  Special index function asserts that.
 	 */
 	arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
 	    arc_state_l2c_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
 	    arc_state_l2c_multilist_index_func, &num_sublists);
 
 	/*
 	 * Keep track of the number of markers needed to reclaim buffers from
 	 * any ARC state.  The markers will be pre-allocated so as to minimize
 	 * the number of memory allocations performed by the eviction thread.
 	 */
 	arc_state_evict_marker_count = num_sublists;
 
 	zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_DATA]);
 
 	zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_METADATA]);
 
 	wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA], 0);
 	wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA], 0);
 	wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA], 0);
 	wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA], 0);
 
 	wmsum_init(&arc_sums.arcstat_hits, 0);
 	wmsum_init(&arc_sums.arcstat_iohits, 0);
 	wmsum_init(&arc_sums.arcstat_misses, 0);
 	wmsum_init(&arc_sums.arcstat_demand_data_hits, 0);
 	wmsum_init(&arc_sums.arcstat_demand_data_iohits, 0);
 	wmsum_init(&arc_sums.arcstat_demand_data_misses, 0);
 	wmsum_init(&arc_sums.arcstat_demand_metadata_hits, 0);
 	wmsum_init(&arc_sums.arcstat_demand_metadata_iohits, 0);
 	wmsum_init(&arc_sums.arcstat_demand_metadata_misses, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_data_hits, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_data_iohits, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_data_misses, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_metadata_hits, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_metadata_iohits, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_metadata_misses, 0);
 	wmsum_init(&arc_sums.arcstat_mru_hits, 0);
 	wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0);
 	wmsum_init(&arc_sums.arcstat_mfu_hits, 0);
 	wmsum_init(&arc_sums.arcstat_mfu_ghost_hits, 0);
 	wmsum_init(&arc_sums.arcstat_uncached_hits, 0);
 	wmsum_init(&arc_sums.arcstat_deleted, 0);
 	wmsum_init(&arc_sums.arcstat_mutex_miss, 0);
 	wmsum_init(&arc_sums.arcstat_access_skip, 0);
 	wmsum_init(&arc_sums.arcstat_evict_skip, 0);
 	wmsum_init(&arc_sums.arcstat_evict_not_enough, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_cached, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_eligible, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mfu, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mru, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_ineligible, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_skip, 0);
 	wmsum_init(&arc_sums.arcstat_hash_collisions, 0);
 	wmsum_init(&arc_sums.arcstat_hash_chains, 0);
 	aggsum_init(&arc_sums.arcstat_size, 0);
 	wmsum_init(&arc_sums.arcstat_compressed_size, 0);
 	wmsum_init(&arc_sums.arcstat_uncompressed_size, 0);
 	wmsum_init(&arc_sums.arcstat_overhead_size, 0);
 	wmsum_init(&arc_sums.arcstat_hdr_size, 0);
 	wmsum_init(&arc_sums.arcstat_data_size, 0);
 	wmsum_init(&arc_sums.arcstat_metadata_size, 0);
 	wmsum_init(&arc_sums.arcstat_dbuf_size, 0);
 	wmsum_init(&arc_sums.arcstat_dnode_size, 0);
 	wmsum_init(&arc_sums.arcstat_bonus_size, 0);
 	wmsum_init(&arc_sums.arcstat_l2_hits, 0);
 	wmsum_init(&arc_sums.arcstat_l2_misses, 0);
 	wmsum_init(&arc_sums.arcstat_l2_prefetch_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_mru_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_mfu_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_bufc_data_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_bufc_metadata_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_feeds, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rw_clash, 0);
 	wmsum_init(&arc_sums.arcstat_l2_read_bytes, 0);
 	wmsum_init(&arc_sums.arcstat_l2_write_bytes, 0);
 	wmsum_init(&arc_sums.arcstat_l2_writes_sent, 0);
 	wmsum_init(&arc_sums.arcstat_l2_writes_done, 0);
 	wmsum_init(&arc_sums.arcstat_l2_writes_error, 0);
 	wmsum_init(&arc_sums.arcstat_l2_writes_lock_retry, 0);
 	wmsum_init(&arc_sums.arcstat_l2_evict_lock_retry, 0);
 	wmsum_init(&arc_sums.arcstat_l2_evict_reading, 0);
 	wmsum_init(&arc_sums.arcstat_l2_evict_l1cached, 0);
 	wmsum_init(&arc_sums.arcstat_l2_free_on_write, 0);
 	wmsum_init(&arc_sums.arcstat_l2_abort_lowmem, 0);
 	wmsum_init(&arc_sums.arcstat_l2_cksum_bad, 0);
 	wmsum_init(&arc_sums.arcstat_l2_io_error, 0);
 	wmsum_init(&arc_sums.arcstat_l2_lsize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_psize, 0);
 	aggsum_init(&arc_sums.arcstat_l2_hdr_size, 0);
 	wmsum_init(&arc_sums.arcstat_l2_log_blk_writes, 0);
 	wmsum_init(&arc_sums.arcstat_l2_log_blk_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_log_blk_count, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_success, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_unsupported, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_io_errors, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_dh_errors, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_lowmem, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_size, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs_precached, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_log_blks, 0);
 	wmsum_init(&arc_sums.arcstat_memory_throttle_count, 0);
 	wmsum_init(&arc_sums.arcstat_memory_direct_count, 0);
 	wmsum_init(&arc_sums.arcstat_memory_indirect_count, 0);
 	wmsum_init(&arc_sums.arcstat_prune, 0);
 	wmsum_init(&arc_sums.arcstat_meta_used, 0);
 	wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0);
 	wmsum_init(&arc_sums.arcstat_predictive_prefetch, 0);
 	wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0);
 	wmsum_init(&arc_sums.arcstat_demand_iohit_predictive_prefetch, 0);
 	wmsum_init(&arc_sums.arcstat_prescient_prefetch, 0);
 	wmsum_init(&arc_sums.arcstat_demand_hit_prescient_prefetch, 0);
 	wmsum_init(&arc_sums.arcstat_demand_iohit_prescient_prefetch, 0);
 	wmsum_init(&arc_sums.arcstat_raw_size, 0);
 	wmsum_init(&arc_sums.arcstat_cached_only_in_progress, 0);
 	wmsum_init(&arc_sums.arcstat_abd_chunk_waste_size, 0);
 
 	arc_anon->arcs_state = ARC_STATE_ANON;
 	arc_mru->arcs_state = ARC_STATE_MRU;
 	arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
 	arc_mfu->arcs_state = ARC_STATE_MFU;
 	arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
 	arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
 	arc_uncached->arcs_state = ARC_STATE_UNCACHED;
 }
 
 static void
 arc_state_fini(void)
 {
 	zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_DATA]);
 
 	zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_METADATA]);
 
 	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
 	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
 	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
 	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
 	multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
 	multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_DATA]);
 
 	wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]);
 	wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]);
 	wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]);
 	wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]);
 
 	wmsum_fini(&arc_sums.arcstat_hits);
 	wmsum_fini(&arc_sums.arcstat_iohits);
 	wmsum_fini(&arc_sums.arcstat_misses);
 	wmsum_fini(&arc_sums.arcstat_demand_data_hits);
 	wmsum_fini(&arc_sums.arcstat_demand_data_iohits);
 	wmsum_fini(&arc_sums.arcstat_demand_data_misses);
 	wmsum_fini(&arc_sums.arcstat_demand_metadata_hits);
 	wmsum_fini(&arc_sums.arcstat_demand_metadata_iohits);
 	wmsum_fini(&arc_sums.arcstat_demand_metadata_misses);
 	wmsum_fini(&arc_sums.arcstat_prefetch_data_hits);
 	wmsum_fini(&arc_sums.arcstat_prefetch_data_iohits);
 	wmsum_fini(&arc_sums.arcstat_prefetch_data_misses);
 	wmsum_fini(&arc_sums.arcstat_prefetch_metadata_hits);
 	wmsum_fini(&arc_sums.arcstat_prefetch_metadata_iohits);
 	wmsum_fini(&arc_sums.arcstat_prefetch_metadata_misses);
 	wmsum_fini(&arc_sums.arcstat_mru_hits);
 	wmsum_fini(&arc_sums.arcstat_mru_ghost_hits);
 	wmsum_fini(&arc_sums.arcstat_mfu_hits);
 	wmsum_fini(&arc_sums.arcstat_mfu_ghost_hits);
 	wmsum_fini(&arc_sums.arcstat_uncached_hits);
 	wmsum_fini(&arc_sums.arcstat_deleted);
 	wmsum_fini(&arc_sums.arcstat_mutex_miss);
 	wmsum_fini(&arc_sums.arcstat_access_skip);
 	wmsum_fini(&arc_sums.arcstat_evict_skip);
 	wmsum_fini(&arc_sums.arcstat_evict_not_enough);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_cached);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_eligible);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mfu);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mru);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_ineligible);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_skip);
 	wmsum_fini(&arc_sums.arcstat_hash_collisions);
 	wmsum_fini(&arc_sums.arcstat_hash_chains);
 	aggsum_fini(&arc_sums.arcstat_size);
 	wmsum_fini(&arc_sums.arcstat_compressed_size);
 	wmsum_fini(&arc_sums.arcstat_uncompressed_size);
 	wmsum_fini(&arc_sums.arcstat_overhead_size);
 	wmsum_fini(&arc_sums.arcstat_hdr_size);
 	wmsum_fini(&arc_sums.arcstat_data_size);
 	wmsum_fini(&arc_sums.arcstat_metadata_size);
 	wmsum_fini(&arc_sums.arcstat_dbuf_size);
 	wmsum_fini(&arc_sums.arcstat_dnode_size);
 	wmsum_fini(&arc_sums.arcstat_bonus_size);
 	wmsum_fini(&arc_sums.arcstat_l2_hits);
 	wmsum_fini(&arc_sums.arcstat_l2_misses);
 	wmsum_fini(&arc_sums.arcstat_l2_prefetch_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_mru_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_mfu_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_bufc_data_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_bufc_metadata_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_feeds);
 	wmsum_fini(&arc_sums.arcstat_l2_rw_clash);
 	wmsum_fini(&arc_sums.arcstat_l2_read_bytes);
 	wmsum_fini(&arc_sums.arcstat_l2_write_bytes);
 	wmsum_fini(&arc_sums.arcstat_l2_writes_sent);
 	wmsum_fini(&arc_sums.arcstat_l2_writes_done);
 	wmsum_fini(&arc_sums.arcstat_l2_writes_error);
 	wmsum_fini(&arc_sums.arcstat_l2_writes_lock_retry);
 	wmsum_fini(&arc_sums.arcstat_l2_evict_lock_retry);
 	wmsum_fini(&arc_sums.arcstat_l2_evict_reading);
 	wmsum_fini(&arc_sums.arcstat_l2_evict_l1cached);
 	wmsum_fini(&arc_sums.arcstat_l2_free_on_write);
 	wmsum_fini(&arc_sums.arcstat_l2_abort_lowmem);
 	wmsum_fini(&arc_sums.arcstat_l2_cksum_bad);
 	wmsum_fini(&arc_sums.arcstat_l2_io_error);
 	wmsum_fini(&arc_sums.arcstat_l2_lsize);
 	wmsum_fini(&arc_sums.arcstat_l2_psize);
 	aggsum_fini(&arc_sums.arcstat_l2_hdr_size);
 	wmsum_fini(&arc_sums.arcstat_l2_log_blk_writes);
 	wmsum_fini(&arc_sums.arcstat_l2_log_blk_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_log_blk_count);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_success);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_unsupported);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_io_errors);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_dh_errors);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_lowmem);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_size);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs_precached);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_log_blks);
 	wmsum_fini(&arc_sums.arcstat_memory_throttle_count);
 	wmsum_fini(&arc_sums.arcstat_memory_direct_count);
 	wmsum_fini(&arc_sums.arcstat_memory_indirect_count);
 	wmsum_fini(&arc_sums.arcstat_prune);
 	wmsum_fini(&arc_sums.arcstat_meta_used);
 	wmsum_fini(&arc_sums.arcstat_async_upgrade_sync);
 	wmsum_fini(&arc_sums.arcstat_predictive_prefetch);
 	wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch);
 	wmsum_fini(&arc_sums.arcstat_demand_iohit_predictive_prefetch);
 	wmsum_fini(&arc_sums.arcstat_prescient_prefetch);
 	wmsum_fini(&arc_sums.arcstat_demand_hit_prescient_prefetch);
 	wmsum_fini(&arc_sums.arcstat_demand_iohit_prescient_prefetch);
 	wmsum_fini(&arc_sums.arcstat_raw_size);
 	wmsum_fini(&arc_sums.arcstat_cached_only_in_progress);
 	wmsum_fini(&arc_sums.arcstat_abd_chunk_waste_size);
 }
 
 uint64_t
 arc_target_bytes(void)
 {
 	return (arc_c);
 }
 
 void
 arc_set_limits(uint64_t allmem)
 {
 	/* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */
 	arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT);
 
 	/* How to set default max varies by platform. */
 	arc_c_max = arc_default_max(arc_c_min, allmem);
 }
 void
 arc_init(void)
 {
 	uint64_t percent, allmem = arc_all_memory();
 	mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t),
 	    offsetof(arc_evict_waiter_t, aew_node));
 
 	arc_min_prefetch_ms = 1000;
 	arc_min_prescient_prefetch_ms = 6000;
 
 #if defined(_KERNEL)
 	arc_lowmem_init();
 #endif
 
 	arc_set_limits(allmem);
 
 #ifdef _KERNEL
 	/*
 	 * If zfs_arc_max is non-zero at init, meaning it was set in the kernel
 	 * environment before the module was loaded, don't block setting the
 	 * maximum because it is less than arc_c_min, instead, reset arc_c_min
 	 * to a lower value.
 	 * zfs_arc_min will be handled by arc_tuning_update().
 	 */
 	if (zfs_arc_max != 0 && zfs_arc_max >= MIN_ARC_MAX &&
 	    zfs_arc_max < allmem) {
 		arc_c_max = zfs_arc_max;
 		if (arc_c_min >= arc_c_max) {
 			arc_c_min = MAX(zfs_arc_max / 2,
 			    2ULL << SPA_MAXBLOCKSHIFT);
 		}
 	}
 #else
 	/*
 	 * In userland, there's only the memory pressure that we artificially
 	 * create (see arc_available_memory()).  Don't let arc_c get too
 	 * small, because it can cause transactions to be larger than
 	 * arc_c, causing arc_tempreserve_space() to fail.
 	 */
 	arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT);
 #endif
 
 	arc_c = arc_c_min;
 	/*
 	 * 32-bit fixed point fractions of metadata from total ARC size,
 	 * MRU data from all data and MRU metadata from all metadata.
 	 */
 	arc_meta = (1ULL << 32) / 4;	/* Metadata is 25% of arc_c. */
 	arc_pd = (1ULL << 32) / 2;	/* Data MRU is 50% of data. */
 	arc_pm = (1ULL << 32) / 2;	/* Metadata MRU is 50% of metadata. */
 
 	percent = MIN(zfs_arc_dnode_limit_percent, 100);
 	arc_dnode_limit = arc_c_max * percent / 100;
 
 	/* Apply user specified tunings */
 	arc_tuning_update(B_TRUE);
 
 	/* if kmem_flags are set, lets try to use less memory */
 	if (kmem_debugging())
 		arc_c = arc_c / 2;
 	if (arc_c < arc_c_min)
 		arc_c = arc_c_min;
 
 	arc_register_hotplug();
 
 	arc_state_init();
 
 	buf_init();
 
 	list_create(&arc_prune_list, sizeof (arc_prune_t),
 	    offsetof(arc_prune_t, p_node));
 	mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
 
 	arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads,
 	    defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
 
 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 
 	if (arc_ksp != NULL) {
 		arc_ksp->ks_data = &arc_stats;
 		arc_ksp->ks_update = arc_kstat_update;
 		kstat_install(arc_ksp);
 	}
 
 	arc_state_evict_markers =
 	    arc_state_alloc_markers(arc_state_evict_marker_count);
 	arc_evict_zthr = zthr_create_timer("arc_evict",
 	    arc_evict_cb_check, arc_evict_cb, NULL, SEC2NSEC(1), defclsyspri);
 	arc_reap_zthr = zthr_create_timer("arc_reap",
 	    arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1), minclsyspri);
 
 	arc_warm = B_FALSE;
 
 	/*
 	 * Calculate maximum amount of dirty data per pool.
 	 *
 	 * If it has been set by a module parameter, take that.
 	 * Otherwise, use a percentage of physical memory defined by
 	 * zfs_dirty_data_max_percent (default 10%) with a cap at
 	 * zfs_dirty_data_max_max (default 4G or 25% of physical memory).
 	 */
 #ifdef __LP64__
 	if (zfs_dirty_data_max_max == 0)
 		zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024,
 		    allmem * zfs_dirty_data_max_max_percent / 100);
 #else
 	if (zfs_dirty_data_max_max == 0)
 		zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024,
 		    allmem * zfs_dirty_data_max_max_percent / 100);
 #endif
 
 	if (zfs_dirty_data_max == 0) {
 		zfs_dirty_data_max = allmem *
 		    zfs_dirty_data_max_percent / 100;
 		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
 		    zfs_dirty_data_max_max);
 	}
 
 	if (zfs_wrlog_data_max == 0) {
 
 		/*
 		 * dp_wrlog_total is reduced for each txg at the end of
 		 * spa_sync(). However, dp_dirty_total is reduced every time
 		 * a block is written out. Thus under normal operation,
 		 * dp_wrlog_total could grow 2 times as big as
 		 * zfs_dirty_data_max.
 		 */
 		zfs_wrlog_data_max = zfs_dirty_data_max * 2;
 	}
 }
 
 void
 arc_fini(void)
 {
 	arc_prune_t *p;
 
 #ifdef _KERNEL
 	arc_lowmem_fini();
 #endif /* _KERNEL */
 
 	/* Use B_TRUE to ensure *all* buffers are evicted */
 	arc_flush(NULL, B_TRUE);
 
 	if (arc_ksp != NULL) {
 		kstat_delete(arc_ksp);
 		arc_ksp = NULL;
 	}
 
 	taskq_wait(arc_prune_taskq);
 	taskq_destroy(arc_prune_taskq);
 
 	mutex_enter(&arc_prune_mtx);
 	while ((p = list_remove_head(&arc_prune_list)) != NULL) {
 		(void) zfs_refcount_remove(&p->p_refcnt, &arc_prune_list);
 		zfs_refcount_destroy(&p->p_refcnt);
 		kmem_free(p, sizeof (*p));
 	}
 	mutex_exit(&arc_prune_mtx);
 
 	list_destroy(&arc_prune_list);
 	mutex_destroy(&arc_prune_mtx);
 
 	(void) zthr_cancel(arc_evict_zthr);
 	(void) zthr_cancel(arc_reap_zthr);
 	arc_state_free_markers(arc_state_evict_markers,
 	    arc_state_evict_marker_count);
 
 	mutex_destroy(&arc_evict_lock);
 	list_destroy(&arc_evict_waiters);
 
 	/*
 	 * Free any buffers that were tagged for destruction.  This needs
 	 * to occur before arc_state_fini() runs and destroys the aggsum
 	 * values which are updated when freeing scatter ABDs.
 	 */
 	l2arc_do_free_on_write();
 
 	/*
 	 * buf_fini() must proceed arc_state_fini() because buf_fin() may
 	 * trigger the release of kmem magazines, which can callback to
 	 * arc_space_return() which accesses aggsums freed in act_state_fini().
 	 */
 	buf_fini();
 	arc_state_fini();
 
 	arc_unregister_hotplug();
 
 	/*
 	 * We destroy the zthrs after all the ARC state has been
 	 * torn down to avoid the case of them receiving any
 	 * wakeup() signals after they are destroyed.
 	 */
 	zthr_destroy(arc_evict_zthr);
 	zthr_destroy(arc_reap_zthr);
 
 	ASSERT0(arc_loaned_bytes);
 }
 
 /*
  * Level 2 ARC
  *
  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
  * It uses dedicated storage devices to hold cached data, which are populated
  * using large infrequent writes.  The main role of this cache is to boost
  * the performance of random read workloads.  The intended L2ARC devices
  * include short-stroked disks, solid state disks, and other media with
  * substantially faster read latency than disk.
  *
  *                 +-----------------------+
  *                 |         ARC           |
  *                 +-----------------------+
  *                    |         ^     ^
  *                    |         |     |
  *      l2arc_feed_thread()    arc_read()
  *                    |         |     |
  *                    |  l2arc read   |
  *                    V         |     |
  *               +---------------+    |
  *               |     L2ARC     |    |
  *               +---------------+    |
  *                   |    ^           |
  *          l2arc_write() |           |
  *                   |    |           |
  *                   V    |           |
  *                 +-------+      +-------+
  *                 | vdev  |      | vdev  |
  *                 | cache |      | cache |
  *                 +-------+      +-------+
  *                 +=========+     .-----.
  *                 :  L2ARC  :    |-_____-|
  *                 : devices :    | Disks |
  *                 +=========+    `-_____-'
  *
  * Read requests are satisfied from the following sources, in order:
  *
  *	1) ARC
  *	2) vdev cache of L2ARC devices
  *	3) L2ARC devices
  *	4) vdev cache of disks
  *	5) disks
  *
  * Some L2ARC device types exhibit extremely slow write performance.
  * To accommodate for this there are some significant differences between
  * the L2ARC and traditional cache design:
  *
  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
  * the ARC behave as usual, freeing buffers and placing headers on ghost
  * lists.  The ARC does not send buffers to the L2ARC during eviction as
  * this would add inflated write latencies for all ARC memory pressure.
  *
  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
  * It does this by periodically scanning buffers from the eviction-end of
  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
  * not already there. It scans until a headroom of buffers is satisfied,
  * which itself is a buffer for ARC eviction. If a compressible buffer is
  * found during scanning and selected for writing to an L2ARC device, we
  * temporarily boost scanning headroom during the next scan cycle to make
  * sure we adapt to compression effects (which might significantly reduce
  * the data volume we write to L2ARC). The thread that does this is
  * l2arc_feed_thread(), illustrated below; example sizes are included to
  * provide a better sense of ratio than this diagram:
  *
  *	       head -->                        tail
  *	        +---------------------+----------+
  *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
  *	        +---------------------+----------+   |   o L2ARC eligible
  *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
  *	        +---------------------+----------+   |
  *	             15.9 Gbytes      ^ 32 Mbytes    |
  *	                           headroom          |
  *	                                      l2arc_feed_thread()
  *	                                             |
  *	                 l2arc write hand <--[oooo]--'
  *	                         |           8 Mbyte
  *	                         |          write max
  *	                         V
  *		  +==============================+
  *	L2ARC dev |####|#|###|###|    |####| ... |
  *	          +==============================+
  *	                     32 Gbytes
  *
  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
  * evicted, then the L2ARC has cached a buffer much sooner than it probably
  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
  * safe to say that this is an uncommon case, since buffers at the end of
  * the ARC lists have moved there due to inactivity.
  *
  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
  * then the L2ARC simply misses copying some buffers.  This serves as a
  * pressure valve to prevent heavy read workloads from both stalling the ARC
  * with waits and clogging the L2ARC with writes.  This also helps prevent
  * the potential for the L2ARC to churn if it attempts to cache content too
  * quickly, such as during backups of the entire pool.
  *
  * 5. After system boot and before the ARC has filled main memory, there are
  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
  * lists can remain mostly static.  Instead of searching from tail of these
  * lists as pictured, the l2arc_feed_thread() will search from the list heads
  * for eligible buffers, greatly increasing its chance of finding them.
  *
  * The L2ARC device write speed is also boosted during this time so that
  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
  * there are no L2ARC reads, and no fear of degrading read performance
  * through increased writes.
  *
  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
  * the vdev queue can aggregate them into larger and fewer writes.  Each
  * device is written to in a rotor fashion, sweeping writes through
  * available space then repeating.
  *
  * 7. The L2ARC does not store dirty content.  It never needs to flush
  * write buffers back to disk based storage.
  *
  * 8. If an ARC buffer is written (and dirtied) which also exists in the
  * L2ARC, the now stale L2ARC buffer is immediately dropped.
  *
  * The performance of the L2ARC can be tweaked by a number of tunables, which
  * may be necessary for different workloads:
  *
  *	l2arc_write_max		max write bytes per interval
  *	l2arc_write_boost	extra write bytes during device warmup
  *	l2arc_noprefetch	skip caching prefetched buffers
  *	l2arc_headroom		number of max device writes to precache
  *	l2arc_headroom_boost	when we find compressed buffers during ARC
  *				scanning, we multiply headroom by this
  *				percentage factor for the next scan cycle,
  *				since more compressed buffers are likely to
  *				be present
  *	l2arc_feed_secs		seconds between L2ARC writing
  *
  * Tunables may be removed or added as future performance improvements are
  * integrated, and also may become zpool properties.
  *
  * There are three key functions that control how the L2ARC warms up:
  *
  *	l2arc_write_eligible()	check if a buffer is eligible to cache
  *	l2arc_write_size()	calculate how much to write
  *	l2arc_write_interval()	calculate sleep delay between writes
  *
  * These three functions determine what to write, how much, and how quickly
  * to send writes.
  *
  * L2ARC persistence:
  *
  * When writing buffers to L2ARC, we periodically add some metadata to
  * make sure we can pick them up after reboot, thus dramatically reducing
  * the impact that any downtime has on the performance of storage systems
  * with large caches.
  *
  * The implementation works fairly simply by integrating the following two
  * modifications:
  *
  * *) When writing to the L2ARC, we occasionally write a "l2arc log block",
  *    which is an additional piece of metadata which describes what's been
  *    written. This allows us to rebuild the arc_buf_hdr_t structures of the
  *    main ARC buffers. There are 2 linked-lists of log blocks headed by
  *    dh_start_lbps[2]. We alternate which chain we append to, so they are
  *    time-wise and offset-wise interleaved, but that is an optimization rather
  *    than for correctness. The log block also includes a pointer to the
  *    previous block in its chain.
  *
  * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
  *    for our header bookkeeping purposes. This contains a device header,
  *    which contains our top-level reference structures. We update it each
  *    time we write a new log block, so that we're able to locate it in the
  *    L2ARC device. If this write results in an inconsistent device header
  *    (e.g. due to power failure), we detect this by verifying the header's
  *    checksum and simply fail to reconstruct the L2ARC after reboot.
  *
  * Implementation diagram:
  *
  * +=== L2ARC device (not to scale) ======================================+
  * |       ___two newest log block pointers__.__________                  |
  * |      /                                   \dh_start_lbps[1]           |
  * |	 /				       \         \dh_start_lbps[0]|
  * |.___/__.                                    V         V               |
  * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
  * ||   hdr|      ^         /^       /^        /         /                |
  * |+------+  ...--\-------/  \-----/--\------/         /                 |
  * |                \--------------/    \--------------/                  |
  * +======================================================================+
  *
  * As can be seen on the diagram, rather than using a simple linked list,
  * we use a pair of linked lists with alternating elements. This is a
  * performance enhancement due to the fact that we only find out the
  * address of the next log block access once the current block has been
  * completely read in. Obviously, this hurts performance, because we'd be
  * keeping the device's I/O queue at only a 1 operation deep, thus
  * incurring a large amount of I/O round-trip latency. Having two lists
  * allows us to fetch two log blocks ahead of where we are currently
  * rebuilding L2ARC buffers.
  *
  * On-device data structures:
  *
  * L2ARC device header:	l2arc_dev_hdr_phys_t
  * L2ARC log block:	l2arc_log_blk_phys_t
  *
  * L2ARC reconstruction:
  *
  * When writing data, we simply write in the standard rotary fashion,
  * evicting buffers as we go and simply writing new data over them (writing
  * a new log block every now and then). This obviously means that once we
  * loop around the end of the device, we will start cutting into an already
  * committed log block (and its referenced data buffers), like so:
  *
  *    current write head__       __old tail
  *                        \     /
  *                        V    V
  * <--|bufs |lb |bufs |lb |    |bufs |lb |bufs |lb |-->
  *                         ^    ^^^^^^^^^___________________________________
  *                         |                                                \
  *                   <<nextwrite>> may overwrite this blk and/or its bufs --'
  *
  * When importing the pool, we detect this situation and use it to stop
  * our scanning process (see l2arc_rebuild).
  *
  * There is one significant caveat to consider when rebuilding ARC contents
  * from an L2ARC device: what about invalidated buffers? Given the above
  * construction, we cannot update blocks which we've already written to amend
  * them to remove buffers which were invalidated. Thus, during reconstruction,
  * we might be populating the cache with buffers for data that's not on the
  * main pool anymore, or may have been overwritten!
  *
  * As it turns out, this isn't a problem. Every arc_read request includes
  * both the DVA and, crucially, the birth TXG of the BP the caller is
  * looking for. So even if the cache were populated by completely rotten
  * blocks for data that had been long deleted and/or overwritten, we'll
  * never actually return bad data from the cache, since the DVA with the
  * birth TXG uniquely identify a block in space and time - once created,
  * a block is immutable on disk. The worst thing we have done is wasted
  * some time and memory at l2arc rebuild to reconstruct outdated ARC
  * entries that will get dropped from the l2arc as it is being updated
  * with new blocks.
  *
  * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write
  * hand are not restored. This is done by saving the offset (in bytes)
  * l2arc_evict() has evicted to in the L2ARC device header and taking it
  * into account when restoring buffers.
  */
 
 static boolean_t
 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
 {
 	/*
 	 * A buffer is *not* eligible for the L2ARC if it:
 	 * 1. belongs to a different spa.
 	 * 2. is already cached on the L2ARC.
 	 * 3. has an I/O in progress (it may be an incomplete read).
 	 * 4. is flagged not eligible (zfs property).
 	 */
 	if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
 	    HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 static uint64_t
 l2arc_write_size(l2arc_dev_t *dev)
 {
 	uint64_t size;
 
 	/*
 	 * Make sure our globals have meaningful values in case the user
 	 * altered them.
 	 */
 	size = l2arc_write_max;
 	if (size == 0) {
 		cmn_err(CE_NOTE, "l2arc_write_max must be greater than zero, "
 		    "resetting it to the default (%d)", L2ARC_WRITE_SIZE);
 		size = l2arc_write_max = L2ARC_WRITE_SIZE;
 	}
 
 	if (arc_warm == B_FALSE)
 		size += l2arc_write_boost;
 
 	/* We need to add in the worst case scenario of log block overhead. */
 	size += l2arc_log_blk_overhead(size, dev);
 	if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) {
 		/*
 		 * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100)
 		 * times the writesize, whichever is greater.
 		 */
 		size += MAX(64 * 1024 * 1024,
 		    (size * l2arc_trim_ahead) / 100);
 	}
 
 	/*
 	 * Make sure the write size does not exceed the size of the cache
 	 * device. This is important in l2arc_evict(), otherwise infinite
 	 * iteration can occur.
 	 */
 	size = MIN(size, (dev->l2ad_end - dev->l2ad_start) / 4);
 
 	size = P2ROUNDUP(size, 1ULL << dev->l2ad_vdev->vdev_ashift);
 
 	return (size);
 
 }
 
 static clock_t
 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
 {
 	clock_t interval, next, now;
 
 	/*
 	 * If the ARC lists are busy, increase our write rate; if the
 	 * lists are stale, idle back.  This is achieved by checking
 	 * how much we previously wrote - if it was more than half of
 	 * what we wanted, schedule the next write much sooner.
 	 */
 	if (l2arc_feed_again && wrote > (wanted / 2))
 		interval = (hz * l2arc_feed_min_ms) / 1000;
 	else
 		interval = hz * l2arc_feed_secs;
 
 	now = ddi_get_lbolt();
 	next = MAX(now, MIN(now + interval, began + interval));
 
 	return (next);
 }
 
 /*
  * Cycle through L2ARC devices.  This is how L2ARC load balances.
  * If a device is returned, this also returns holding the spa config lock.
  */
 static l2arc_dev_t *
 l2arc_dev_get_next(void)
 {
 	l2arc_dev_t *first, *next = NULL;
 
 	/*
 	 * Lock out the removal of spas (spa_namespace_lock), then removal
 	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
 	 * both locks will be dropped and a spa config lock held instead.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	mutex_enter(&l2arc_dev_mtx);
 
 	/* if there are no vdevs, there is nothing to do */
 	if (l2arc_ndev == 0)
 		goto out;
 
 	first = NULL;
 	next = l2arc_dev_last;
 	do {
 		/* loop around the list looking for a non-faulted vdev */
 		if (next == NULL) {
 			next = list_head(l2arc_dev_list);
 		} else {
 			next = list_next(l2arc_dev_list, next);
 			if (next == NULL)
 				next = list_head(l2arc_dev_list);
 		}
 
 		/* if we have come back to the start, bail out */
 		if (first == NULL)
 			first = next;
 		else if (next == first)
 			break;
 
 		ASSERT3P(next, !=, NULL);
 	} while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
 	    next->l2ad_trim_all);
 
 	/* if we were unable to find any usable vdevs, return NULL */
 	if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
 	    next->l2ad_trim_all)
 		next = NULL;
 
 	l2arc_dev_last = next;
 
 out:
 	mutex_exit(&l2arc_dev_mtx);
 
 	/*
 	 * Grab the config lock to prevent the 'next' device from being
 	 * removed while we are writing to it.
 	 */
 	if (next != NULL)
 		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
 	mutex_exit(&spa_namespace_lock);
 
 	return (next);
 }
 
 /*
  * Free buffers that were tagged for destruction.
  */
 static void
 l2arc_do_free_on_write(void)
 {
 	l2arc_data_free_t *df;
 
 	mutex_enter(&l2arc_free_on_write_mtx);
 	while ((df = list_remove_head(l2arc_free_on_write)) != NULL) {
 		ASSERT3P(df->l2df_abd, !=, NULL);
 		abd_free(df->l2df_abd);
 		kmem_free(df, sizeof (l2arc_data_free_t));
 	}
 	mutex_exit(&l2arc_free_on_write_mtx);
 }
 
 /*
  * A write to a cache device has completed.  Update all headers to allow
  * reads from these buffers to begin.
  */
 static void
 l2arc_write_done(zio_t *zio)
 {
 	l2arc_write_callback_t	*cb;
 	l2arc_lb_abd_buf_t	*abd_buf;
 	l2arc_lb_ptr_buf_t	*lb_ptr_buf;
 	l2arc_dev_t		*dev;
 	l2arc_dev_hdr_phys_t	*l2dhdr;
 	list_t			*buflist;
 	arc_buf_hdr_t		*head, *hdr, *hdr_prev;
 	kmutex_t		*hash_lock;
 	int64_t			bytes_dropped = 0;
 
 	cb = zio->io_private;
 	ASSERT3P(cb, !=, NULL);
 	dev = cb->l2wcb_dev;
 	l2dhdr = dev->l2ad_dev_hdr;
 	ASSERT3P(dev, !=, NULL);
 	head = cb->l2wcb_head;
 	ASSERT3P(head, !=, NULL);
 	buflist = &dev->l2ad_buflist;
 	ASSERT3P(buflist, !=, NULL);
 	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
 	    l2arc_write_callback_t *, cb);
 
 	/*
 	 * All writes completed, or an error was hit.
 	 */
 top:
 	mutex_enter(&dev->l2ad_mtx);
 	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
 		hdr_prev = list_prev(buflist, hdr);
 
 		hash_lock = HDR_LOCK(hdr);
 
 		/*
 		 * We cannot use mutex_enter or else we can deadlock
 		 * with l2arc_write_buffers (due to swapping the order
 		 * the hash lock and l2ad_mtx are taken).
 		 */
 		if (!mutex_tryenter(hash_lock)) {
 			/*
 			 * Missed the hash lock. We must retry so we
 			 * don't leave the ARC_FLAG_L2_WRITING bit set.
 			 */
 			ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
 
 			/*
 			 * We don't want to rescan the headers we've
 			 * already marked as having been written out, so
 			 * we reinsert the head node so we can pick up
 			 * where we left off.
 			 */
 			list_remove(buflist, head);
 			list_insert_after(buflist, hdr, head);
 
 			mutex_exit(&dev->l2ad_mtx);
 
 			/*
 			 * We wait for the hash lock to become available
 			 * to try and prevent busy waiting, and increase
 			 * the chance we'll be able to acquire the lock
 			 * the next time around.
 			 */
 			mutex_enter(hash_lock);
 			mutex_exit(hash_lock);
 			goto top;
 		}
 
 		/*
 		 * We could not have been moved into the arc_l2c_only
 		 * state while in-flight due to our ARC_FLAG_L2_WRITING
 		 * bit being set. Let's just ensure that's being enforced.
 		 */
 		ASSERT(HDR_HAS_L1HDR(hdr));
 
 		/*
 		 * Skipped - drop L2ARC entry and mark the header as no
 		 * longer L2 eligibile.
 		 */
 		if (zio->io_error != 0) {
 			/*
 			 * Error - drop L2ARC entry.
 			 */
 			list_remove(buflist, hdr);
 			arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
 
 			uint64_t psize = HDR_GET_PSIZE(hdr);
 			l2arc_hdr_arcstats_decrement(hdr);
 
 			bytes_dropped +=
 			    vdev_psize_to_asize(dev->l2ad_vdev, psize);
 			(void) zfs_refcount_remove_many(&dev->l2ad_alloc,
 			    arc_hdr_size(hdr), hdr);
 		}
 
 		/*
 		 * Allow ARC to begin reads and ghost list evictions to
 		 * this L2ARC entry.
 		 */
 		arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);
 
 		mutex_exit(hash_lock);
 	}
 
 	/*
 	 * Free the allocated abd buffers for writing the log blocks.
 	 * If the zio failed reclaim the allocated space and remove the
 	 * pointers to these log blocks from the log block pointer list
 	 * of the L2ARC device.
 	 */
 	while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) {
 		abd_free(abd_buf->abd);
 		zio_buf_free(abd_buf, sizeof (*abd_buf));
 		if (zio->io_error != 0) {
 			lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list);
 			/*
 			 * L2BLK_GET_PSIZE returns aligned size for log
 			 * blocks.
 			 */
 			uint64_t asize =
 			    L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop);
 			bytes_dropped += asize;
 			ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
 			ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
 			zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
 			    lb_ptr_buf);
 			(void) zfs_refcount_remove(&dev->l2ad_lb_count,
 			    lb_ptr_buf);
 			kmem_free(lb_ptr_buf->lb_ptr,
 			    sizeof (l2arc_log_blkptr_t));
 			kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
 		}
 	}
 	list_destroy(&cb->l2wcb_abd_list);
 
 	if (zio->io_error != 0) {
 		ARCSTAT_BUMP(arcstat_l2_writes_error);
 
 		/*
 		 * Restore the lbps array in the header to its previous state.
 		 * If the list of log block pointers is empty, zero out the
 		 * log block pointers in the device header.
 		 */
 		lb_ptr_buf = list_head(&dev->l2ad_lbptr_list);
 		for (int i = 0; i < 2; i++) {
 			if (lb_ptr_buf == NULL) {
 				/*
 				 * If the list is empty zero out the device
 				 * header. Otherwise zero out the second log
 				 * block pointer in the header.
 				 */
 				if (i == 0) {
 					memset(l2dhdr, 0,
 					    dev->l2ad_dev_hdr_asize);
 				} else {
 					memset(&l2dhdr->dh_start_lbps[i], 0,
 					    sizeof (l2arc_log_blkptr_t));
 				}
 				break;
 			}
 			memcpy(&l2dhdr->dh_start_lbps[i], lb_ptr_buf->lb_ptr,
 			    sizeof (l2arc_log_blkptr_t));
 			lb_ptr_buf = list_next(&dev->l2ad_lbptr_list,
 			    lb_ptr_buf);
 		}
 	}
 
 	ARCSTAT_BUMP(arcstat_l2_writes_done);
 	list_remove(buflist, head);
 	ASSERT(!HDR_HAS_L1HDR(head));
 	kmem_cache_free(hdr_l2only_cache, head);
 	mutex_exit(&dev->l2ad_mtx);
 
 	ASSERT(dev->l2ad_vdev != NULL);
 	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
 
 	l2arc_do_free_on_write();
 
 	kmem_free(cb, sizeof (l2arc_write_callback_t));
 }
 
 static int
 l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
 {
 	int ret;
 	spa_t *spa = zio->io_spa;
 	arc_buf_hdr_t *hdr = cb->l2rcb_hdr;
 	blkptr_t *bp = zio->io_bp;
 	uint8_t salt[ZIO_DATA_SALT_LEN];
 	uint8_t iv[ZIO_DATA_IV_LEN];
 	uint8_t mac[ZIO_DATA_MAC_LEN];
 	boolean_t no_crypt = B_FALSE;
 
 	/*
 	 * ZIL data is never be written to the L2ARC, so we don't need
 	 * special handling for its unique MAC storage.
 	 */
 	ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 
 	/*
 	 * If the data was encrypted, decrypt it now. Note that
 	 * we must check the bp here and not the hdr, since the
 	 * hdr does not have its encryption parameters updated
 	 * until arc_read_done().
 	 */
 	if (BP_IS_ENCRYPTED(bp)) {
 		abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
 		    ARC_HDR_USE_RESERVE);
 
 		zio_crypt_decode_params_bp(bp, salt, iv);
 		zio_crypt_decode_mac_bp(bp, mac);
 
 		ret = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb,
 		    BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp),
 		    salt, iv, mac, HDR_GET_PSIZE(hdr), eabd,
 		    hdr->b_l1hdr.b_pabd, &no_crypt);
 		if (ret != 0) {
 			arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
 			goto error;
 		}
 
 		/*
 		 * If we actually performed decryption, replace b_pabd
 		 * with the decrypted data. Otherwise we can just throw
 		 * our decryption buffer away.
 		 */
 		if (!no_crypt) {
 			arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
 			    arc_hdr_size(hdr), hdr);
 			hdr->b_l1hdr.b_pabd = eabd;
 			zio->io_abd = eabd;
 		} else {
 			arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
 		}
 	}
 
 	/*
 	 * If the L2ARC block was compressed, but ARC compression
 	 * is disabled we decompress the data into a new buffer and
 	 * replace the existing data.
 	 */
 	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
 	    !HDR_COMPRESSION_ENABLED(hdr)) {
 		abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
 		    ARC_HDR_USE_RESERVE);
 		void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
 
 		ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
 		    hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
 		    HDR_GET_LSIZE(hdr), &hdr->b_complevel);
 		if (ret != 0) {
 			abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
 			arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr);
 			goto error;
 		}
 
 		abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
 		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
 		    arc_hdr_size(hdr), hdr);
 		hdr->b_l1hdr.b_pabd = cabd;
 		zio->io_abd = cabd;
 		zio->io_size = HDR_GET_LSIZE(hdr);
 	}
 
 	return (0);
 
 error:
 	return (ret);
 }
 
 
 /*
  * A read to a cache device completed.  Validate buffer contents before
  * handing over to the regular ARC routines.
  */
 static void
 l2arc_read_done(zio_t *zio)
 {
 	int tfm_error = 0;
 	l2arc_read_callback_t *cb = zio->io_private;
 	arc_buf_hdr_t *hdr;
 	kmutex_t *hash_lock;
 	boolean_t valid_cksum;
 	boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) &&
 	    (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT));
 
 	ASSERT3P(zio->io_vd, !=, NULL);
 	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
 
 	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
 
 	ASSERT3P(cb, !=, NULL);
 	hdr = cb->l2rcb_hdr;
 	ASSERT3P(hdr, !=, NULL);
 
 	hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
 	/*
 	 * If the data was read into a temporary buffer,
 	 * move it and free the buffer.
 	 */
 	if (cb->l2rcb_abd != NULL) {
 		ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
 		if (zio->io_error == 0) {
 			if (using_rdata) {
 				abd_copy(hdr->b_crypt_hdr.b_rabd,
 				    cb->l2rcb_abd, arc_hdr_size(hdr));
 			} else {
 				abd_copy(hdr->b_l1hdr.b_pabd,
 				    cb->l2rcb_abd, arc_hdr_size(hdr));
 			}
 		}
 
 		/*
 		 * The following must be done regardless of whether
 		 * there was an error:
 		 * - free the temporary buffer
 		 * - point zio to the real ARC buffer
 		 * - set zio size accordingly
 		 * These are required because zio is either re-used for
 		 * an I/O of the block in the case of the error
 		 * or the zio is passed to arc_read_done() and it
 		 * needs real data.
 		 */
 		abd_free(cb->l2rcb_abd);
 		zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
 
 		if (using_rdata) {
 			ASSERT(HDR_HAS_RABD(hdr));
 			zio->io_abd = zio->io_orig_abd =
 			    hdr->b_crypt_hdr.b_rabd;
 		} else {
 			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 			zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd;
 		}
 	}
 
 	ASSERT3P(zio->io_abd, !=, NULL);
 
 	/*
 	 * Check this survived the L2ARC journey.
 	 */
 	ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd ||
 	    (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd));
 	zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
 	zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
 	zio->io_prop.zp_complevel = hdr->b_complevel;
 
 	valid_cksum = arc_cksum_is_equal(hdr, zio);
 
 	/*
 	 * b_rabd will always match the data as it exists on disk if it is
 	 * being used. Therefore if we are reading into b_rabd we do not
 	 * attempt to untransform the data.
 	 */
 	if (valid_cksum && !using_rdata)
 		tfm_error = l2arc_untransform(zio, cb);
 
 	if (valid_cksum && tfm_error == 0 && zio->io_error == 0 &&
 	    !HDR_L2_EVICTED(hdr)) {
 		mutex_exit(hash_lock);
 		zio->io_private = hdr;
 		arc_read_done(zio);
 	} else {
 		/*
 		 * Buffer didn't survive caching.  Increment stats and
 		 * reissue to the original storage device.
 		 */
 		if (zio->io_error != 0) {
 			ARCSTAT_BUMP(arcstat_l2_io_error);
 		} else {
 			zio->io_error = SET_ERROR(EIO);
 		}
 		if (!valid_cksum || tfm_error != 0)
 			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
 
 		/*
 		 * If there's no waiter, issue an async i/o to the primary
 		 * storage now.  If there *is* a waiter, the caller must
 		 * issue the i/o in a context where it's OK to block.
 		 */
 		if (zio->io_waiter == NULL) {
 			zio_t *pio = zio_unique_parent(zio);
 			void *abd = (using_rdata) ?
 			    hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd;
 
 			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
 
 			zio = zio_read(pio, zio->io_spa, zio->io_bp,
 			    abd, zio->io_size, arc_read_done,
 			    hdr, zio->io_priority, cb->l2rcb_flags,
 			    &cb->l2rcb_zb);
 
 			/*
 			 * Original ZIO will be freed, so we need to update
 			 * ARC header with the new ZIO pointer to be used
 			 * by zio_change_priority() in arc_read().
 			 */
 			for (struct arc_callback *acb = hdr->b_l1hdr.b_acb;
 			    acb != NULL; acb = acb->acb_next)
 				acb->acb_zio_head = zio;
 
 			mutex_exit(hash_lock);
 			zio_nowait(zio);
 		} else {
 			mutex_exit(hash_lock);
 		}
 	}
 
 	kmem_free(cb, sizeof (l2arc_read_callback_t));
 }
 
 /*
  * This is the list priority from which the L2ARC will search for pages to
  * cache.  This is used within loops (0..3) to cycle through lists in the
  * desired order.  This order can have a significant effect on cache
  * performance.
  *
  * Currently the metadata lists are hit first, MFU then MRU, followed by
  * the data lists.  This function returns a locked list, and also returns
  * the lock pointer.
  */
 static multilist_sublist_t *
 l2arc_sublist_lock(int list_num)
 {
 	multilist_t *ml = NULL;
 	unsigned int idx;
 
 	ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES);
 
 	switch (list_num) {
 	case 0:
 		ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
 		break;
 	case 1:
 		ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
 		break;
 	case 2:
 		ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
 		break;
 	case 3:
 		ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
 		break;
 	default:
 		return (NULL);
 	}
 
 	/*
 	 * Return a randomly-selected sublist. This is acceptable
 	 * because the caller feeds only a little bit of data for each
 	 * call (8MB). Subsequent calls will result in different
 	 * sublists being selected.
 	 */
 	idx = multilist_get_random_index(ml);
 	return (multilist_sublist_lock(ml, idx));
 }
 
 /*
  * Calculates the maximum overhead of L2ARC metadata log blocks for a given
  * L2ARC write size. l2arc_evict and l2arc_write_size need to include this
  * overhead in processing to make sure there is enough headroom available
  * when writing buffers.
  */
 static inline uint64_t
 l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev)
 {
 	if (dev->l2ad_log_entries == 0) {
 		return (0);
 	} else {
 		uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT;
 
 		uint64_t log_blocks = (log_entries +
 		    dev->l2ad_log_entries - 1) /
 		    dev->l2ad_log_entries;
 
 		return (vdev_psize_to_asize(dev->l2ad_vdev,
 		    sizeof (l2arc_log_blk_phys_t)) * log_blocks);
 	}
 }
 
 /*
  * Evict buffers from the device write hand to the distance specified in
  * bytes. This distance may span populated buffers, it may span nothing.
  * This is clearing a region on the L2ARC device ready for writing.
  * If the 'all' boolean is set, every buffer is evicted.
  */
 static void
 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
 {
 	list_t *buflist;
 	arc_buf_hdr_t *hdr, *hdr_prev;
 	kmutex_t *hash_lock;
 	uint64_t taddr;
 	l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev;
 	vdev_t *vd = dev->l2ad_vdev;
 	boolean_t rerun;
 
 	buflist = &dev->l2ad_buflist;
 
 top:
 	rerun = B_FALSE;
 	if (dev->l2ad_hand + distance > dev->l2ad_end) {
 		/*
 		 * When there is no space to accommodate upcoming writes,
 		 * evict to the end. Then bump the write and evict hands
 		 * to the start and iterate. This iteration does not
 		 * happen indefinitely as we make sure in
 		 * l2arc_write_size() that when the write hand is reset,
 		 * the write size does not exceed the end of the device.
 		 */
 		rerun = B_TRUE;
 		taddr = dev->l2ad_end;
 	} else {
 		taddr = dev->l2ad_hand + distance;
 	}
 	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
 	    uint64_t, taddr, boolean_t, all);
 
 	if (!all) {
 		/*
 		 * This check has to be placed after deciding whether to
 		 * iterate (rerun).
 		 */
 		if (dev->l2ad_first) {
 			/*
 			 * This is the first sweep through the device. There is
 			 * nothing to evict. We have already trimmmed the
 			 * whole device.
 			 */
 			goto out;
 		} else {
 			/*
 			 * Trim the space to be evicted.
 			 */
 			if (vd->vdev_has_trim && dev->l2ad_evict < taddr &&
 			    l2arc_trim_ahead > 0) {
 				/*
 				 * We have to drop the spa_config lock because
 				 * vdev_trim_range() will acquire it.
 				 * l2ad_evict already accounts for the label
 				 * size. To prevent vdev_trim_ranges() from
 				 * adding it again, we subtract it from
 				 * l2ad_evict.
 				 */
 				spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev);
 				vdev_trim_simple(vd,
 				    dev->l2ad_evict - VDEV_LABEL_START_SIZE,
 				    taddr - dev->l2ad_evict);
 				spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev,
 				    RW_READER);
 			}
 
 			/*
 			 * When rebuilding L2ARC we retrieve the evict hand
 			 * from the header of the device. Of note, l2arc_evict()
 			 * does not actually delete buffers from the cache
 			 * device, but trimming may do so depending on the
 			 * hardware implementation. Thus keeping track of the
 			 * evict hand is useful.
 			 */
 			dev->l2ad_evict = MAX(dev->l2ad_evict, taddr);
 		}
 	}
 
 retry:
 	mutex_enter(&dev->l2ad_mtx);
 	/*
 	 * We have to account for evicted log blocks. Run vdev_space_update()
 	 * on log blocks whose offset (in bytes) is before the evicted offset
 	 * (in bytes) by searching in the list of pointers to log blocks
 	 * present in the L2ARC device.
 	 */
 	for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf;
 	    lb_ptr_buf = lb_ptr_buf_prev) {
 
 		lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf);
 
 		/* L2BLK_GET_PSIZE returns aligned size for log blocks */
 		uint64_t asize = L2BLK_GET_PSIZE(
 		    (lb_ptr_buf->lb_ptr)->lbp_prop);
 
 		/*
 		 * We don't worry about log blocks left behind (ie
 		 * lbp_payload_start < l2ad_hand) because l2arc_write_buffers()
 		 * will never write more than l2arc_evict() evicts.
 		 */
 		if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) {
 			break;
 		} else {
 			vdev_space_update(vd, -asize, 0, 0);
 			ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
 			ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
 			zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
 			    lb_ptr_buf);
 			(void) zfs_refcount_remove(&dev->l2ad_lb_count,
 			    lb_ptr_buf);
 			list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf);
 			kmem_free(lb_ptr_buf->lb_ptr,
 			    sizeof (l2arc_log_blkptr_t));
 			kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
 		}
 	}
 
 	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
 		hdr_prev = list_prev(buflist, hdr);
 
 		ASSERT(!HDR_EMPTY(hdr));
 		hash_lock = HDR_LOCK(hdr);
 
 		/*
 		 * We cannot use mutex_enter or else we can deadlock
 		 * with l2arc_write_buffers (due to swapping the order
 		 * the hash lock and l2ad_mtx are taken).
 		 */
 		if (!mutex_tryenter(hash_lock)) {
 			/*
 			 * Missed the hash lock.  Retry.
 			 */
 			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
 			mutex_exit(&dev->l2ad_mtx);
 			mutex_enter(hash_lock);
 			mutex_exit(hash_lock);
 			goto retry;
 		}
 
 		/*
 		 * A header can't be on this list if it doesn't have L2 header.
 		 */
 		ASSERT(HDR_HAS_L2HDR(hdr));
 
 		/* Ensure this header has finished being written. */
 		ASSERT(!HDR_L2_WRITING(hdr));
 		ASSERT(!HDR_L2_WRITE_HEAD(hdr));
 
 		if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict ||
 		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
 			/*
 			 * We've evicted to the target address,
 			 * or the end of the device.
 			 */
 			mutex_exit(hash_lock);
 			break;
 		}
 
 		if (!HDR_HAS_L1HDR(hdr)) {
 			ASSERT(!HDR_L2_READING(hdr));
 			/*
 			 * This doesn't exist in the ARC.  Destroy.
 			 * arc_hdr_destroy() will call list_remove()
 			 * and decrement arcstat_l2_lsize.
 			 */
 			arc_change_state(arc_anon, hdr);
 			arc_hdr_destroy(hdr);
 		} else {
 			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
 			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
 			/*
 			 * Invalidate issued or about to be issued
 			 * reads, since we may be about to write
 			 * over this location.
 			 */
 			if (HDR_L2_READING(hdr)) {
 				ARCSTAT_BUMP(arcstat_l2_evict_reading);
 				arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
 			}
 
 			arc_hdr_l2hdr_destroy(hdr);
 		}
 		mutex_exit(hash_lock);
 	}
 	mutex_exit(&dev->l2ad_mtx);
 
 out:
 	/*
 	 * We need to check if we evict all buffers, otherwise we may iterate
 	 * unnecessarily.
 	 */
 	if (!all && rerun) {
 		/*
 		 * Bump device hand to the device start if it is approaching the
 		 * end. l2arc_evict() has already evicted ahead for this case.
 		 */
 		dev->l2ad_hand = dev->l2ad_start;
 		dev->l2ad_evict = dev->l2ad_start;
 		dev->l2ad_first = B_FALSE;
 		goto top;
 	}
 
 	if (!all) {
 		/*
 		 * In case of cache device removal (all) the following
 		 * assertions may be violated without functional consequences
 		 * as the device is about to be removed.
 		 */
 		ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end);
 		if (!dev->l2ad_first)
 			ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
 	}
 }
 
 /*
  * Handle any abd transforms that might be required for writing to the L2ARC.
  * If successful, this function will always return an abd with the data
  * transformed as it is on disk in a new abd of asize bytes.
  */
 static int
 l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
     abd_t **abd_out)
 {
 	int ret;
 	void *tmp = NULL;
 	abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd;
 	enum zio_compress compress = HDR_GET_COMPRESS(hdr);
 	uint64_t psize = HDR_GET_PSIZE(hdr);
 	uint64_t size = arc_hdr_size(hdr);
 	boolean_t ismd = HDR_ISTYPE_METADATA(hdr);
 	boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
 	dsl_crypto_key_t *dck = NULL;
 	uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 };
 	boolean_t no_crypt = B_FALSE;
 
 	ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
 	    !HDR_COMPRESSION_ENABLED(hdr)) ||
 	    HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize);
 	ASSERT3U(psize, <=, asize);
 
 	/*
 	 * If this data simply needs its own buffer, we simply allocate it
 	 * and copy the data. This may be done to eliminate a dependency on a
 	 * shared buffer or to reallocate the buffer to match asize.
 	 */
 	if (HDR_HAS_RABD(hdr) && asize != psize) {
 		ASSERT3U(asize, >=, psize);
 		to_write = abd_alloc_for_io(asize, ismd);
 		abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize);
 		if (psize != asize)
 			abd_zero_off(to_write, psize, asize - psize);
 		goto out;
 	}
 
 	if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) &&
 	    !HDR_ENCRYPTED(hdr)) {
 		ASSERT3U(size, ==, psize);
 		to_write = abd_alloc_for_io(asize, ismd);
 		abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
 		if (size != asize)
 			abd_zero_off(to_write, size, asize - size);
 		goto out;
 	}
 
 	if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
 		/*
 		 * In some cases, we can wind up with size > asize, so
 		 * we need to opt for the larger allocation option here.
 		 *
 		 * (We also need abd_return_buf_copy in all cases because
 		 * it's an ASSERT() to modify the buffer before returning it
 		 * with arc_return_buf(), and all the compressors
 		 * write things before deciding to fail compression in nearly
 		 * every case.)
 		 */
 		uint64_t bufsize = MAX(size, asize);
 		cabd = abd_alloc_for_io(bufsize, ismd);
 		tmp = abd_borrow_buf(cabd, bufsize);
 
 		psize = zio_compress_data(compress, to_write, &tmp, size,
 		    hdr->b_complevel);
 
 		if (psize >= asize) {
 			psize = HDR_GET_PSIZE(hdr);
 			abd_return_buf_copy(cabd, tmp, bufsize);
 			HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
 			to_write = cabd;
 			abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize);
 			if (psize != asize)
 				abd_zero_off(to_write, psize, asize - psize);
 			goto encrypt;
 		}
 		ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr));
 		if (psize < asize)
 			memset((char *)tmp + psize, 0, bufsize - psize);
 		psize = HDR_GET_PSIZE(hdr);
 		abd_return_buf_copy(cabd, tmp, bufsize);
 		to_write = cabd;
 	}
 
 encrypt:
 	if (HDR_ENCRYPTED(hdr)) {
 		eabd = abd_alloc_for_io(asize, ismd);
 
 		/*
 		 * If the dataset was disowned before the buffer
 		 * made it to this point, the key to re-encrypt
 		 * it won't be available. In this case we simply
 		 * won't write the buffer to the L2ARC.
 		 */
 		ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj,
 		    FTAG, &dck);
 		if (ret != 0)
 			goto error;
 
 		ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key,
 		    hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt,
 		    hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd,
 		    &no_crypt);
 		if (ret != 0)
 			goto error;
 
 		if (no_crypt)
 			abd_copy(eabd, to_write, psize);
 
 		if (psize != asize)
 			abd_zero_off(eabd, psize, asize - psize);
 
 		/* assert that the MAC we got here matches the one we saved */
 		ASSERT0(memcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN));
 		spa_keystore_dsl_key_rele(spa, dck, FTAG);
 
 		if (to_write == cabd)
 			abd_free(cabd);
 
 		to_write = eabd;
 	}
 
 out:
 	ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd);
 	*abd_out = to_write;
 	return (0);
 
 error:
 	if (dck != NULL)
 		spa_keystore_dsl_key_rele(spa, dck, FTAG);
 	if (cabd != NULL)
 		abd_free(cabd);
 	if (eabd != NULL)
 		abd_free(eabd);
 
 	*abd_out = NULL;
 	return (ret);
 }
 
 static void
 l2arc_blk_fetch_done(zio_t *zio)
 {
 	l2arc_read_callback_t *cb;
 
 	cb = zio->io_private;
 	if (cb->l2rcb_abd != NULL)
 		abd_free(cb->l2rcb_abd);
 	kmem_free(cb, sizeof (l2arc_read_callback_t));
 }
 
 /*
  * Find and write ARC buffers to the L2ARC device.
  *
  * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
  * for reading until they have completed writing.
  * The headroom_boost is an in-out parameter used to maintain headroom boost
  * state between calls to this function.
  *
  * Returns the number of bytes actually written (which may be smaller than
  * the delta by which the device hand has changed due to alignment and the
  * writing of log blocks).
  */
 static uint64_t
 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 {
 	arc_buf_hdr_t 		*hdr, *hdr_prev, *head;
 	uint64_t 		write_asize, write_psize, write_lsize, headroom;
 	boolean_t		full;
 	l2arc_write_callback_t	*cb = NULL;
 	zio_t 			*pio, *wzio;
 	uint64_t 		guid = spa_load_guid(spa);
 	l2arc_dev_hdr_phys_t	*l2dhdr = dev->l2ad_dev_hdr;
 
 	ASSERT3P(dev->l2ad_vdev, !=, NULL);
 
 	pio = NULL;
 	write_lsize = write_asize = write_psize = 0;
 	full = B_FALSE;
 	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
 	arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
 
 	/*
 	 * Copy buffers for L2ARC writing.
 	 */
 	for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
 		/*
 		 * If pass == 1 or 3, we cache MRU metadata and data
 		 * respectively.
 		 */
 		if (l2arc_mfuonly) {
 			if (pass == 1 || pass == 3)
 				continue;
 		}
 
 		multilist_sublist_t *mls = l2arc_sublist_lock(pass);
 		uint64_t passed_sz = 0;
 
 		VERIFY3P(mls, !=, NULL);
 
 		/*
 		 * L2ARC fast warmup.
 		 *
 		 * Until the ARC is warm and starts to evict, read from the
 		 * head of the ARC lists rather than the tail.
 		 */
 		if (arc_warm == B_FALSE)
 			hdr = multilist_sublist_head(mls);
 		else
 			hdr = multilist_sublist_tail(mls);
 
 		headroom = target_sz * l2arc_headroom;
 		if (zfs_compressed_arc_enabled)
 			headroom = (headroom * l2arc_headroom_boost) / 100;
 
 		for (; hdr; hdr = hdr_prev) {
 			kmutex_t *hash_lock;
 			abd_t *to_write = NULL;
 
 			if (arc_warm == B_FALSE)
 				hdr_prev = multilist_sublist_next(mls, hdr);
 			else
 				hdr_prev = multilist_sublist_prev(mls, hdr);
 
 			hash_lock = HDR_LOCK(hdr);
 			if (!mutex_tryenter(hash_lock)) {
 				/*
 				 * Skip this buffer rather than waiting.
 				 */
 				continue;
 			}
 
 			passed_sz += HDR_GET_LSIZE(hdr);
 			if (l2arc_headroom != 0 && passed_sz > headroom) {
 				/*
 				 * Searched too far.
 				 */
 				mutex_exit(hash_lock);
 				break;
 			}
 
 			if (!l2arc_write_eligible(guid, hdr)) {
 				mutex_exit(hash_lock);
 				continue;
 			}
 
 			ASSERT(HDR_HAS_L1HDR(hdr));
 
 			ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
 			ASSERT3U(arc_hdr_size(hdr), >, 0);
 			ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
 			    HDR_HAS_RABD(hdr));
 			uint64_t psize = HDR_GET_PSIZE(hdr);
 			uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
 			    psize);
 
 			/*
 			 * If the allocated size of this buffer plus the max
 			 * size for the pending log block exceeds the evicted
 			 * target size, terminate writing buffers for this run.
 			 */
 			if (write_asize + asize +
 			    sizeof (l2arc_log_blk_phys_t) > target_sz) {
 				full = B_TRUE;
 				mutex_exit(hash_lock);
 				break;
 			}
 
 			/*
 			 * We rely on the L1 portion of the header below, so
 			 * it's invalid for this header to have been evicted out
 			 * of the ghost cache, prior to being written out. The
 			 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
 			 */
 			arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING);
 
 			/*
 			 * If this header has b_rabd, we can use this since it
 			 * must always match the data exactly as it exists on
 			 * disk. Otherwise, the L2ARC can normally use the
 			 * hdr's data, but if we're sharing data between the
 			 * hdr and one of its bufs, L2ARC needs its own copy of
 			 * the data so that the ZIO below can't race with the
 			 * buf consumer. To ensure that this copy will be
 			 * available for the lifetime of the ZIO and be cleaned
 			 * up afterwards, we add it to the l2arc_free_on_write
 			 * queue. If we need to apply any transforms to the
 			 * data (compression, encryption) we will also need the
 			 * extra buffer.
 			 */
 			if (HDR_HAS_RABD(hdr) && psize == asize) {
 				to_write = hdr->b_crypt_hdr.b_rabd;
 			} else if ((HDR_COMPRESSION_ENABLED(hdr) ||
 			    HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) &&
 			    !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) &&
 			    psize == asize) {
 				to_write = hdr->b_l1hdr.b_pabd;
 			} else {
 				int ret;
 				arc_buf_contents_t type = arc_buf_type(hdr);
 
 				ret = l2arc_apply_transforms(spa, hdr, asize,
 				    &to_write);
 				if (ret != 0) {
 					arc_hdr_clear_flags(hdr,
 					    ARC_FLAG_L2_WRITING);
 					mutex_exit(hash_lock);
 					continue;
 				}
 
 				l2arc_free_abd_on_write(to_write, asize, type);
 			}
 
 			if (pio == NULL) {
 				/*
 				 * Insert a dummy header on the buflist so
 				 * l2arc_write_done() can find where the
 				 * write buffers begin without searching.
 				 */
 				mutex_enter(&dev->l2ad_mtx);
 				list_insert_head(&dev->l2ad_buflist, head);
 				mutex_exit(&dev->l2ad_mtx);
 
 				cb = kmem_alloc(
 				    sizeof (l2arc_write_callback_t), KM_SLEEP);
 				cb->l2wcb_dev = dev;
 				cb->l2wcb_head = head;
 				/*
 				 * Create a list to save allocated abd buffers
 				 * for l2arc_log_blk_commit().
 				 */
 				list_create(&cb->l2wcb_abd_list,
 				    sizeof (l2arc_lb_abd_buf_t),
 				    offsetof(l2arc_lb_abd_buf_t, node));
 				pio = zio_root(spa, l2arc_write_done, cb,
 				    ZIO_FLAG_CANFAIL);
 			}
 
 			hdr->b_l2hdr.b_dev = dev;
 			hdr->b_l2hdr.b_hits = 0;
 
 			hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
 			hdr->b_l2hdr.b_arcs_state =
 			    hdr->b_l1hdr.b_state->arcs_state;
 			arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR);
 
 			mutex_enter(&dev->l2ad_mtx);
 			list_insert_head(&dev->l2ad_buflist, hdr);
 			mutex_exit(&dev->l2ad_mtx);
 
 			(void) zfs_refcount_add_many(&dev->l2ad_alloc,
 			    arc_hdr_size(hdr), hdr);
 
 			wzio = zio_write_phys(pio, dev->l2ad_vdev,
 			    hdr->b_l2hdr.b_daddr, asize, to_write,
 			    ZIO_CHECKSUM_OFF, NULL, hdr,
 			    ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_CANFAIL, B_FALSE);
 
 			write_lsize += HDR_GET_LSIZE(hdr);
 			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
 			    zio_t *, wzio);
 
 			write_psize += psize;
 			write_asize += asize;
 			dev->l2ad_hand += asize;
 			l2arc_hdr_arcstats_increment(hdr);
 			vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 
 			mutex_exit(hash_lock);
 
 			/*
 			 * Append buf info to current log and commit if full.
 			 * arcstat_l2_{size,asize} kstats are updated
 			 * internally.
 			 */
 			if (l2arc_log_blk_insert(dev, hdr)) {
 				/*
 				 * l2ad_hand will be adjusted in
 				 * l2arc_log_blk_commit().
 				 */
 				write_asize +=
 				    l2arc_log_blk_commit(dev, pio, cb);
 			}
 
 			zio_nowait(wzio);
 		}
 
 		multilist_sublist_unlock(mls);
 
 		if (full == B_TRUE)
 			break;
 	}
 
 	/* No buffers selected for writing? */
 	if (pio == NULL) {
 		ASSERT0(write_lsize);
 		ASSERT(!HDR_HAS_L1HDR(head));
 		kmem_cache_free(hdr_l2only_cache, head);
 
 		/*
 		 * Although we did not write any buffers l2ad_evict may
 		 * have advanced.
 		 */
 		if (dev->l2ad_evict != l2dhdr->dh_evict)
 			l2arc_dev_hdr_update(dev);
 
 		return (0);
 	}
 
 	if (!dev->l2ad_first)
 		ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
 
 	ASSERT3U(write_asize, <=, target_sz);
 	ARCSTAT_BUMP(arcstat_l2_writes_sent);
 	ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
 
 	dev->l2ad_writing = B_TRUE;
 	(void) zio_wait(pio);
 	dev->l2ad_writing = B_FALSE;
 
 	/*
 	 * Update the device header after the zio completes as
 	 * l2arc_write_done() may have updated the memory holding the log block
 	 * pointers in the device header.
 	 */
 	l2arc_dev_hdr_update(dev);
 
 	return (write_asize);
 }
 
 static boolean_t
 l2arc_hdr_limit_reached(void)
 {
 	int64_t s = aggsum_upper_bound(&arc_sums.arcstat_l2_hdr_size);
 
 	return (arc_reclaim_needed() ||
 	    (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100));
 }
 
 /*
  * This thread feeds the L2ARC at regular intervals.  This is the beating
  * heart of the L2ARC.
  */
 static  __attribute__((noreturn)) void
 l2arc_feed_thread(void *unused)
 {
 	(void) unused;
 	callb_cpr_t cpr;
 	l2arc_dev_t *dev;
 	spa_t *spa;
 	uint64_t size, wrote;
 	clock_t begin, next = ddi_get_lbolt();
 	fstrans_cookie_t cookie;
 
 	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
 
 	mutex_enter(&l2arc_feed_thr_lock);
 
 	cookie = spl_fstrans_mark();
 	while (l2arc_thread_exit == 0) {
 		CALLB_CPR_SAFE_BEGIN(&cpr);
 		(void) cv_timedwait_idle(&l2arc_feed_thr_cv,
 		    &l2arc_feed_thr_lock, next);
 		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
 		next = ddi_get_lbolt() + hz;
 
 		/*
 		 * Quick check for L2ARC devices.
 		 */
 		mutex_enter(&l2arc_dev_mtx);
 		if (l2arc_ndev == 0) {
 			mutex_exit(&l2arc_dev_mtx);
 			continue;
 		}
 		mutex_exit(&l2arc_dev_mtx);
 		begin = ddi_get_lbolt();
 
 		/*
 		 * This selects the next l2arc device to write to, and in
 		 * doing so the next spa to feed from: dev->l2ad_spa.   This
 		 * will return NULL if there are now no l2arc devices or if
 		 * they are all faulted.
 		 *
 		 * If a device is returned, its spa's config lock is also
 		 * held to prevent device removal.  l2arc_dev_get_next()
 		 * will grab and release l2arc_dev_mtx.
 		 */
 		if ((dev = l2arc_dev_get_next()) == NULL)
 			continue;
 
 		spa = dev->l2ad_spa;
 		ASSERT3P(spa, !=, NULL);
 
 		/*
 		 * If the pool is read-only then force the feed thread to
 		 * sleep a little longer.
 		 */
 		if (!spa_writeable(spa)) {
 			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
 			spa_config_exit(spa, SCL_L2ARC, dev);
 			continue;
 		}
 
 		/*
 		 * Avoid contributing to memory pressure.
 		 */
 		if (l2arc_hdr_limit_reached()) {
 			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
 			spa_config_exit(spa, SCL_L2ARC, dev);
 			continue;
 		}
 
 		ARCSTAT_BUMP(arcstat_l2_feeds);
 
 		size = l2arc_write_size(dev);
 
 		/*
 		 * Evict L2ARC buffers that will be overwritten.
 		 */
 		l2arc_evict(dev, size, B_FALSE);
 
 		/*
 		 * Write ARC buffers.
 		 */
 		wrote = l2arc_write_buffers(spa, dev, size);
 
 		/*
 		 * Calculate interval between writes.
 		 */
 		next = l2arc_write_interval(begin, size, wrote);
 		spa_config_exit(spa, SCL_L2ARC, dev);
 	}
 	spl_fstrans_unmark(cookie);
 
 	l2arc_thread_exit = 0;
 	cv_broadcast(&l2arc_feed_thr_cv);
 	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
 	thread_exit();
 }
 
 boolean_t
 l2arc_vdev_present(vdev_t *vd)
 {
 	return (l2arc_vdev_get(vd) != NULL);
 }
 
 /*
  * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if
  * the vdev_t isn't an L2ARC device.
  */
 l2arc_dev_t *
 l2arc_vdev_get(vdev_t *vd)
 {
 	l2arc_dev_t	*dev;
 
 	mutex_enter(&l2arc_dev_mtx);
 	for (dev = list_head(l2arc_dev_list); dev != NULL;
 	    dev = list_next(l2arc_dev_list, dev)) {
 		if (dev->l2ad_vdev == vd)
 			break;
 	}
 	mutex_exit(&l2arc_dev_mtx);
 
 	return (dev);
 }
 
 static void
 l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen)
 {
 	l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
 	uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize;
 	spa_t *spa = dev->l2ad_spa;
 
 	/*
 	 * The L2ARC has to hold at least the payload of one log block for
 	 * them to be restored (persistent L2ARC). The payload of a log block
 	 * depends on the amount of its log entries. We always write log blocks
 	 * with 1022 entries. How many of them are committed or restored depends
 	 * on the size of the L2ARC device. Thus the maximum payload of
 	 * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device
 	 * is less than that, we reduce the amount of committed and restored
 	 * log entries per block so as to enable persistence.
 	 */
 	if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) {
 		dev->l2ad_log_entries = 0;
 	} else {
 		dev->l2ad_log_entries = MIN((dev->l2ad_end -
 		    dev->l2ad_start) >> SPA_MAXBLOCKSHIFT,
 		    L2ARC_LOG_BLK_MAX_ENTRIES);
 	}
 
 	/*
 	 * Read the device header, if an error is returned do not rebuild L2ARC.
 	 */
 	if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) {
 		/*
 		 * If we are onlining a cache device (vdev_reopen) that was
 		 * still present (l2arc_vdev_present()) and rebuild is enabled,
 		 * we should evict all ARC buffers and pointers to log blocks
 		 * and reclaim their space before restoring its contents to
 		 * L2ARC.
 		 */
 		if (reopen) {
 			if (!l2arc_rebuild_enabled) {
 				return;
 			} else {
 				l2arc_evict(dev, 0, B_TRUE);
 				/* start a new log block */
 				dev->l2ad_log_ent_idx = 0;
 				dev->l2ad_log_blk_payload_asize = 0;
 				dev->l2ad_log_blk_payload_start = 0;
 			}
 		}
 		/*
 		 * Just mark the device as pending for a rebuild. We won't
 		 * be starting a rebuild in line here as it would block pool
 		 * import. Instead spa_load_impl will hand that off to an
 		 * async task which will call l2arc_spa_rebuild_start.
 		 */
 		dev->l2ad_rebuild = B_TRUE;
 	} else if (spa_writeable(spa)) {
 		/*
 		 * In this case TRIM the whole device if l2arc_trim_ahead > 0,
 		 * otherwise create a new header. We zero out the memory holding
 		 * the header to reset dh_start_lbps. If we TRIM the whole
 		 * device the new header will be written by
 		 * vdev_trim_l2arc_thread() at the end of the TRIM to update the
 		 * trim_state in the header too. When reading the header, if
 		 * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0
 		 * we opt to TRIM the whole device again.
 		 */
 		if (l2arc_trim_ahead > 0) {
 			dev->l2ad_trim_all = B_TRUE;
 		} else {
 			memset(l2dhdr, 0, l2dhdr_asize);
 			l2arc_dev_hdr_update(dev);
 		}
 	}
 }
 
 /*
  * Add a vdev for use by the L2ARC.  By this point the spa has already
  * validated the vdev and opened it.
  */
 void
 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
 {
 	l2arc_dev_t		*adddev;
 	uint64_t		l2dhdr_asize;
 
 	ASSERT(!l2arc_vdev_present(vd));
 
 	/*
 	 * Create a new l2arc device entry.
 	 */
 	adddev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
 	adddev->l2ad_spa = spa;
 	adddev->l2ad_vdev = vd;
 	/* leave extra size for an l2arc device header */
 	l2dhdr_asize = adddev->l2ad_dev_hdr_asize =
 	    MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift);
 	adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize;
 	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
 	ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end);
 	adddev->l2ad_hand = adddev->l2ad_start;
 	adddev->l2ad_evict = adddev->l2ad_start;
 	adddev->l2ad_first = B_TRUE;
 	adddev->l2ad_writing = B_FALSE;
 	adddev->l2ad_trim_all = B_FALSE;
 	list_link_init(&adddev->l2ad_node);
 	adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP);
 
 	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
 	/*
 	 * This is a list of all ARC buffers that are still valid on the
 	 * device.
 	 */
 	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
 	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
 
 	/*
 	 * This is a list of pointers to log blocks that are still present
 	 * on the device.
 	 */
 	list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t),
 	    offsetof(l2arc_lb_ptr_buf_t, node));
 
 	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
 	zfs_refcount_create(&adddev->l2ad_alloc);
 	zfs_refcount_create(&adddev->l2ad_lb_asize);
 	zfs_refcount_create(&adddev->l2ad_lb_count);
 
 	/*
 	 * Decide if dev is eligible for L2ARC rebuild or whole device
 	 * trimming. This has to happen before the device is added in the
 	 * cache device list and l2arc_dev_mtx is released. Otherwise
 	 * l2arc_feed_thread() might already start writing on the
 	 * device.
 	 */
 	l2arc_rebuild_dev(adddev, B_FALSE);
 
 	/*
 	 * Add device to global list
 	 */
 	mutex_enter(&l2arc_dev_mtx);
 	list_insert_head(l2arc_dev_list, adddev);
 	atomic_inc_64(&l2arc_ndev);
 	mutex_exit(&l2arc_dev_mtx);
 }
 
 /*
  * Decide if a vdev is eligible for L2ARC rebuild, called from vdev_reopen()
  * in case of onlining a cache device.
  */
 void
 l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
 {
 	l2arc_dev_t		*dev = NULL;
 
 	dev = l2arc_vdev_get(vd);
 	ASSERT3P(dev, !=, NULL);
 
 	/*
 	 * In contrast to l2arc_add_vdev() we do not have to worry about
 	 * l2arc_feed_thread() invalidating previous content when onlining a
 	 * cache device. The device parameters (l2ad*) are not cleared when
 	 * offlining the device and writing new buffers will not invalidate
 	 * all previous content. In worst case only buffers that have not had
 	 * their log block written to the device will be lost.
 	 * When onlining the cache device (ie offline->online without exporting
 	 * the pool in between) this happens:
 	 * vdev_reopen() -> vdev_open() -> l2arc_rebuild_vdev()
 	 * 			|			|
 	 * 		vdev_is_dead() = B_FALSE	l2ad_rebuild = B_TRUE
 	 * During the time where vdev_is_dead = B_FALSE and until l2ad_rebuild
 	 * is set to B_TRUE we might write additional buffers to the device.
 	 */
 	l2arc_rebuild_dev(dev, reopen);
 }
 
 /*
  * Remove a vdev from the L2ARC.
  */
 void
 l2arc_remove_vdev(vdev_t *vd)
 {
 	l2arc_dev_t *remdev = NULL;
 
 	/*
 	 * Find the device by vdev
 	 */
 	remdev = l2arc_vdev_get(vd);
 	ASSERT3P(remdev, !=, NULL);
 
 	/*
 	 * Cancel any ongoing or scheduled rebuild.
 	 */
 	mutex_enter(&l2arc_rebuild_thr_lock);
 	if (remdev->l2ad_rebuild_began == B_TRUE) {
 		remdev->l2ad_rebuild_cancel = B_TRUE;
 		while (remdev->l2ad_rebuild == B_TRUE)
 			cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock);
 	}
 	mutex_exit(&l2arc_rebuild_thr_lock);
 
 	/*
 	 * Remove device from global list
 	 */
 	mutex_enter(&l2arc_dev_mtx);
 	list_remove(l2arc_dev_list, remdev);
 	l2arc_dev_last = NULL;		/* may have been invalidated */
 	atomic_dec_64(&l2arc_ndev);
 	mutex_exit(&l2arc_dev_mtx);
 
 	/*
 	 * Clear all buflists and ARC references.  L2ARC device flush.
 	 */
 	l2arc_evict(remdev, 0, B_TRUE);
 	list_destroy(&remdev->l2ad_buflist);
 	ASSERT(list_is_empty(&remdev->l2ad_lbptr_list));
 	list_destroy(&remdev->l2ad_lbptr_list);
 	mutex_destroy(&remdev->l2ad_mtx);
 	zfs_refcount_destroy(&remdev->l2ad_alloc);
 	zfs_refcount_destroy(&remdev->l2ad_lb_asize);
 	zfs_refcount_destroy(&remdev->l2ad_lb_count);
 	kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize);
 	vmem_free(remdev, sizeof (l2arc_dev_t));
 }
 
 void
 l2arc_init(void)
 {
 	l2arc_thread_exit = 0;
 	l2arc_ndev = 0;
 
 	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
 
 	l2arc_dev_list = &L2ARC_dev_list;
 	l2arc_free_on_write = &L2ARC_free_on_write;
 	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
 	    offsetof(l2arc_dev_t, l2ad_node));
 	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
 	    offsetof(l2arc_data_free_t, l2df_list_node));
 }
 
 void
 l2arc_fini(void)
 {
 	mutex_destroy(&l2arc_feed_thr_lock);
 	cv_destroy(&l2arc_feed_thr_cv);
 	mutex_destroy(&l2arc_rebuild_thr_lock);
 	cv_destroy(&l2arc_rebuild_thr_cv);
 	mutex_destroy(&l2arc_dev_mtx);
 	mutex_destroy(&l2arc_free_on_write_mtx);
 
 	list_destroy(l2arc_dev_list);
 	list_destroy(l2arc_free_on_write);
 }
 
 void
 l2arc_start(void)
 {
 	if (!(spa_mode_global & SPA_MODE_WRITE))
 		return;
 
 	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
 	    TS_RUN, defclsyspri);
 }
 
 void
 l2arc_stop(void)
 {
 	if (!(spa_mode_global & SPA_MODE_WRITE))
 		return;
 
 	mutex_enter(&l2arc_feed_thr_lock);
 	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
 	l2arc_thread_exit = 1;
 	while (l2arc_thread_exit != 0)
 		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
 	mutex_exit(&l2arc_feed_thr_lock);
 }
 
 /*
  * Punches out rebuild threads for the L2ARC devices in a spa. This should
  * be called after pool import from the spa async thread, since starting
  * these threads directly from spa_import() will make them part of the
  * "zpool import" context and delay process exit (and thus pool import).
  */
 void
 l2arc_spa_rebuild_start(spa_t *spa)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	/*
 	 * Locate the spa's l2arc devices and kick off rebuild threads.
 	 */
 	for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
 		l2arc_dev_t *dev =
 		    l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
 		if (dev == NULL) {
 			/* Don't attempt a rebuild if the vdev is UNAVAIL */
 			continue;
 		}
 		mutex_enter(&l2arc_rebuild_thr_lock);
 		if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) {
 			dev->l2ad_rebuild_began = B_TRUE;
 			(void) thread_create(NULL, 0, l2arc_dev_rebuild_thread,
 			    dev, 0, &p0, TS_RUN, minclsyspri);
 		}
 		mutex_exit(&l2arc_rebuild_thr_lock);
 	}
 }
 
 /*
  * Main entry point for L2ARC rebuilding.
  */
 static __attribute__((noreturn)) void
 l2arc_dev_rebuild_thread(void *arg)
 {
 	l2arc_dev_t *dev = arg;
 
 	VERIFY(!dev->l2ad_rebuild_cancel);
 	VERIFY(dev->l2ad_rebuild);
 	(void) l2arc_rebuild(dev);
 	mutex_enter(&l2arc_rebuild_thr_lock);
 	dev->l2ad_rebuild_began = B_FALSE;
 	dev->l2ad_rebuild = B_FALSE;
 	mutex_exit(&l2arc_rebuild_thr_lock);
 
 	thread_exit();
 }
 
 /*
  * This function implements the actual L2ARC metadata rebuild. It:
  * starts reading the log block chain and restores each block's contents
  * to memory (reconstructing arc_buf_hdr_t's).
  *
  * Operation stops under any of the following conditions:
  *
  * 1) We reach the end of the log block chain.
  * 2) We encounter *any* error condition (cksum errors, io errors)
  */
 static int
 l2arc_rebuild(l2arc_dev_t *dev)
 {
 	vdev_t			*vd = dev->l2ad_vdev;
 	spa_t			*spa = vd->vdev_spa;
 	int			err = 0;
 	l2arc_dev_hdr_phys_t	*l2dhdr = dev->l2ad_dev_hdr;
 	l2arc_log_blk_phys_t	*this_lb, *next_lb;
 	zio_t			*this_io = NULL, *next_io = NULL;
 	l2arc_log_blkptr_t	lbps[2];
 	l2arc_lb_ptr_buf_t	*lb_ptr_buf;
 	boolean_t		lock_held;
 
 	this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP);
 	next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP);
 
 	/*
 	 * We prevent device removal while issuing reads to the device,
 	 * then during the rebuilding phases we drop this lock again so
 	 * that a spa_unload or device remove can be initiated - this is
 	 * safe, because the spa will signal us to stop before removing
 	 * our device and wait for us to stop.
 	 */
 	spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
 	lock_held = B_TRUE;
 
 	/*
 	 * Retrieve the persistent L2ARC device state.
 	 * L2BLK_GET_PSIZE returns aligned size for log blocks.
 	 */
 	dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start);
 	dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr +
 	    L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop),
 	    dev->l2ad_start);
 	dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
 
 	vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time;
 	vd->vdev_trim_state = l2dhdr->dh_trim_state;
 
 	/*
 	 * In case the zfs module parameter l2arc_rebuild_enabled is false
 	 * we do not start the rebuild process.
 	 */
 	if (!l2arc_rebuild_enabled)
 		goto out;
 
 	/* Prepare the rebuild process */
 	memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps));
 
 	/* Start the rebuild process */
 	for (;;) {
 		if (!l2arc_log_blkptr_valid(dev, &lbps[0]))
 			break;
 
 		if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1],
 		    this_lb, next_lb, this_io, &next_io)) != 0)
 			goto out;
 
 		/*
 		 * Our memory pressure valve. If the system is running low
 		 * on memory, rather than swamping memory with new ARC buf
 		 * hdrs, we opt not to rebuild the L2ARC. At this point,
 		 * however, we have already set up our L2ARC dev to chain in
 		 * new metadata log blocks, so the user may choose to offline/
 		 * online the L2ARC dev at a later time (or re-import the pool)
 		 * to reconstruct it (when there's less memory pressure).
 		 */
 		if (l2arc_hdr_limit_reached()) {
 			ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
 			cmn_err(CE_NOTE, "System running low on memory, "
 			    "aborting L2ARC rebuild.");
 			err = SET_ERROR(ENOMEM);
 			goto out;
 		}
 
 		spa_config_exit(spa, SCL_L2ARC, vd);
 		lock_held = B_FALSE;
 
 		/*
 		 * Now that we know that the next_lb checks out alright, we
 		 * can start reconstruction from this log block.
 		 * L2BLK_GET_PSIZE returns aligned size for log blocks.
 		 */
 		uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
 		l2arc_log_blk_restore(dev, this_lb, asize);
 
 		/*
 		 * log block restored, include its pointer in the list of
 		 * pointers to log blocks present in the L2ARC device.
 		 */
 		lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
 		lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t),
 		    KM_SLEEP);
 		memcpy(lb_ptr_buf->lb_ptr, &lbps[0],
 		    sizeof (l2arc_log_blkptr_t));
 		mutex_enter(&dev->l2ad_mtx);
 		list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf);
 		ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
 		ARCSTAT_BUMP(arcstat_l2_log_blk_count);
 		zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
 		zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
 		mutex_exit(&dev->l2ad_mtx);
 		vdev_space_update(vd, asize, 0, 0);
 
 		/*
 		 * Protection against loops of log blocks:
 		 *
 		 *				       l2ad_hand  l2ad_evict
 		 *                                         V	      V
 		 * l2ad_start |=======================================| l2ad_end
 		 *             -----|||----|||---|||----|||
 		 *                  (3)    (2)   (1)    (0)
 		 *             ---|||---|||----|||---|||
 		 *		  (7)   (6)    (5)   (4)
 		 *
 		 * In this situation the pointer of log block (4) passes
 		 * l2arc_log_blkptr_valid() but the log block should not be
 		 * restored as it is overwritten by the payload of log block
 		 * (0). Only log blocks (0)-(3) should be restored. We check
 		 * whether l2ad_evict lies in between the payload starting
 		 * offset of the next log block (lbps[1].lbp_payload_start)
 		 * and the payload starting offset of the present log block
 		 * (lbps[0].lbp_payload_start). If true and this isn't the
 		 * first pass, we are looping from the beginning and we should
 		 * stop.
 		 */
 		if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
 		    lbps[0].lbp_payload_start, dev->l2ad_evict) &&
 		    !dev->l2ad_first)
 			goto out;
 
 		kpreempt(KPREEMPT_SYNC);
 		for (;;) {
 			mutex_enter(&l2arc_rebuild_thr_lock);
 			if (dev->l2ad_rebuild_cancel) {
 				dev->l2ad_rebuild = B_FALSE;
 				cv_signal(&l2arc_rebuild_thr_cv);
 				mutex_exit(&l2arc_rebuild_thr_lock);
 				err = SET_ERROR(ECANCELED);
 				goto out;
 			}
 			mutex_exit(&l2arc_rebuild_thr_lock);
 			if (spa_config_tryenter(spa, SCL_L2ARC, vd,
 			    RW_READER)) {
 				lock_held = B_TRUE;
 				break;
 			}
 			/*
 			 * L2ARC config lock held by somebody in writer,
 			 * possibly due to them trying to remove us. They'll
 			 * likely to want us to shut down, so after a little
 			 * delay, we check l2ad_rebuild_cancel and retry
 			 * the lock again.
 			 */
 			delay(1);
 		}
 
 		/*
 		 * Continue with the next log block.
 		 */
 		lbps[0] = lbps[1];
 		lbps[1] = this_lb->lb_prev_lbp;
 		PTR_SWAP(this_lb, next_lb);
 		this_io = next_io;
 		next_io = NULL;
 	}
 
 	if (this_io != NULL)
 		l2arc_log_blk_fetch_abort(this_io);
 out:
 	if (next_io != NULL)
 		l2arc_log_blk_fetch_abort(next_io);
 	vmem_free(this_lb, sizeof (*this_lb));
 	vmem_free(next_lb, sizeof (*next_lb));
 
 	if (!l2arc_rebuild_enabled) {
 		spa_history_log_internal(spa, "L2ARC rebuild", NULL,
 		    "disabled");
 	} else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) {
 		ARCSTAT_BUMP(arcstat_l2_rebuild_success);
 		spa_history_log_internal(spa, "L2ARC rebuild", NULL,
 		    "successful, restored %llu blocks",
 		    (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
 	} else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) {
 		/*
 		 * No error but also nothing restored, meaning the lbps array
 		 * in the device header points to invalid/non-present log
 		 * blocks. Reset the header.
 		 */
 		spa_history_log_internal(spa, "L2ARC rebuild", NULL,
 		    "no valid log blocks");
 		memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize);
 		l2arc_dev_hdr_update(dev);
 	} else if (err == ECANCELED) {
 		/*
 		 * In case the rebuild was canceled do not log to spa history
 		 * log as the pool may be in the process of being removed.
 		 */
 		zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks",
 		    (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
 	} else if (err != 0) {
 		spa_history_log_internal(spa, "L2ARC rebuild", NULL,
 		    "aborted, restored %llu blocks",
 		    (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
 	}
 
 	if (lock_held)
 		spa_config_exit(spa, SCL_L2ARC, vd);
 
 	return (err);
 }
 
 /*
  * Attempts to read the device header on the provided L2ARC device and writes
  * it to `hdr'. On success, this function returns 0, otherwise the appropriate
  * error code is returned.
  */
 static int
 l2arc_dev_hdr_read(l2arc_dev_t *dev)
 {
 	int			err;
 	uint64_t		guid;
 	l2arc_dev_hdr_phys_t	*l2dhdr = dev->l2ad_dev_hdr;
 	const uint64_t		l2dhdr_asize = dev->l2ad_dev_hdr_asize;
 	abd_t 			*abd;
 
 	guid = spa_guid(dev->l2ad_vdev->vdev_spa);
 
 	abd = abd_get_from_buf(l2dhdr, l2dhdr_asize);
 
 	err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
 	    VDEV_LABEL_START_SIZE, l2dhdr_asize, abd,
 	    ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
 	    ZIO_FLAG_SPECULATIVE, B_FALSE));
 
 	abd_free(abd);
 
 	if (err != 0) {
 		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors);
 		zfs_dbgmsg("L2ARC IO error (%d) while reading device header, "
 		    "vdev guid: %llu", err,
 		    (u_longlong_t)dev->l2ad_vdev->vdev_guid);
 		return (err);
 	}
 
 	if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
 		byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr));
 
 	if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC ||
 	    l2dhdr->dh_spa_guid != guid ||
 	    l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid ||
 	    l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION ||
 	    l2dhdr->dh_log_entries != dev->l2ad_log_entries ||
 	    l2dhdr->dh_end != dev->l2ad_end ||
 	    !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end,
 	    l2dhdr->dh_evict) ||
 	    (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE &&
 	    l2arc_trim_ahead > 0)) {
 		/*
 		 * Attempt to rebuild a device containing no actual dev hdr
 		 * or containing a header from some other pool or from another
 		 * version of persistent L2ARC.
 		 */
 		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	return (0);
 }
 
 /*
  * Reads L2ARC log blocks from storage and validates their contents.
  *
  * This function implements a simple fetcher to make sure that while
  * we're processing one buffer the L2ARC is already fetching the next
  * one in the chain.
  *
  * The arguments this_lp and next_lp point to the current and next log block
  * address in the block chain. Similarly, this_lb and next_lb hold the
  * l2arc_log_blk_phys_t's of the current and next L2ARC blk.
  *
  * The `this_io' and `next_io' arguments are used for block fetching.
  * When issuing the first blk IO during rebuild, you should pass NULL for
  * `this_io'. This function will then issue a sync IO to read the block and
  * also issue an async IO to fetch the next block in the block chain. The
  * fetched IO is returned in `next_io'. On subsequent calls to this
  * function, pass the value returned in `next_io' from the previous call
  * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO.
  * Prior to the call, you should initialize your `next_io' pointer to be
  * NULL. If no fetch IO was issued, the pointer is left set at NULL.
  *
  * On success, this function returns 0, otherwise it returns an appropriate
  * error code. On error the fetching IO is aborted and cleared before
  * returning from this function. Therefore, if we return `success', the
  * caller can assume that we have taken care of cleanup of fetch IOs.
  */
 static int
 l2arc_log_blk_read(l2arc_dev_t *dev,
     const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp,
     l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
     zio_t *this_io, zio_t **next_io)
 {
 	int		err = 0;
 	zio_cksum_t	cksum;
 	abd_t		*abd = NULL;
 	uint64_t	asize;
 
 	ASSERT(this_lbp != NULL && next_lbp != NULL);
 	ASSERT(this_lb != NULL && next_lb != NULL);
 	ASSERT(next_io != NULL && *next_io == NULL);
 	ASSERT(l2arc_log_blkptr_valid(dev, this_lbp));
 
 	/*
 	 * Check to see if we have issued the IO for this log block in a
 	 * previous run. If not, this is the first call, so issue it now.
 	 */
 	if (this_io == NULL) {
 		this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp,
 		    this_lb);
 	}
 
 	/*
 	 * Peek to see if we can start issuing the next IO immediately.
 	 */
 	if (l2arc_log_blkptr_valid(dev, next_lbp)) {
 		/*
 		 * Start issuing IO for the next log block early - this
 		 * should help keep the L2ARC device busy while we
 		 * decompress and restore this log block.
 		 */
 		*next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp,
 		    next_lb);
 	}
 
 	/* Wait for the IO to read this log block to complete */
 	if ((err = zio_wait(this_io)) != 0) {
 		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
 		zfs_dbgmsg("L2ARC IO error (%d) while reading log block, "
 		    "offset: %llu, vdev guid: %llu", err,
 		    (u_longlong_t)this_lbp->lbp_daddr,
 		    (u_longlong_t)dev->l2ad_vdev->vdev_guid);
 		goto cleanup;
 	}
 
 	/*
 	 * Make sure the buffer checks out.
 	 * L2BLK_GET_PSIZE returns aligned size for log blocks.
 	 */
 	asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop);
 	fletcher_4_native(this_lb, asize, NULL, &cksum);
 	if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) {
 		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors);
 		zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, "
 		    "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu",
 		    (u_longlong_t)this_lbp->lbp_daddr,
 		    (u_longlong_t)dev->l2ad_vdev->vdev_guid,
 		    (u_longlong_t)dev->l2ad_hand,
 		    (u_longlong_t)dev->l2ad_evict);
 		err = SET_ERROR(ECKSUM);
 		goto cleanup;
 	}
 
 	/* Now we can take our time decoding this buffer */
 	switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) {
 	case ZIO_COMPRESS_OFF:
 		break;
 	case ZIO_COMPRESS_LZ4:
 		abd = abd_alloc_for_io(asize, B_TRUE);
 		abd_copy_from_buf_off(abd, this_lb, 0, asize);
 		if ((err = zio_decompress_data(
 		    L2BLK_GET_COMPRESS((this_lbp)->lbp_prop),
 		    abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) {
 			err = SET_ERROR(EINVAL);
 			goto cleanup;
 		}
 		break;
 	default:
 		err = SET_ERROR(EINVAL);
 		goto cleanup;
 	}
 	if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
 		byteswap_uint64_array(this_lb, sizeof (*this_lb));
 	if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) {
 		err = SET_ERROR(EINVAL);
 		goto cleanup;
 	}
 cleanup:
 	/* Abort an in-flight fetch I/O in case of error */
 	if (err != 0 && *next_io != NULL) {
 		l2arc_log_blk_fetch_abort(*next_io);
 		*next_io = NULL;
 	}
 	if (abd != NULL)
 		abd_free(abd);
 	return (err);
 }
 
 /*
  * Restores the payload of a log block to ARC. This creates empty ARC hdr
  * entries which only contain an l2arc hdr, essentially restoring the
  * buffers to their L2ARC evicted state. This function also updates space
  * usage on the L2ARC vdev to make sure it tracks restored buffers.
  */
 static void
 l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb,
     uint64_t lb_asize)
 {
 	uint64_t	size = 0, asize = 0;
 	uint64_t	log_entries = dev->l2ad_log_entries;
 
 	/*
 	 * Usually arc_adapt() is called only for data, not headers, but
 	 * since we may allocate significant amount of memory here, let ARC
 	 * grow its arc_c.
 	 */
 	arc_adapt(log_entries * HDR_L2ONLY_SIZE);
 
 	for (int i = log_entries - 1; i >= 0; i--) {
 		/*
 		 * Restore goes in the reverse temporal direction to preserve
 		 * correct temporal ordering of buffers in the l2ad_buflist.
 		 * l2arc_hdr_restore also does a list_insert_tail instead of
 		 * list_insert_head on the l2ad_buflist:
 		 *
 		 *		LIST	l2ad_buflist		LIST
 		 *		HEAD  <------ (time) ------	TAIL
 		 * direction	+-----+-----+-----+-----+-----+    direction
 		 * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild
 		 * fill		+-----+-----+-----+-----+-----+
 		 *		^				^
 		 *		|				|
 		 *		|				|
 		 *	l2arc_feed_thread		l2arc_rebuild
 		 *	will place new bufs here	restores bufs here
 		 *
 		 * During l2arc_rebuild() the device is not used by
 		 * l2arc_feed_thread() as dev->l2ad_rebuild is set to true.
 		 */
 		size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop);
 		asize += vdev_psize_to_asize(dev->l2ad_vdev,
 		    L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop));
 		l2arc_hdr_restore(&lb->lb_entries[i], dev);
 	}
 
 	/*
 	 * Record rebuild stats:
 	 *	size		Logical size of restored buffers in the L2ARC
 	 *	asize		Aligned size of restored buffers in the L2ARC
 	 */
 	ARCSTAT_INCR(arcstat_l2_rebuild_size, size);
 	ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize);
 	ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries);
 	ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize);
 	ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize);
 	ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
 }
 
 /*
  * Restores a single ARC buf hdr from a log entry. The ARC buffer is put
  * into a state indicating that it has been evicted to L2ARC.
  */
 static void
 l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev)
 {
 	arc_buf_hdr_t		*hdr, *exists;
 	kmutex_t		*hash_lock;
 	arc_buf_contents_t	type = L2BLK_GET_TYPE((le)->le_prop);
 	uint64_t		asize;
 
 	/*
 	 * Do all the allocation before grabbing any locks, this lets us
 	 * sleep if memory is full and we don't have to deal with failed
 	 * allocations.
 	 */
 	hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type,
 	    dev, le->le_dva, le->le_daddr,
 	    L2BLK_GET_PSIZE((le)->le_prop), le->le_birth,
 	    L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel,
 	    L2BLK_GET_PROTECTED((le)->le_prop),
 	    L2BLK_GET_PREFETCH((le)->le_prop),
 	    L2BLK_GET_STATE((le)->le_prop));
 	asize = vdev_psize_to_asize(dev->l2ad_vdev,
 	    L2BLK_GET_PSIZE((le)->le_prop));
 
 	/*
 	 * vdev_space_update() has to be called before arc_hdr_destroy() to
 	 * avoid underflow since the latter also calls vdev_space_update().
 	 */
 	l2arc_hdr_arcstats_increment(hdr);
 	vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 
 	mutex_enter(&dev->l2ad_mtx);
 	list_insert_tail(&dev->l2ad_buflist, hdr);
 	(void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
 	mutex_exit(&dev->l2ad_mtx);
 
 	exists = buf_hash_insert(hdr, &hash_lock);
 	if (exists) {
 		/* Buffer was already cached, no need to restore it. */
 		arc_hdr_destroy(hdr);
 		/*
 		 * If the buffer is already cached, check whether it has
 		 * L2ARC metadata. If not, enter them and update the flag.
 		 * This is important is case of onlining a cache device, since
 		 * we previously evicted all L2ARC metadata from ARC.
 		 */
 		if (!HDR_HAS_L2HDR(exists)) {
 			arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR);
 			exists->b_l2hdr.b_dev = dev;
 			exists->b_l2hdr.b_daddr = le->le_daddr;
 			exists->b_l2hdr.b_arcs_state =
 			    L2BLK_GET_STATE((le)->le_prop);
 			mutex_enter(&dev->l2ad_mtx);
 			list_insert_tail(&dev->l2ad_buflist, exists);
 			(void) zfs_refcount_add_many(&dev->l2ad_alloc,
 			    arc_hdr_size(exists), exists);
 			mutex_exit(&dev->l2ad_mtx);
 			l2arc_hdr_arcstats_increment(exists);
 			vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 		}
 		ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
 	}
 
 	mutex_exit(hash_lock);
 }
 
 /*
  * Starts an asynchronous read IO to read a log block. This is used in log
  * block reconstruction to start reading the next block before we are done
  * decoding and reconstructing the current block, to keep the l2arc device
  * nice and hot with read IO to process.
  * The returned zio will contain a newly allocated memory buffers for the IO
  * data which should then be freed by the caller once the zio is no longer
  * needed (i.e. due to it having completed). If you wish to abort this
  * zio, you should do so using l2arc_log_blk_fetch_abort, which takes
  * care of disposing of the allocated buffers correctly.
  */
 static zio_t *
 l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp,
     l2arc_log_blk_phys_t *lb)
 {
 	uint32_t		asize;
 	zio_t			*pio;
 	l2arc_read_callback_t	*cb;
 
 	/* L2BLK_GET_PSIZE returns aligned size for log blocks */
 	asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
 	ASSERT(asize <= sizeof (l2arc_log_blk_phys_t));
 
 	cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP);
 	cb->l2rcb_abd = abd_get_from_buf(lb, asize);
 	pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
 	(void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize,
 	    cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL,
 	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
 
 	return (pio);
 }
 
 /*
  * Aborts a zio returned from l2arc_log_blk_fetch and frees the data
  * buffers allocated for it.
  */
 static void
 l2arc_log_blk_fetch_abort(zio_t *zio)
 {
 	(void) zio_wait(zio);
 }
 
 /*
  * Creates a zio to update the device header on an l2arc device.
  */
 void
 l2arc_dev_hdr_update(l2arc_dev_t *dev)
 {
 	l2arc_dev_hdr_phys_t	*l2dhdr = dev->l2ad_dev_hdr;
 	const uint64_t		l2dhdr_asize = dev->l2ad_dev_hdr_asize;
 	abd_t			*abd;
 	int			err;
 
 	VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER));
 
 	l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC;
 	l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION;
 	l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
 	l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid;
 	l2dhdr->dh_log_entries = dev->l2ad_log_entries;
 	l2dhdr->dh_evict = dev->l2ad_evict;
 	l2dhdr->dh_start = dev->l2ad_start;
 	l2dhdr->dh_end = dev->l2ad_end;
 	l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize);
 	l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count);
 	l2dhdr->dh_flags = 0;
 	l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time;
 	l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state;
 	if (dev->l2ad_first)
 		l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
 
 	abd = abd_get_from_buf(l2dhdr, l2dhdr_asize);
 
 	err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev,
 	    VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL,
 	    NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE));
 
 	abd_free(abd);
 
 	if (err != 0) {
 		zfs_dbgmsg("L2ARC IO error (%d) while writing device header, "
 		    "vdev guid: %llu", err,
 		    (u_longlong_t)dev->l2ad_vdev->vdev_guid);
 	}
 }
 
 /*
  * Commits a log block to the L2ARC device. This routine is invoked from
  * l2arc_write_buffers when the log block fills up.
  * This function allocates some memory to temporarily hold the serialized
  * buffer to be written. This is then released in l2arc_write_done.
  */
 static uint64_t
 l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
 {
 	l2arc_log_blk_phys_t	*lb = &dev->l2ad_log_blk;
 	l2arc_dev_hdr_phys_t	*l2dhdr = dev->l2ad_dev_hdr;
 	uint64_t		psize, asize;
 	zio_t			*wzio;
 	l2arc_lb_abd_buf_t	*abd_buf;
 	uint8_t			*tmpbuf = NULL;
 	l2arc_lb_ptr_buf_t	*lb_ptr_buf;
 
 	VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries);
 
 	abd_buf = zio_buf_alloc(sizeof (*abd_buf));
 	abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb));
 	lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
 	lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP);
 
 	/* link the buffer into the block chain */
 	lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1];
 	lb->lb_magic = L2ARC_LOG_BLK_MAGIC;
 
 	/*
 	 * l2arc_log_blk_commit() may be called multiple times during a single
 	 * l2arc_write_buffers() call. Save the allocated abd buffers in a list
 	 * so we can free them in l2arc_write_done() later on.
 	 */
 	list_insert_tail(&cb->l2wcb_abd_list, abd_buf);
 
 	/* try to compress the buffer */
 	psize = zio_compress_data(ZIO_COMPRESS_LZ4,
 	    abd_buf->abd, (void **) &tmpbuf, sizeof (*lb), 0);
 
 	/* a log block is never entirely zero */
 	ASSERT(psize != 0);
 	asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
 	ASSERT(asize <= sizeof (*lb));
 
 	/*
 	 * Update the start log block pointer in the device header to point
 	 * to the log block we're about to write.
 	 */
 	l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0];
 	l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand;
 	l2dhdr->dh_start_lbps[0].lbp_payload_asize =
 	    dev->l2ad_log_blk_payload_asize;
 	l2dhdr->dh_start_lbps[0].lbp_payload_start =
 	    dev->l2ad_log_blk_payload_start;
 	L2BLK_SET_LSIZE(
 	    (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb));
 	L2BLK_SET_PSIZE(
 	    (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize);
 	L2BLK_SET_CHECKSUM(
 	    (&l2dhdr->dh_start_lbps[0])->lbp_prop,
 	    ZIO_CHECKSUM_FLETCHER_4);
 	if (asize < sizeof (*lb)) {
 		/* compression succeeded */
 		memset(tmpbuf + psize, 0, asize - psize);
 		L2BLK_SET_COMPRESS(
 		    (&l2dhdr->dh_start_lbps[0])->lbp_prop,
 		    ZIO_COMPRESS_LZ4);
 	} else {
 		/* compression failed */
 		memcpy(tmpbuf, lb, sizeof (*lb));
 		L2BLK_SET_COMPRESS(
 		    (&l2dhdr->dh_start_lbps[0])->lbp_prop,
 		    ZIO_COMPRESS_OFF);
 	}
 
 	/* checksum what we're about to write */
 	fletcher_4_native(tmpbuf, asize, NULL,
 	    &l2dhdr->dh_start_lbps[0].lbp_cksum);
 
 	abd_free(abd_buf->abd);
 
 	/* perform the write itself */
 	abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb));
 	abd_take_ownership_of_buf(abd_buf->abd, B_TRUE);
 	wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
 	    asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL,
 	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
 	DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
 	(void) zio_nowait(wzio);
 
 	dev->l2ad_hand += asize;
 	/*
 	 * Include the committed log block's pointer  in the list of pointers
 	 * to log blocks present in the L2ARC device.
 	 */
 	memcpy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[0],
 	    sizeof (l2arc_log_blkptr_t));
 	mutex_enter(&dev->l2ad_mtx);
 	list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf);
 	ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
 	ARCSTAT_BUMP(arcstat_l2_log_blk_count);
 	zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
 	zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
 	mutex_exit(&dev->l2ad_mtx);
 	vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 
 	/* bump the kstats */
 	ARCSTAT_INCR(arcstat_l2_write_bytes, asize);
 	ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
 	ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize);
 	ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
 	    dev->l2ad_log_blk_payload_asize / asize);
 
 	/* start a new log block */
 	dev->l2ad_log_ent_idx = 0;
 	dev->l2ad_log_blk_payload_asize = 0;
 	dev->l2ad_log_blk_payload_start = 0;
 
 	return (asize);
 }
 
 /*
  * Validates an L2ARC log block address to make sure that it can be read
  * from the provided L2ARC device.
  */
 boolean_t
 l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp)
 {
 	/* L2BLK_GET_PSIZE returns aligned size for log blocks */
 	uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
 	uint64_t end = lbp->lbp_daddr + asize - 1;
 	uint64_t start = lbp->lbp_payload_start;
 	boolean_t evicted = B_FALSE;
 
 	/*
 	 * A log block is valid if all of the following conditions are true:
 	 * - it fits entirely (including its payload) between l2ad_start and
 	 *   l2ad_end
 	 * - it has a valid size
 	 * - neither the log block itself nor part of its payload was evicted
 	 *   by l2arc_evict():
 	 *
 	 *		l2ad_hand          l2ad_evict
 	 *		|			 |	lbp_daddr
 	 *		|     start		 |	|  end
 	 *		|     |			 |	|  |
 	 *		V     V		         V	V  V
 	 *   l2ad_start ============================================ l2ad_end
 	 *                    --------------------------||||
 	 *				^		 ^
 	 *				|		log block
 	 *				payload
 	 */
 
 	evicted =
 	    l2arc_range_check_overlap(start, end, dev->l2ad_hand) ||
 	    l2arc_range_check_overlap(start, end, dev->l2ad_evict) ||
 	    l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) ||
 	    l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end);
 
 	return (start >= dev->l2ad_start && end <= dev->l2ad_end &&
 	    asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) &&
 	    (!evicted || dev->l2ad_first));
 }
 
 /*
  * Inserts ARC buffer header `hdr' into the current L2ARC log block on
  * the device. The buffer being inserted must be present in L2ARC.
  * Returns B_TRUE if the L2ARC log block is full and needs to be committed
  * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
  */
 static boolean_t
 l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
 {
 	l2arc_log_blk_phys_t	*lb = &dev->l2ad_log_blk;
 	l2arc_log_ent_phys_t	*le;
 
 	if (dev->l2ad_log_entries == 0)
 		return (B_FALSE);
 
 	int index = dev->l2ad_log_ent_idx++;
 
 	ASSERT3S(index, <, dev->l2ad_log_entries);
 	ASSERT(HDR_HAS_L2HDR(hdr));
 
 	le = &lb->lb_entries[index];
 	memset(le, 0, sizeof (*le));
 	le->le_dva = hdr->b_dva;
 	le->le_birth = hdr->b_birth;
 	le->le_daddr = hdr->b_l2hdr.b_daddr;
 	if (index == 0)
 		dev->l2ad_log_blk_payload_start = le->le_daddr;
 	L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr));
 	L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr));
 	L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr));
 	le->le_complevel = hdr->b_complevel;
 	L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
 	L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
 	L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
 	L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state);
 
 	dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
 	    HDR_GET_PSIZE(hdr));
 
 	return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries);
 }
 
 /*
  * Checks whether a given L2ARC device address sits in a time-sequential
  * range. The trick here is that the L2ARC is a rotary buffer, so we can't
  * just do a range comparison, we need to handle the situation in which the
  * range wraps around the end of the L2ARC device. Arguments:
  *	bottom -- Lower end of the range to check (written to earlier).
  *	top    -- Upper end of the range to check (written to later).
  *	check  -- The address for which we want to determine if it sits in
  *		  between the top and bottom.
  *
  * The 3-way conditional below represents the following cases:
  *
  *	bottom < top : Sequentially ordered case:
  *	  <check>--------+-------------------+
  *	                 |  (overlap here?)  |
  *	 L2ARC dev       V                   V
  *	 |---------------<bottom>============<top>--------------|
  *
  *	bottom > top: Looped-around case:
  *	                      <check>--------+------------------+
  *	                                     |  (overlap here?) |
  *	 L2ARC dev                           V                  V
  *	 |===============<top>---------------<bottom>===========|
  *	 ^               ^
  *	 |  (or here?)   |
  *	 +---------------+---------<check>
  *
  *	top == bottom : Just a single address comparison.
  */
 boolean_t
 l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
 {
 	if (bottom < top)
 		return (bottom <= check && check <= top);
 	else if (bottom > top)
 		return (check <= top || bottom <= check);
 	else
 		return (check == top);
 }
 
 EXPORT_SYMBOL(arc_buf_size);
 EXPORT_SYMBOL(arc_write);
 EXPORT_SYMBOL(arc_read);
 EXPORT_SYMBOL(arc_buf_info);
 EXPORT_SYMBOL(arc_getbuf_func);
 EXPORT_SYMBOL(arc_add_prune_callback);
 EXPORT_SYMBOL(arc_remove_prune_callback);
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min,
 	spl_param_get_u64, ZMOD_RW, "Minimum ARC size in bytes");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max,
 	spl_param_get_u64, ZMOD_RW, "Maximum ARC size in bytes");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_balance, UINT, ZMOD_RW,
 	"Balance between metadata and data on ghost hits.");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int,
 	param_get_uint, ZMOD_RW, "Seconds before growing ARC size");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int,
 	param_get_uint, ZMOD_RW, "log2(fraction of ARC to reclaim)");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW,
 	"Percent of pagecache to reclaim ARC to");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, UINT, ZMOD_RD,
 	"Target average block size");
 
 ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW,
 	"Disable compressed ARC buffers");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int,
 	param_get_uint, ZMOD_RW, "Min life of prefetch block in ms");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms,
     param_set_arc_int, param_get_uint, ZMOD_RW,
 	"Min life of prescient prefetched block in ms");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, U64, ZMOD_RW,
 	"Max write bytes per interval");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, U64, ZMOD_RW,
 	"Extra write bytes during device warmup");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, U64, ZMOD_RW,
 	"Number of max device writes to precache");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, U64, ZMOD_RW,
 	"Compressed l2arc_headroom multiplier");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, U64, ZMOD_RW,
 	"TRIM ahead L2ARC write size multiplier");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, U64, ZMOD_RW,
 	"Seconds between L2ARC writing");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, U64, ZMOD_RW,
 	"Min feed interval in milliseconds");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW,
 	"Skip caching prefetched buffers");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW,
 	"Turbo L2ARC warmup");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW,
 	"No reads during writes");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, UINT, ZMOD_RW,
 	"Percent of ARC size allowed for L2ARC-only headers");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW,
 	"Rebuild the L2ARC when importing a pool");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, U64, ZMOD_RW,
 	"Min size in bytes to write rebuild log blocks in L2ARC");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW,
 	"Cache only MFU data from ARC into L2ARC");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW,
 	"Exclude dbufs on special vdevs from being cached to L2ARC if set.");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int,
 	param_get_uint, ZMOD_RW, "System free memory I/O throttle in bytes");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_u64,
 	spl_param_get_u64, ZMOD_RW, "System free memory target size in bytes");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_u64,
 	spl_param_get_u64, ZMOD_RW, "Minimum bytes of dnodes in ARC");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent,
     param_set_arc_int, param_get_uint, ZMOD_RW,
 	"Percent of ARC meta buffers for dnodes");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, UINT, ZMOD_RW,
 	"Percentage of excess dnodes to try to unpin");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, UINT, ZMOD_RW,
 	"When full, ARC allocation waits for eviction of this % of alloc size");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,
 	"The number of headers to evict per sublist before moving to the next");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW,
 	"Number of arc_prune threads");
diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c
index e772caead29b..96e1601c4e9c 100644
--- a/module/zfs/bpobj.c
+++ b/module/zfs/bpobj.c
@@ -1,1030 +1,1031 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2017 Datto Inc.
  */
 
 #include <sys/bpobj.h>
 #include <sys/zfs_context.h>
 #include <sys/zfs_refcount.h>
 #include <sys/dsl_pool.h>
 #include <sys/zfeature.h>
 #include <sys/zap.h>
 
 /*
  * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
  */
 uint64_t
 bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_objset_spa(os);
 	dsl_pool_t *dp = dmu_objset_pool(os);
 
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
 		if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
 			ASSERT0(dp->dp_empty_bpobj);
 			dp->dp_empty_bpobj =
 			    bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
 			VERIFY(zap_add(os,
 			    DMU_POOL_DIRECTORY_OBJECT,
 			    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
 			    &dp->dp_empty_bpobj, tx) == 0);
 		}
 		spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
 		ASSERT(dp->dp_empty_bpobj != 0);
 		return (dp->dp_empty_bpobj);
 	} else {
 		return (bpobj_alloc(os, blocksize, tx));
 	}
 }
 
 void
 bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dmu_objset_pool(os);
 
 	spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
 	if (!spa_feature_is_active(dmu_objset_spa(os),
 	    SPA_FEATURE_EMPTY_BPOBJ)) {
 		VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_EMPTY_BPOBJ, tx));
 		VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
 		dp->dp_empty_bpobj = 0;
 	}
 }
 
 uint64_t
 bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
 {
 	int size;
 
 	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
 		size = BPOBJ_SIZE_V0;
 	else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
 		size = BPOBJ_SIZE_V1;
 	else if (!spa_feature_is_active(dmu_objset_spa(os),
 	    SPA_FEATURE_LIVELIST))
 		size = BPOBJ_SIZE_V2;
 	else
 		size = sizeof (bpobj_phys_t);
 
 	return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
 	    DMU_OT_BPOBJ_HDR, size, tx));
 }
 
 void
 bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
 {
 	int64_t i;
 	bpobj_t bpo;
 	dmu_object_info_t doi;
 	int epb;
 	dmu_buf_t *dbuf = NULL;
 
 	ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
 	VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
 
 	mutex_enter(&bpo.bpo_lock);
 
 	if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
 		goto out;
 
 	VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
 	epb = doi.doi_data_block_size / sizeof (uint64_t);
 
 	for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
 		uint64_t *objarray;
 		uint64_t offset, blkoff;
 
 		offset = i * sizeof (uint64_t);
 		blkoff = P2PHASE(i, epb);
 
 		if (dbuf == NULL || dbuf->db_offset > offset) {
 			if (dbuf)
 				dmu_buf_rele(dbuf, FTAG);
 			VERIFY3U(0, ==, dmu_buf_hold(os,
 			    bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
 		}
 
 		ASSERT3U(offset, >=, dbuf->db_offset);
 		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
 
 		objarray = dbuf->db_data;
 		bpobj_free(os, objarray[blkoff], tx);
 	}
 	if (dbuf) {
 		dmu_buf_rele(dbuf, FTAG);
 		dbuf = NULL;
 	}
 	VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
 
 out:
 	mutex_exit(&bpo.bpo_lock);
 	bpobj_close(&bpo);
 
 	VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
 }
 
 int
 bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
 {
 	dmu_object_info_t doi;
 	int err;
 
 	err = dmu_object_info(os, object, &doi);
 	if (err)
 		return (err);
 
 	memset(bpo, 0, sizeof (*bpo));
 	mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	ASSERT(bpo->bpo_dbuf == NULL);
 	ASSERT(bpo->bpo_phys == NULL);
 	ASSERT(object != 0);
 	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
 	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
 
 	err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
 	if (err)
 		return (err);
 
 	bpo->bpo_os = os;
 	bpo->bpo_object = object;
 	bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
 	bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
 	bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
 	bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2);
 	bpo->bpo_phys = bpo->bpo_dbuf->db_data;
 	return (0);
 }
 
 boolean_t
 bpobj_is_open(const bpobj_t *bpo)
 {
 	return (bpo->bpo_object != 0);
 }
 
 void
 bpobj_close(bpobj_t *bpo)
 {
 	/* Lame workaround for closing a bpobj that was never opened. */
 	if (bpo->bpo_object == 0)
 		return;
 
 	dmu_buf_rele(bpo->bpo_dbuf, bpo);
 	if (bpo->bpo_cached_dbuf != NULL)
 		dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
 	bpo->bpo_dbuf = NULL;
 	bpo->bpo_phys = NULL;
 	bpo->bpo_cached_dbuf = NULL;
 	bpo->bpo_object = 0;
 
 	mutex_destroy(&bpo->bpo_lock);
 }
 
 static boolean_t
 bpobj_is_empty_impl(bpobj_t *bpo)
 {
 	ASSERT(MUTEX_HELD(&bpo->bpo_lock));
 	return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
 	    (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
 }
 
 boolean_t
 bpobj_is_empty(bpobj_t *bpo)
 {
 	mutex_enter(&bpo->bpo_lock);
 	boolean_t is_empty = bpobj_is_empty_impl(bpo);
 	mutex_exit(&bpo->bpo_lock);
 	return (is_empty);
 }
 
 /*
  * A recursive iteration of the bpobjs would be nice here but we run the risk
  * of overflowing function stack space.  Instead, find each subobj and add it
  * to the head of our list so it can be scanned for subjobjs.  Like a
  * recursive implementation, the "deepest" subobjs will be freed first.
  * When a subobj is found to have no additional subojs, free it.
  */
 typedef struct bpobj_info {
 	bpobj_t *bpi_bpo;
 	/*
 	 * This object is a subobj of bpi_parent,
 	 * at bpi_index in its subobj array.
 	 */
 	struct bpobj_info *bpi_parent;
 	uint64_t bpi_index;
 	/* How many of our subobj's are left to process. */
 	uint64_t bpi_unprocessed_subobjs;
 	/* True after having visited this bpo's directly referenced BPs. */
 	boolean_t bpi_visited;
 	list_node_t bpi_node;
 } bpobj_info_t;
 
 static bpobj_info_t *
 bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
 {
 	bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP);
 	bpi->bpi_bpo = bpo;
 	bpi->bpi_parent = parent;
 	bpi->bpi_index = index;
 	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
 		bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs;
 	}
 	return (bpi);
 }
 
 /*
  * Update bpobj and all of its parents with new space accounting.
  */
 static void
 propagate_space_reduction(bpobj_info_t *bpi, int64_t freed,
     int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx)
 {
 
 	for (; bpi != NULL; bpi = bpi->bpi_parent) {
 		bpobj_t *p = bpi->bpi_bpo;
 		ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx));
 		p->bpo_phys->bpo_bytes -= freed;
 		ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0);
 		if (p->bpo_havecomp) {
 			p->bpo_phys->bpo_comp -= comp_freed;
 			p->bpo_phys->bpo_uncomp -= uncomp_freed;
 		}
 	}
 }
 
 static int
 bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
     int64_t start, dmu_tx_t *tx, boolean_t free)
 {
 	int err = 0;
 	int64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
 	dmu_buf_t *dbuf = NULL;
 	bpobj_t *bpo = bpi->bpi_bpo;
 
 	int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1;
 	uint64_t pe = P2ALIGN_TYPED(i, bpo->bpo_epb, uint64_t) *
 	    sizeof (blkptr_t);
 	uint64_t ps = start * sizeof (blkptr_t);
 	uint64_t pb = MAX((pe > dmu_prefetch_max) ? pe - dmu_prefetch_max : 0,
 	    ps);
 	if (pe > pb) {
 		dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, pb, pe - pb,
 		    ZIO_PRIORITY_ASYNC_READ);
 	}
 	for (; i >= start; i--) {
 		uint64_t offset = i * sizeof (blkptr_t);
 		uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);
 
 		if (dbuf == NULL || dbuf->db_offset > offset) {
 			if (dbuf)
 				dmu_buf_rele(dbuf, FTAG);
 			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
 			    offset, FTAG, &dbuf, DMU_READ_NO_PREFETCH);
 			if (err)
 				break;
 			pe = pb;
 			pb = MAX((dbuf->db_offset > dmu_prefetch_max) ?
 			    dbuf->db_offset - dmu_prefetch_max : 0, ps);
 			if (pe > pb) {
 				dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
 				    pb, pe - pb, ZIO_PRIORITY_ASYNC_READ);
 			}
 		}
 
 		ASSERT3U(offset, >=, dbuf->db_offset);
 		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
 
 		blkptr_t *bparray = dbuf->db_data;
 		blkptr_t *bp = &bparray[blkoff];
 
 		boolean_t bp_freed = BP_GET_FREE(bp);
 		err = func(arg, bp, bp_freed, tx);
 		if (err)
 			break;
 
 		if (free) {
 			int sign = bp_freed ? -1 : +1;
 			spa_t *spa = dmu_objset_spa(bpo->bpo_os);
 			freed += sign * bp_get_dsize_sync(spa, bp);
 			comp_freed += sign * BP_GET_PSIZE(bp);
 			uncomp_freed += sign * BP_GET_UCSIZE(bp);
 			ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
 			bpo->bpo_phys->bpo_num_blkptrs--;
 			ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
 			if (bp_freed) {
 				ASSERT(bpo->bpo_havefreed);
 				bpo->bpo_phys->bpo_num_freed--;
 				ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0);
 			}
 		}
 	}
 	if (free) {
 		propagate_space_reduction(bpi, freed, comp_freed,
 		    uncomp_freed, tx);
 		VERIFY0(dmu_free_range(bpo->bpo_os,
 		    bpo->bpo_object,
 		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
 		    DMU_OBJECT_END, tx));
 	}
 	if (dbuf) {
 		dmu_buf_rele(dbuf, FTAG);
 		dbuf = NULL;
 	}
 	return (err);
 }
 
 /*
  * Given an initial bpo, start by freeing the BPs that are directly referenced
  * by that bpo. If the bpo has subobjs, read in its last subobj and push the
  * subobj to our stack. By popping items off our stack, eventually we will
  * encounter a bpo that has no subobjs.  We can free its bpobj_info_t, and if
  * requested also free the now-empty bpo from disk and decrement
  * its parent's subobj count. We continue popping each subobj from our stack,
  * visiting its last subobj until they too have no more subobjs, and so on.
  */
 static int
 bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
     dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size)
 {
 	list_t stack;
 	bpobj_info_t *bpi;
 	int err = 0;
 
 	/*
 	 * Create a "stack" for us to work with without worrying about
 	 * stack overflows. Initialize it with the initial_bpo.
 	 */
 	list_create(&stack, sizeof (bpobj_info_t),
 	    offsetof(bpobj_info_t, bpi_node));
 	mutex_enter(&initial_bpo->bpo_lock);
 
 	if (bpobj_size != NULL)
 		*bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs;
 
 	list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));
 
 	while ((bpi = list_head(&stack)) != NULL) {
 		bpobj_t *bpo = bpi->bpi_bpo;
 
 		ASSERT3P(bpo, !=, NULL);
 		ASSERT(MUTEX_HELD(&bpo->bpo_lock));
 		ASSERT(bpobj_is_open(bpo));
 
 		if (free)
 			dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
 
 		if (bpi->bpi_visited == B_FALSE) {
 			err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx,
 			    free);
 			bpi->bpi_visited = B_TRUE;
 			if (err != 0)
 				break;
 		}
 		/*
 		 * We've finished with this bpo's directly-referenced BP's and
 		 * it has no more unprocessed subobjs. We can free its
 		 * bpobj_info_t (unless it is the topmost, initial_bpo).
 		 * If we are freeing from disk, we can also do that.
 		 */
 		if (bpi->bpi_unprocessed_subobjs == 0) {
 			/*
 			 * If there are no entries, there should
 			 * be no bytes.
 			 */
 			if (bpobj_is_empty_impl(bpo)) {
 				ASSERT0(bpo->bpo_phys->bpo_bytes);
 				ASSERT0(bpo->bpo_phys->bpo_comp);
 				ASSERT0(bpo->bpo_phys->bpo_uncomp);
 			}
 
 			/* The initial_bpo has no parent and is not closed. */
 			if (bpi->bpi_parent != NULL) {
 				if (free) {
 					bpobj_t *p = bpi->bpi_parent->bpi_bpo;
 
 					ASSERT0(bpo->bpo_phys->bpo_num_blkptrs);
 					ASSERT3U(p->bpo_phys->bpo_num_subobjs,
 					    >, 0);
 					ASSERT3U(bpi->bpi_index, ==,
 					    p->bpo_phys->bpo_num_subobjs - 1);
 					ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf,
 					    tx));
 
 					p->bpo_phys->bpo_num_subobjs--;
 
 					VERIFY0(dmu_free_range(p->bpo_os,
 					    p->bpo_phys->bpo_subobjs,
 					    bpi->bpi_index * sizeof (uint64_t),
 					    sizeof (uint64_t), tx));
 
 					/* eliminate the empty subobj list */
 					if (bpo->bpo_havesubobj &&
 					    bpo->bpo_phys->bpo_subobjs != 0) {
 						ASSERT0(bpo->bpo_phys->
 						    bpo_num_subobjs);
 						err = dmu_object_free(
 						    bpo->bpo_os,
 						    bpo->bpo_phys->bpo_subobjs,
 						    tx);
 						if (err)
 							break;
 						bpo->bpo_phys->bpo_subobjs = 0;
 					}
 					err = dmu_object_free(p->bpo_os,
 					    bpo->bpo_object, tx);
 					if (err)
 						break;
 				}
 
 				mutex_exit(&bpo->bpo_lock);
 				bpobj_close(bpo);
 				kmem_free(bpo, sizeof (bpobj_t));
 			} else {
 				mutex_exit(&bpo->bpo_lock);
 			}
 
 			/*
 			 * Finished processing this bpo. Unlock, and free
 			 * our "stack" info.
 			 */
 			list_remove_head(&stack);
 			kmem_free(bpi, sizeof (bpobj_info_t));
 		} else {
 			/*
 			 * We have unprocessed subobjs. Process the next one.
 			 */
 			ASSERT(bpo->bpo_havecomp);
 			ASSERT3P(bpobj_size, ==, NULL);
 
 			/* Add the last subobj to stack. */
 			int64_t i = bpi->bpi_unprocessed_subobjs - 1;
 			uint64_t offset = i * sizeof (uint64_t);
 
 			uint64_t subobj;
 			err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
 			    offset, sizeof (uint64_t), &subobj,
 			    DMU_READ_NO_PREFETCH);
 			if (err)
 				break;
 
 			bpobj_t *subbpo = kmem_alloc(sizeof (bpobj_t),
 			    KM_SLEEP);
 			err = bpobj_open(subbpo, bpo->bpo_os, subobj);
 			if (err) {
 				kmem_free(subbpo, sizeof (bpobj_t));
 				break;
 			}
 
 			if (subbpo->bpo_havesubobj &&
 			    subbpo->bpo_phys->bpo_subobjs != 0) {
 				dmu_prefetch(subbpo->bpo_os,
 				    subbpo->bpo_phys->bpo_subobjs, 0, 0, 0,
 				    ZIO_PRIORITY_ASYNC_READ);
 			}
 
 			list_insert_head(&stack, bpi_alloc(subbpo, bpi, i));
 			mutex_enter(&subbpo->bpo_lock);
 			bpi->bpi_unprocessed_subobjs--;
 		}
 	}
 	/*
 	 * Cleanup anything left on the "stack" after we left the loop.
 	 * Every bpo on the stack is locked so we must remember to undo
 	 * that now (in LIFO order).
 	 */
 	while ((bpi = list_remove_head(&stack)) != NULL) {
 		bpobj_t *bpo = bpi->bpi_bpo;
 		ASSERT(err != 0);
 		ASSERT3P(bpo, !=, NULL);
 
 		mutex_exit(&bpo->bpo_lock);
 
 		/* do not free the initial_bpo */
 		if (bpi->bpi_parent != NULL) {
 			bpobj_close(bpi->bpi_bpo);
 			kmem_free(bpi->bpi_bpo, sizeof (bpobj_t));
 		}
 		kmem_free(bpi, sizeof (bpobj_info_t));
 	}
 
 	list_destroy(&stack);
 
 	return (err);
 }
 
 /*
  * Iterate and remove the entries.  If func returns nonzero, iteration
  * will stop and that entry will not be removed.
  */
 int
 bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
 {
 	return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL));
 }
 
 /*
  * Iterate the entries.  If func returns nonzero, iteration will stop.
  *
  * If there are no subobjs:
  *
  * *bpobj_size can be used to return the number of block pointers in the
  * bpobj.  Note that this may be different from the number of block pointers
  * that are iterated over, if iteration is terminated early (e.g. by the func
  * returning nonzero).
  *
  * If there are concurrent (or subsequent) modifications to the bpobj then the
  * returned *bpobj_size can be passed as "start" to
  * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries.
  */
 int
 bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
     uint64_t *bpobj_size)
 {
 	return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size));
 }
 
 /*
  * Iterate over the blkptrs in the bpobj beginning at index start. If func
  * returns nonzero, iteration will stop. This is a livelist specific function
  * since it assumes that there are no subobjs present.
  */
 int
 livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
     int64_t start)
 {
 	if (bpo->bpo_havesubobj)
 		VERIFY0(bpo->bpo_phys->bpo_subobjs);
 	bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0);
 	int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE);
 	kmem_free(bpi, sizeof (bpobj_info_t));
 	return (err);
 }
 
 /*
  * Logically add subobj's contents to the parent bpobj.
  *
  * In the most general case, this is accomplished in constant time by adding
  * a reference to subobj.  This case is used when enqueuing a large subobj:
  * +--------------+                        +--------------+
  * | bpobj        |----------------------->| subobj list  |
  * +----+----+----+----+----+              +-----+-----+--+--+
  * | bp | bp | bp | bp | bp |              | obj | obj | obj |
  * +----+----+----+----+----+              +-----+-----+-----+
  *
  * +--------------+                        +--------------+
  * | sub-bpobj    |----------------------> | subsubobj    |
  * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
  * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
  * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
  *
  * Result: sub-bpobj added to parent's subobj list.
  * +--------------+                        +--------------+
  * | bpobj        |----------------------->| subobj list  |
  * +----+----+----+----+----+              +-----+-----+--+--+-----+
  * | bp | bp | bp | bp | bp |              | obj | obj | obj | OBJ |
  * +----+----+----+----+----+              +-----+-----+-----+--|--+
  *                                                              |
  *       /-----------------------------------------------------/
  *       v
  * +--------------+                        +--------------+
  * | sub-bpobj    |----------------------> | subsubobj    |
  * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
  * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
  * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
  *
  *
  * In a common case, the subobj is small: its bp's and its list of subobj's
  * are each stored in a single block.  In this case we copy the subobj's
  * contents to the parent:
  * +--------------+                        +--------------+
  * | bpobj        |----------------------->| subobj list  |
  * +----+----+----+----+----+              +-----+-----+--+--+
  * | bp | bp | bp | bp | bp |              | obj | obj | obj |
  * +----+----+----+----+----+              +-----+-----+-----+
  *                          ^                                ^
  * +--------------+         |              +--------------+  |
  * | sub-bpobj    |---------^------------> | subsubobj    |  ^
  * +----+----+----+         |              +-----+-----+--+  |
  * | BP | BP |-->-->-->-->-/               | OBJ | OBJ |-->-/
  * +----+----+                             +-----+-----+
  *
  * Result: subobj destroyed, contents copied to parent:
  * +--------------+                        +--------------+
  * | bpobj        |----------------------->| subobj list  |
  * +----+----+----+----+----+----+----+    +-----+-----+--+--+-----+-----+
  * | bp | bp | bp | bp | bp | BP | BP |    | obj | obj | obj | OBJ | OBJ |
  * +----+----+----+----+----+----+----+    +-----+-----+-----+-----+-----+
  *
  *
  * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's
  * but retain the sub-bpobj:
  * +--------------+                        +--------------+
  * | bpobj        |----------------------->| subobj list  |
  * +----+----+----+----+----+              +-----+-----+--+--+
  * | bp | bp | bp | bp | bp |              | obj | obj | obj |
  * +----+----+----+----+----+              +-----+-----+-----+
  *                                                           ^
  * +--------------+                        +--------------+  |
  * | sub-bpobj    |----------------------> | subsubobj    |  ^
  * +----+----+----+----+---------+----+    +-----+-----+--+  |
  * | bp | bp | bp | bp |   ...   | bp |    | OBJ | OBJ |-->-/
  * +----+----+----+----+---------+----+    +-----+-----+
  *
  * Result: sub-sub-bpobjs and subobj added to parent's subobj list.
  * +--------------+                     +--------------+
  * | bpobj        |-------------------->| subobj list  |
  * +----+----+----+----+----+           +-----+-----+--+--+-----+-----+------+
  * | bp | bp | bp | bp | bp |           | obj | obj | obj | OBJ | OBJ | OBJ* |
  * +----+----+----+----+----+           +-----+-----+-----+-----+-----+--|---+
  *                                                                       |
  *       /--------------------------------------------------------------/
  *       v
  * +--------------+
  * | sub-bpobj    |
  * +----+----+----+----+---------+----+
  * | bp | bp | bp | bp |   ...   | bp |
  * +----+----+----+----+---------+----+
  */
 void
 bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
 {
 	bpobj_t subbpo;
 	uint64_t used, comp, uncomp, subsubobjs;
 	boolean_t copy_subsub = B_TRUE;
 	boolean_t copy_bps = B_TRUE;
 
 	ASSERT(bpobj_is_open(bpo));
 	ASSERT(subobj != 0);
 	ASSERT(bpo->bpo_havesubobj);
 	ASSERT(bpo->bpo_havecomp);
 	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
 
 	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
 		bpobj_decr_empty(bpo->bpo_os, tx);
 		return;
 	}
 
 	VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
 	if (bpobj_is_empty(&subbpo)) {
 		/* No point in having an empty subobj. */
 		bpobj_close(&subbpo);
 		bpobj_free(bpo->bpo_os, subobj, tx);
 		return;
 	}
 	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
 
 	mutex_enter(&bpo->bpo_lock);
 	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
 
 	dmu_object_info_t doi;
 
 	if (bpo->bpo_phys->bpo_subobjs != 0) {
 		ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
 		    &doi));
 		ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
 	}
 
 	/*
 	 * If subobj has only one block of subobjs, then move subobj's
 	 * subobjs to bpo's subobj list directly.  This reduces recursion in
 	 * bpobj_iterate due to nested subobjs.
 	 */
 	subsubobjs = subbpo.bpo_phys->bpo_subobjs;
 	if (subsubobjs != 0) {
 		VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
 		if (doi.doi_max_offset > doi.doi_data_block_size) {
 			copy_subsub = B_FALSE;
 		}
 	}
 
 	/*
 	 * If, in addition to having only one block of subobj's, subobj has
 	 * only one block of bp's, then move subobj's bp's to bpo's bp list
 	 * directly. This reduces recursion in bpobj_iterate due to nested
 	 * subobjs.
 	 */
 	VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi));
 	if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) {
 		copy_bps = B_FALSE;
 	}
 
 	if (copy_subsub && subsubobjs != 0) {
 		dmu_buf_t *subdb;
 		uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
 
 		VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs,
 		    0, FTAG, &subdb, 0));
 		/*
 		 * Make sure that we are not asking dmu_write()
 		 * to write more data than we have in our buffer.
 		 */
 		VERIFY3U(subdb->db_size, >=,
 		    numsubsub * sizeof (subobj));
 		if (bpo->bpo_phys->bpo_subobjs == 0) {
 			bpo->bpo_phys->bpo_subobjs =
 			    dmu_object_alloc(bpo->bpo_os,
 			    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
 			    DMU_OT_NONE, 0, tx);
 		}
 		dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
 		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
 		    numsubsub * sizeof (subobj), subdb->db_data, tx);
 		dmu_buf_rele(subdb, FTAG);
 		bpo->bpo_phys->bpo_num_subobjs += numsubsub;
 
 		dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
 		subbpo.bpo_phys->bpo_subobjs = 0;
 		VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx));
 	}
 
 	if (copy_bps) {
 		dmu_buf_t *bps;
 		uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs;
 
 		ASSERT(copy_subsub);
 		VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj,
 		    0, FTAG, &bps, 0));
 
 		/*
 		 * Make sure that we are not asking dmu_write()
 		 * to write more data than we have in our buffer.
 		 */
 		VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t));
 		dmu_write(bpo->bpo_os, bpo->bpo_object,
 		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
 		    numbps * sizeof (blkptr_t),
 		    bps->db_data, tx);
 		dmu_buf_rele(bps, FTAG);
 		bpo->bpo_phys->bpo_num_blkptrs += numbps;
 
 		bpobj_close(&subbpo);
 		VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx));
 	} else {
 		bpobj_close(&subbpo);
 		if (bpo->bpo_phys->bpo_subobjs == 0) {
 			bpo->bpo_phys->bpo_subobjs =
 			    dmu_object_alloc(bpo->bpo_os,
 			    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
 			    DMU_OT_NONE, 0, tx);
 		}
 
 		dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
 		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
 		    sizeof (subobj), &subobj, tx);
 		bpo->bpo_phys->bpo_num_subobjs++;
 	}
 
 	bpo->bpo_phys->bpo_bytes += used;
 	bpo->bpo_phys->bpo_comp += comp;
 	bpo->bpo_phys->bpo_uncomp += uncomp;
 	mutex_exit(&bpo->bpo_lock);
 
 }
 
 /*
  * Prefetch metadata required for bpobj_enqueue_subobj().
  */
 void
 bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj)
 {
 	dmu_object_info_t doi;
 	bpobj_t subbpo;
 	uint64_t subsubobjs;
 	boolean_t copy_subsub = B_TRUE;
 	boolean_t copy_bps = B_TRUE;
 
 	ASSERT(bpobj_is_open(bpo));
 	ASSERT(subobj != 0);
 
 	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj)
 		return;
 
 	if (bpobj_open(&subbpo, bpo->bpo_os, subobj) != 0)
 		return;
 	if (bpobj_is_empty(&subbpo)) {
 		bpobj_close(&subbpo);
 		return;
 	}
 	subsubobjs = subbpo.bpo_phys->bpo_subobjs;
 	bpobj_close(&subbpo);
 
 	if (subsubobjs != 0) {
 		if (dmu_object_info(bpo->bpo_os, subsubobjs, &doi) != 0)
 			return;
 		if (doi.doi_max_offset > doi.doi_data_block_size)
 			copy_subsub = B_FALSE;
 	}
 
 	if (dmu_object_info(bpo->bpo_os, subobj, &doi) != 0)
 		return;
 	if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub)
 		copy_bps = B_FALSE;
 
 	if (copy_subsub && subsubobjs != 0) {
 		if (bpo->bpo_phys->bpo_subobjs) {
 			dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
 			    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
 			    ZIO_PRIORITY_ASYNC_READ);
 		}
 		dmu_prefetch(bpo->bpo_os, subsubobjs, 0, 0, 1,
 		    ZIO_PRIORITY_ASYNC_READ);
 	}
 
 	if (copy_bps) {
 		dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
 		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 1,
 		    ZIO_PRIORITY_ASYNC_READ);
 		dmu_prefetch(bpo->bpo_os, subobj, 0, 0, 1,
 		    ZIO_PRIORITY_ASYNC_READ);
 	} else if (bpo->bpo_phys->bpo_subobjs) {
 		dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
 		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
 		    ZIO_PRIORITY_ASYNC_READ);
 	}
 }
 
 void
 bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	blkptr_t stored_bp = *bp;
 	uint64_t offset;
 	int blkoff;
 	blkptr_t *bparray;
 
 	ASSERT(bpobj_is_open(bpo));
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
 
 	if (BP_IS_EMBEDDED(bp)) {
 		/*
 		 * The bpobj will compress better without the payload.
 		 *
 		 * Note that we store EMBEDDED bp's because they have an
 		 * uncompressed size, which must be accounted for.  An
 		 * alternative would be to add their size to bpo_uncomp
 		 * without storing the bp, but that would create additional
 		 * complications: bpo_uncomp would be inconsistent with the
 		 * set of BP's stored, and bpobj_iterate() wouldn't visit
 		 * all the space accounted for in the bpobj.
 		 */
 		memset(&stored_bp, 0, sizeof (stored_bp));
 		stored_bp.blk_prop = bp->blk_prop;
-		stored_bp.blk_birth = bp->blk_birth;
+		BP_SET_LOGICAL_BIRTH(&stored_bp, BP_GET_LOGICAL_BIRTH(bp));
 	} else if (!BP_GET_DEDUP(bp)) {
 		/* The bpobj will compress better without the checksum */
 		memset(&stored_bp.blk_cksum, 0, sizeof (stored_bp.blk_cksum));
 	}
 
 	stored_bp.blk_fill = 0;
 	BP_SET_FREE(&stored_bp, bp_freed);
 
 	mutex_enter(&bpo->bpo_lock);
 
 	offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
 	blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
 
 	if (bpo->bpo_cached_dbuf == NULL ||
 	    offset < bpo->bpo_cached_dbuf->db_offset ||
 	    offset >= bpo->bpo_cached_dbuf->db_offset +
 	    bpo->bpo_cached_dbuf->db_size) {
 		if (bpo->bpo_cached_dbuf)
 			dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
 		VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
 		    offset, bpo, &bpo->bpo_cached_dbuf, 0));
 		ASSERT3P(bpo->bpo_cached_dbuf, !=, NULL);
 	}
 
 	dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
 	bparray = bpo->bpo_cached_dbuf->db_data;
 	bparray[blkoff] = stored_bp;
 
 	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
 	bpo->bpo_phys->bpo_num_blkptrs++;
 	int sign = bp_freed ? -1 : +1;
 	bpo->bpo_phys->bpo_bytes += sign *
 	    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
 	if (bpo->bpo_havecomp) {
 		bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp);
 		bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp);
 	}
 	if (bp_freed) {
 		ASSERT(bpo->bpo_havefreed);
 		bpo->bpo_phys->bpo_num_freed++;
 	}
 	mutex_exit(&bpo->bpo_lock);
 }
 
 struct space_range_arg {
 	spa_t *spa;
 	uint64_t mintxg;
 	uint64_t maxtxg;
 	uint64_t used;
 	uint64_t comp;
 	uint64_t uncomp;
 };
 
 static int
 space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
 {
 	(void) bp_freed, (void) tx;
 	struct space_range_arg *sra = arg;
 
-	if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
+	if (BP_GET_LOGICAL_BIRTH(bp) > sra->mintxg &&
+	    BP_GET_LOGICAL_BIRTH(bp) <= sra->maxtxg) {
 		if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
 			sra->used += bp_get_dsize_sync(sra->spa, bp);
 		else
 			sra->used += bp_get_dsize(sra->spa, bp);
 		sra->comp += BP_GET_PSIZE(bp);
 		sra->uncomp += BP_GET_UCSIZE(bp);
 	}
 	return (0);
 }
 
 int
 bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
 	ASSERT(bpobj_is_open(bpo));
 	mutex_enter(&bpo->bpo_lock);
 
 	*usedp = bpo->bpo_phys->bpo_bytes;
 	if (bpo->bpo_havecomp) {
 		*compp = bpo->bpo_phys->bpo_comp;
 		*uncompp = bpo->bpo_phys->bpo_uncomp;
 		mutex_exit(&bpo->bpo_lock);
 		return (0);
 	} else {
 		mutex_exit(&bpo->bpo_lock);
 		return (bpobj_space_range(bpo, 0, UINT64_MAX,
 		    usedp, compp, uncompp));
 	}
 }
 
 /*
  * Return the amount of space in the bpobj which is:
- * mintxg < blk_birth <= maxtxg
+ * mintxg < logical birth <= maxtxg
  */
 int
 bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
 	struct space_range_arg sra = { 0 };
 	int err;
 
 	ASSERT(bpobj_is_open(bpo));
 
 	/*
 	 * As an optimization, if they want the whole txg range, just
 	 * get bpo_bytes rather than iterating over the bps.
 	 */
 	if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
 		return (bpobj_space(bpo, usedp, compp, uncompp));
 
 	sra.spa = dmu_objset_spa(bpo->bpo_os);
 	sra.mintxg = mintxg;
 	sra.maxtxg = maxtxg;
 
 	err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
 	*usedp = sra.used;
 	*compp = sra.comp;
 	*uncompp = sra.uncomp;
 	return (err);
 }
 
 /*
  * A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a
  * bpobj are designated as free or allocated that information is not preserved
  * in bplists.
  */
 int
 bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	(void) bp_freed, (void) tx;
 	bplist_t *bpl = arg;
 	bplist_append(bpl, bp);
 	return (0);
 }
diff --git a/module/zfs/brt.c b/module/zfs/brt.c
index 416caeb11c7e..0b5a09df3724 100644
--- a/module/zfs/brt.c
+++ b/module/zfs/brt.c
@@ -1,1688 +1,1687 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/brt.h>
 #include <sys/brt_impl.h>
 #include <sys/ddt.h>
 #include <sys/bitmap.h>
 #include <sys/zap.h>
 #include <sys/dmu_tx.h>
 #include <sys/arc.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_scan.h>
 #include <sys/vdev_impl.h>
 #include <sys/kstat.h>
 #include <sys/wmsum.h>
 
 /*
  * Block Cloning design.
  *
  * Block Cloning allows to manually clone a file (or a subset of its blocks)
  * into another (or the same) file by just creating additional references to
  * the data blocks without copying the data itself. Those references are kept
  * in the Block Reference Tables (BRTs).
  *
  * In many ways this is similar to the existing deduplication, but there are
  * some important differences:
  *
  * - Deduplication is automatic and Block Cloning is not - one has to use a
  *   dedicated system call(s) to clone the given file/blocks.
  * - Deduplication keeps all data blocks in its table, even those referenced
  *   just once. Block Cloning creates an entry in its tables only when there
  *   are at least two references to the given data block. If the block was
  *   never explicitly cloned or the second to last reference was dropped,
  *   there will be neither space nor performance overhead.
  * - Deduplication needs data to work - one needs to pass real data to the
  *   write(2) syscall, so hash can be calculated. Block Cloning doesn't require
  *   data, just block pointers to the data, so it is extremely fast, as we pay
  *   neither the cost of reading the data, nor the cost of writing the data -
  *   we operate exclusively on metadata.
  * - If the D (dedup) bit is not set in the block pointer, it means that
  *   the block is not in the dedup table (DDT) and we won't consult the DDT
  *   when we need to free the block. Block Cloning must be consulted on every
  *   free, because we cannot modify the source BP (eg. by setting something
  *   similar to the D bit), thus we have no hint if the block is in the
  *   Block Reference Table (BRT), so we need to look into the BRT. There is
  *   an optimization in place that allows us to eliminate the majority of BRT
  *   lookups which is described below in the "Minimizing free penalty" section.
  * - The BRT entry is much smaller than the DDT entry - for BRT we only store
  *   64bit offset and 64bit reference counter.
  * - Dedup keys are cryptographic hashes, so two blocks that are close to each
  *   other on disk are most likely in totally different parts of the DDT.
  *   The BRT entry keys are offsets into a single top-level VDEV, so data blocks
  *   from one file should have BRT entries close to each other.
  * - Scrub will only do a single pass over a block that is referenced multiple
  *   times in the DDT. Unfortunately it is not currently (if at all) possible
  *   with Block Cloning and block referenced multiple times will be scrubbed
  *   multiple times. The new, sorted scrub should be able to eliminate
  *   duplicated reads given enough memory.
  * - Deduplication requires cryptographically strong hash as a checksum or
  *   additional data verification. Block Cloning works with any checksum
  *   algorithm or even with checksumming disabled.
  *
  * As mentioned above, the BRT entries are much smaller than the DDT entries.
  * To uniquely identify a block we just need its vdev id and offset. We also
  * need to maintain a reference counter. The vdev id will often repeat, as there
  * is a small number of top-level VDEVs and a large number of blocks stored in
  * each VDEV. We take advantage of that to reduce the BRT entry size further by
  * maintaining one BRT for each top-level VDEV, so we can then have only offset
  * and counter as the BRT entry.
  *
  * Minimizing free penalty.
  *
  * Block Cloning allows creating additional references to any existing block.
  * When we free a block there is no hint in the block pointer whether the block
  * was cloned or not, so on each free we have to check if there is a
  * corresponding entry in the BRT or not. If there is, we need to decrease
  * the reference counter. Doing BRT lookup on every free can potentially be
  * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
  * This is the main problem with deduplication, so we've learned our lesson and
  * try not to repeat the same mistake here. How do we do that? We divide each
  * top-level VDEV into 16MB regions. For each region we maintain a counter that
  * is a sum of all the BRT entries that have offsets within the region. This
  * creates the entries count array of 16bit numbers for each top-level VDEV.
  * The entries count array is always kept in memory and updated on disk in the
  * same transaction group as the BRT updates to keep everything in-sync. We can
  * keep the array in memory, because it is very small. With 16MB regions and
  * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
  * the region size even further in the future). Now, when we want to free
  * a block, we first consult the array. If the counter for the whole region is
  * zero, there is no need to look for the BRT entry, as there isn't one for
  * sure. If the counter for the region is greater than zero, only then we will
  * do a BRT lookup and if an entry is found we will decrease the reference
  * counter in the BRT entry and in the entry counters array.
  *
  * The entry counters array is small, but can potentially be larger for very
  * large VDEVs or smaller regions. In this case we don't want to rewrite entire
  * array on every change. We then divide the array into 32kB block and keep
  * a bitmap of dirty blocks within a transaction group. When we sync the
  * transaction group we can only update the parts of the entry counters array
  * that were modified. Note: Keeping track of the dirty parts of the entry
  * counters array is implemented, but updating only parts of the array on disk
  * is not yet implemented - for now we will update entire array if there was
  * any change.
  *
  * The implementation tries to be economic: if BRT is not used, or no longer
  * used, there will be no entries in the MOS and no additional memory used (eg.
  * the entry counters array is only allocated if needed).
  *
  * Interaction between Deduplication and Block Cloning.
  *
  * If both functionalities are in use, we could end up with a block that is
  * referenced multiple times in both DDT and BRT. When we free one of the
  * references we couldn't tell where it belongs, so we would have to decide
  * what table takes the precedence: do we first clear DDT references or BRT
  * references? To avoid this dilemma BRT cooperates with DDT - if a given block
  * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
  * lookup DDT entry instead and increase the counter there. No BRT entry
  * will be created for a block which has the D (dedup) bit set.
  * BRT may be more efficient for manual deduplication, but if the block is
  * already in the DDT, then creating additional BRT entry would be less
  * efficient. This clever idea was proposed by Allan Jude.
  *
  * Block Cloning across datasets.
  *
  * Block Cloning is not limited to cloning blocks within the same dataset.
  * It is possible (and very useful) to clone blocks between different datasets.
  * One use case is recovering files from snapshots. By cloning the files into
  * dataset we need no additional storage. Without Block Cloning we would need
  * additional space for those files.
  * Another interesting use case is moving the files between datasets
  * (copying the file content to the new dataset and removing the source file).
  * In that case Block Cloning will only be used briefly, because the BRT entries
  * will be removed when the source is removed.
  * Block Cloning across encrypted datasets is supported as long as both
  * datasets share the same master key (e.g. snapshots and clones)
  *
  * Block Cloning flow through ZFS layers.
  *
  * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
  * blocks. As of this writing no interface is implemented that allows for block
  * cloning within a ZVOL.
  * FreeBSD and Linux provides copy_file_range(2) system call and we will use it
  * for blocking cloning.
  *
  *	ssize_t
  *	copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
  *	                size_t len, unsigned int flags);
  *
  * Even though offsets and length represent bytes, they have to be
  * block-aligned or we will return an error so the upper layer can
  * fallback to the generic mechanism that will just copy the data.
  * Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
  * This function was implemented based on zfs_write(), but instead of writing
  * the given data we first read block pointers using the new dmu_read_l0_bps()
  * function from the source file. Once we have BPs from the source file we call
  * the dmu_brt_clone() function on the destination file. This function
  * allocates BPs for us. We iterate over all source BPs. If the given BP is
  * a hole or an embedded block, we just copy BP as-is. If it points to a real
  * data we place this BP on a BRT pending list using the brt_pending_add()
  * function.
  *
  * We use this pending list to keep track of all BPs that got new references
  * within this transaction group.
  *
  * Some special cases to consider and how we address them:
  * - The block we want to clone may have been created within the same
  *   transaction group that we are trying to clone. Such block has no BP
  *   allocated yet, so cannot be immediately cloned. We return EAGAIN.
  * - The block we want to clone may have been modified within the same
  *   transaction group. We return EAGAIN.
  * - A block may be cloned multiple times during one transaction group (that's
  *   why pending list is actually a tree and not an append-only list - this
  *   way we can figure out faster if this block is cloned for the first time
  *   in this txg or consecutive time).
  * - A block may be cloned and freed within the same transaction group
  *   (see dbuf_undirty()).
  * - A block may be cloned and within the same transaction group the clone
  *   can be cloned again (see dmu_read_l0_bps()).
  * - A file might have been deleted, but the caller still has a file descriptor
  *   open to this file and clones it.
  *
  * When we free a block we have an additional step in the ZIO pipeline where we
  * call the zio_brt_free() function. We then call the brt_entry_decref()
  * that loads the corresponding BRT entry (if one exists) and decreases
  * reference counter. If this is not the last reference we will stop ZIO
  * pipeline here. If this is the last reference or the block is not in the
  * BRT, we continue the pipeline and free the block as usual.
  *
  * At the beginning of spa_sync() where there can be no more block cloning,
  * but before issuing frees we call brt_pending_apply(). This function applies
  * all the new clones to the BRT table - we load BRT entries and update
  * reference counters. To sync new BRT entries to disk, we use brt_sync()
  * function. This function will sync all dirty per-top-level-vdev BRTs,
  * the entry counters arrays, etc.
  *
  * Block Cloning and ZIL.
  *
  * Every clone operation is divided into chunks (similar to write) and each
  * chunk is cloned in a separate transaction. The chunk size is determined by
  * how many BPs we can fit into a single ZIL entry.
  * Replaying clone operation is different from the regular clone operation,
  * as when we log clone operations we cannot use the source object - it may
  * reside on a different dataset, so we log BPs we want to clone.
  * The ZIL is replayed when we mount the given dataset, not when the pool is
  * imported. Taking this into account it is possible that the pool is imported
  * without mounting datasets and the source dataset is destroyed before the
  * destination dataset is mounted and its ZIL replayed.
  * To address this situation we leverage zil_claim() mechanism where ZFS will
  * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
  * entries, we will bump reference counters for their BPs in the BRT.  Then
  * on mount and ZIL replay we bump the reference counters once more, while the
  * first references are dropped during ZIL destroy by zil_free_clone_range().
  * It is possible that after zil_claim() we never mount the destination, so
  * we never replay its ZIL and just destroy it.  In this case the only taken
  * references will be dropped by zil_free_clone_range(), since the cloning is
  * not going to ever take place.
  */
 
 static kmem_cache_t *brt_entry_cache;
 static kmem_cache_t *brt_pending_entry_cache;
 
 /*
  * Enable/disable prefetching of BRT entries that we are going to modify.
  */
 int zfs_brt_prefetch = 1;
 
 #ifdef ZFS_DEBUG
 #define	BRT_DEBUG(...)	do {						\
 	if ((zfs_flags & ZFS_DEBUG_BRT) != 0) {				\
 		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
 	}								\
 } while (0)
 #else
 #define	BRT_DEBUG(...)	do { } while (0)
 #endif
 
 int brt_zap_leaf_blockshift = 12;
 int brt_zap_indirect_blockshift = 12;
 
 static kstat_t	*brt_ksp;
 
 typedef struct brt_stats {
 	kstat_named_t brt_addref_entry_in_memory;
 	kstat_named_t brt_addref_entry_not_on_disk;
 	kstat_named_t brt_addref_entry_on_disk;
 	kstat_named_t brt_addref_entry_read_lost_race;
 	kstat_named_t brt_decref_entry_in_memory;
 	kstat_named_t brt_decref_entry_loaded_from_disk;
 	kstat_named_t brt_decref_entry_not_in_memory;
 	kstat_named_t brt_decref_entry_not_on_disk;
 	kstat_named_t brt_decref_entry_read_lost_race;
 	kstat_named_t brt_decref_entry_still_referenced;
 	kstat_named_t brt_decref_free_data_later;
 	kstat_named_t brt_decref_free_data_now;
 	kstat_named_t brt_decref_no_entry;
 } brt_stats_t;
 
 static brt_stats_t brt_stats = {
 	{ "addref_entry_in_memory",		KSTAT_DATA_UINT64 },
 	{ "addref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
 	{ "addref_entry_on_disk",		KSTAT_DATA_UINT64 },
 	{ "addref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
 	{ "decref_entry_in_memory",		KSTAT_DATA_UINT64 },
 	{ "decref_entry_loaded_from_disk",	KSTAT_DATA_UINT64 },
 	{ "decref_entry_not_in_memory",		KSTAT_DATA_UINT64 },
 	{ "decref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
 	{ "decref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
 	{ "decref_entry_still_referenced",	KSTAT_DATA_UINT64 },
 	{ "decref_free_data_later",		KSTAT_DATA_UINT64 },
 	{ "decref_free_data_now",		KSTAT_DATA_UINT64 },
 	{ "decref_no_entry",			KSTAT_DATA_UINT64 }
 };
 
 struct {
 	wmsum_t brt_addref_entry_in_memory;
 	wmsum_t brt_addref_entry_not_on_disk;
 	wmsum_t brt_addref_entry_on_disk;
 	wmsum_t brt_addref_entry_read_lost_race;
 	wmsum_t brt_decref_entry_in_memory;
 	wmsum_t brt_decref_entry_loaded_from_disk;
 	wmsum_t brt_decref_entry_not_in_memory;
 	wmsum_t brt_decref_entry_not_on_disk;
 	wmsum_t brt_decref_entry_read_lost_race;
 	wmsum_t brt_decref_entry_still_referenced;
 	wmsum_t brt_decref_free_data_later;
 	wmsum_t brt_decref_free_data_now;
 	wmsum_t brt_decref_no_entry;
 } brt_sums;
 
 #define	BRTSTAT_BUMP(stat)	wmsum_add(&brt_sums.stat, 1)
 
 static int brt_entry_compare(const void *x1, const void *x2);
 static int brt_pending_entry_compare(const void *x1, const void *x2);
 
 static void
 brt_rlock(brt_t *brt)
 {
 	rw_enter(&brt->brt_lock, RW_READER);
 }
 
 static void
 brt_wlock(brt_t *brt)
 {
 	rw_enter(&brt->brt_lock, RW_WRITER);
 }
 
 static void
 brt_unlock(brt_t *brt)
 {
 	rw_exit(&brt->brt_lock);
 }
 
 static uint16_t
 brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
 {
 
 	ASSERT3U(idx, <, brtvd->bv_size);
 
 	if (unlikely(brtvd->bv_need_byteswap)) {
 		return (BSWAP_16(brtvd->bv_entcount[idx]));
 	} else {
 		return (brtvd->bv_entcount[idx]);
 	}
 }
 
 static void
 brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
 {
 
 	ASSERT3U(idx, <, brtvd->bv_size);
 
 	if (unlikely(brtvd->bv_need_byteswap)) {
 		brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
 	} else {
 		brtvd->bv_entcount[idx] = entcnt;
 	}
 }
 
 static void
 brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
 {
 	uint16_t entcnt;
 
 	ASSERT3U(idx, <, brtvd->bv_size);
 
 	entcnt = brt_vdev_entcount_get(brtvd, idx);
 	ASSERT(entcnt < UINT16_MAX);
 
 	brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
 }
 
 static void
 brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
 {
 	uint16_t entcnt;
 
 	ASSERT3U(idx, <, brtvd->bv_size);
 
 	entcnt = brt_vdev_entcount_get(brtvd, idx);
 	ASSERT(entcnt > 0);
 
 	brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
 }
 
 #ifdef ZFS_DEBUG
 static void
 brt_vdev_dump(brt_vdev_t *brtvd)
 {
 	uint64_t idx;
 
 	zfs_dbgmsg("  BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "
 	    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
 	    (u_longlong_t)brtvd->bv_vdevid,
 	    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
 	    (u_longlong_t)brtvd->bv_size,
 	    (u_longlong_t)brtvd->bv_totalcount,
 	    (u_longlong_t)brtvd->bv_nblocks,
 	    (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
 	if (brtvd->bv_totalcount > 0) {
 		zfs_dbgmsg("    entcounts:");
 		for (idx = 0; idx < brtvd->bv_size; idx++) {
 			uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx);
 			if (entcnt > 0) {
 				zfs_dbgmsg("      [%04llu] %hu",
 				    (u_longlong_t)idx, entcnt);
 			}
 		}
 	}
 	if (brtvd->bv_entcount_dirty) {
 		char *bitmap;
 
 		bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
 		for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
 			bitmap[idx] =
 			    BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
 		}
 		bitmap[idx] = '\0';
 		zfs_dbgmsg("    dirty: %s", bitmap);
 		kmem_free(bitmap, brtvd->bv_nblocks + 1);
 	}
 }
 #endif
 
 static brt_vdev_t *
 brt_vdev(brt_t *brt, uint64_t vdevid)
 {
 	brt_vdev_t *brtvd;
 
 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
 
 	if (vdevid < brt->brt_nvdevs) {
 		brtvd = &brt->brt_vdevs[vdevid];
 	} else {
 		brtvd = NULL;
 	}
 
 	return (brtvd);
 }
 
 static void
 brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
 {
 	char name[64];
 
 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
 	ASSERT0(brtvd->bv_mos_brtvdev);
 	ASSERT0(brtvd->bv_mos_entries);
 	ASSERT(brtvd->bv_entcount != NULL);
 	ASSERT(brtvd->bv_size > 0);
 	ASSERT(brtvd->bv_bitmap != NULL);
 	ASSERT(brtvd->bv_nblocks > 0);
 
 	brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
 	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
 	    brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE,
 	    0, tx);
 	VERIFY(brtvd->bv_mos_entries != 0);
 	BRT_DEBUG("MOS entries created, object=%llu",
 	    (u_longlong_t)brtvd->bv_mos_entries);
 
 	/*
 	 * We allocate DMU buffer to store the bv_entcount[] array.
 	 * We will keep array size (bv_size) and cummulative count for all
 	 * bv_entcount[]s (bv_totalcount) in the bonus buffer.
 	 */
 	brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos,
 	    DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
 	    DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
 	VERIFY(brtvd->bv_mos_brtvdev != 0);
 	BRT_DEBUG("MOS BRT VDEV created, object=%llu",
 	    (u_longlong_t)brtvd->bv_mos_brtvdev);
 
 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
 	    (u_longlong_t)brtvd->bv_vdevid);
 	VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
 	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
 	BRT_DEBUG("Pool directory object created, object=%s", name);
 
 	spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
 }
 
 static void
 brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
 {
 	vdev_t *vd;
 	uint16_t *entcount;
 	ulong_t *bitmap;
 	uint64_t nblocks, size;
 
 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
 
 	spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER);
 	vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid);
 	size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1;
 	spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG);
 
 	entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
 	nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
 	bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
 
 	if (!brtvd->bv_initiated) {
 		ASSERT0(brtvd->bv_size);
 		ASSERT(brtvd->bv_entcount == NULL);
 		ASSERT(brtvd->bv_bitmap == NULL);
 		ASSERT0(brtvd->bv_nblocks);
 
 		avl_create(&brtvd->bv_tree, brt_entry_compare,
 		    sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
 	} else {
 		ASSERT(brtvd->bv_size > 0);
 		ASSERT(brtvd->bv_entcount != NULL);
 		ASSERT(brtvd->bv_bitmap != NULL);
 		ASSERT(brtvd->bv_nblocks > 0);
 		/*
 		 * TODO: Allow vdev shrinking. We only need to implement
 		 * shrinking the on-disk BRT VDEV object.
 		 * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset,
 		 *     size, tx);
 		 */
 		ASSERT3U(brtvd->bv_size, <=, size);
 
 		memcpy(entcount, brtvd->bv_entcount,
 		    sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
 		memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
 		    BT_SIZEOFMAP(brtvd->bv_nblocks)));
 		vmem_free(brtvd->bv_entcount,
 		    sizeof (entcount[0]) * brtvd->bv_size);
 		kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
 	}
 
 	brtvd->bv_size = size;
 	brtvd->bv_entcount = entcount;
 	brtvd->bv_bitmap = bitmap;
 	brtvd->bv_nblocks = nblocks;
 	if (!brtvd->bv_initiated) {
 		brtvd->bv_need_byteswap = FALSE;
 		brtvd->bv_initiated = TRUE;
 		BRT_DEBUG("BRT VDEV %llu initiated.",
 		    (u_longlong_t)brtvd->bv_vdevid);
 	}
 }
 
 static void
 brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
 {
 	char name[64];
 	dmu_buf_t *db;
 	brt_vdev_phys_t *bvphys;
 	int error;
 
 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
 	    (u_longlong_t)brtvd->bv_vdevid);
 	error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
 	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev);
 	if (error != 0)
 		return;
 	ASSERT(brtvd->bv_mos_brtvdev != 0);
 
 	error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db);
 	ASSERT0(error);
 	if (error != 0)
 		return;
 
 	bvphys = db->db_data;
 	if (brt->brt_rangesize == 0) {
 		brt->brt_rangesize = bvphys->bvp_rangesize;
 	} else {
 		ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize);
 	}
 
 	ASSERT(!brtvd->bv_initiated);
 	brt_vdev_realloc(brt, brtvd);
 
 	/* TODO: We don't support VDEV shrinking. */
 	ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
 
 	/*
 	 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
 	 */
 	error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
 	    MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
 	    brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
 	ASSERT0(error);
 
 	brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
 	ASSERT(brtvd->bv_mos_entries != 0);
 	brtvd->bv_need_byteswap =
 	    (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
 	brtvd->bv_totalcount = bvphys->bvp_totalcount;
 	brtvd->bv_usedspace = bvphys->bvp_usedspace;
 	brtvd->bv_savedspace = bvphys->bvp_savedspace;
 	brt->brt_usedspace += brtvd->bv_usedspace;
 	brt->brt_savedspace += brtvd->bv_savedspace;
 
 	dmu_buf_rele(db, FTAG);
 
 	BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu",
 	    name, (u_longlong_t)brtvd->bv_mos_brtvdev,
 	    (u_longlong_t)brtvd->bv_mos_entries);
 }
 
 static void
 brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd)
 {
 
 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
 	ASSERT(brtvd->bv_initiated);
 
 	vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
 	brtvd->bv_entcount = NULL;
 	kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
 	brtvd->bv_bitmap = NULL;
 	ASSERT0(avl_numnodes(&brtvd->bv_tree));
 	avl_destroy(&brtvd->bv_tree);
 
 	brtvd->bv_size = 0;
 	brtvd->bv_nblocks = 0;
 
 	brtvd->bv_initiated = FALSE;
 	BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
 }
 
 static void
 brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
 {
 	char name[64];
 	uint64_t count;
 	dmu_buf_t *db;
 	brt_vdev_phys_t *bvphys;
 
 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
 	ASSERT(brtvd->bv_mos_brtvdev != 0);
 	ASSERT(brtvd->bv_mos_entries != 0);
 
 	VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count));
 	VERIFY0(count);
 	VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx));
 	BRT_DEBUG("MOS entries destroyed, object=%llu",
 	    (u_longlong_t)brtvd->bv_mos_entries);
 	brtvd->bv_mos_entries = 0;
 
 	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
 	bvphys = db->db_data;
 	ASSERT0(bvphys->bvp_totalcount);
 	ASSERT0(bvphys->bvp_usedspace);
 	ASSERT0(bvphys->bvp_savedspace);
 	dmu_buf_rele(db, FTAG);
 
 	VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx));
 	BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
 	    (u_longlong_t)brtvd->bv_mos_brtvdev);
 	brtvd->bv_mos_brtvdev = 0;
 
 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
 	    (u_longlong_t)brtvd->bv_vdevid);
 	VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx));
 	BRT_DEBUG("Pool directory object removed, object=%s", name);
 
 	brt_vdev_dealloc(brt, brtvd);
 
 	spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
 }
 
 static void
 brt_vdevs_expand(brt_t *brt, uint64_t nvdevs)
 {
 	brt_vdev_t *brtvd, *vdevs;
 	uint64_t vdevid;
 
 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
 	ASSERT3U(nvdevs, >, brt->brt_nvdevs);
 
 	vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP);
 	if (brt->brt_nvdevs > 0) {
 		ASSERT(brt->brt_vdevs != NULL);
 
 		memcpy(vdevs, brt->brt_vdevs,
 		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
 		kmem_free(brt->brt_vdevs,
 		    sizeof (brt_vdev_t) * brt->brt_nvdevs);
 	}
 	for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) {
 		brtvd = &vdevs[vdevid];
 
 		brtvd->bv_vdevid = vdevid;
 		brtvd->bv_initiated = FALSE;
 	}
 
 	BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
 	    (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs);
 
 	brt->brt_vdevs = vdevs;
 	brt->brt_nvdevs = nvdevs;
 }
 
 static boolean_t
 brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre)
 {
 	uint64_t idx;
 
 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
 
 	idx = bre->bre_offset / brt->brt_rangesize;
 	if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) {
 		/* VDEV wasn't expanded. */
 		return (brt_vdev_entcount_get(brtvd, idx) > 0);
 	}
 
 	return (FALSE);
 }
 
 static void
 brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
     uint64_t dsize)
 {
 	uint64_t idx;
 
 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
 	ASSERT(brtvd != NULL);
 	ASSERT(brtvd->bv_entcount != NULL);
 
 	brt->brt_savedspace += dsize;
 	brtvd->bv_savedspace += dsize;
 	brtvd->bv_meta_dirty = TRUE;
 
 	if (bre->bre_refcount > 1) {
 		return;
 	}
 
 	brt->brt_usedspace += dsize;
 	brtvd->bv_usedspace += dsize;
 
 	idx = bre->bre_offset / brt->brt_rangesize;
 	if (idx >= brtvd->bv_size) {
 		/* VDEV has been expanded. */
 		brt_vdev_realloc(brt, brtvd);
 	}
 
 	ASSERT3U(idx, <, brtvd->bv_size);
 
 	brtvd->bv_totalcount++;
 	brt_vdev_entcount_inc(brtvd, idx);
 	brtvd->bv_entcount_dirty = TRUE;
 	idx = idx / BRT_BLOCKSIZE / 8;
 	BT_SET(brtvd->bv_bitmap, idx);
 
 #ifdef ZFS_DEBUG
 	if (zfs_flags & ZFS_DEBUG_BRT)
 		brt_vdev_dump(brtvd);
 #endif
 }
 
 static void
 brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
     uint64_t dsize)
 {
 	uint64_t idx;
 
 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
 	ASSERT(brtvd != NULL);
 	ASSERT(brtvd->bv_entcount != NULL);
 
 	brt->brt_savedspace -= dsize;
 	brtvd->bv_savedspace -= dsize;
 	brtvd->bv_meta_dirty = TRUE;
 
 	if (bre->bre_refcount > 0) {
 		return;
 	}
 
 	brt->brt_usedspace -= dsize;
 	brtvd->bv_usedspace -= dsize;
 
 	idx = bre->bre_offset / brt->brt_rangesize;
 	ASSERT3U(idx, <, brtvd->bv_size);
 
 	ASSERT(brtvd->bv_totalcount > 0);
 	brtvd->bv_totalcount--;
 	brt_vdev_entcount_dec(brtvd, idx);
 	brtvd->bv_entcount_dirty = TRUE;
 	idx = idx / BRT_BLOCKSIZE / 8;
 	BT_SET(brtvd->bv_bitmap, idx);
 
 #ifdef ZFS_DEBUG
 	if (zfs_flags & ZFS_DEBUG_BRT)
 		brt_vdev_dump(brtvd);
 #endif
 }
 
 static void
 brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
 {
 	dmu_buf_t *db;
 	brt_vdev_phys_t *bvphys;
 
 	ASSERT(brtvd->bv_meta_dirty);
 	ASSERT(brtvd->bv_mos_brtvdev != 0);
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
 
 	if (brtvd->bv_entcount_dirty) {
 		/*
 		 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
 		 */
 		dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
 		    brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
 		    brtvd->bv_entcount, tx);
 		memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks));
 		brtvd->bv_entcount_dirty = FALSE;
 	}
 
 	dmu_buf_will_dirty(db, tx);
 	bvphys = db->db_data;
 	bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
 	bvphys->bvp_size = brtvd->bv_size;
 	if (brtvd->bv_need_byteswap) {
 		bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
 	} else {
 		bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
 	}
 	bvphys->bvp_totalcount = brtvd->bv_totalcount;
 	bvphys->bvp_rangesize = brt->brt_rangesize;
 	bvphys->bvp_usedspace = brtvd->bv_usedspace;
 	bvphys->bvp_savedspace = brtvd->bv_savedspace;
 	dmu_buf_rele(db, FTAG);
 
 	brtvd->bv_meta_dirty = FALSE;
 }
 
 static void
 brt_vdevs_alloc(brt_t *brt, boolean_t load)
 {
 	brt_vdev_t *brtvd;
 	uint64_t vdevid;
 
 	brt_wlock(brt);
 
 	brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children);
 
 	if (load) {
 		for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
 			brtvd = &brt->brt_vdevs[vdevid];
 			ASSERT(brtvd->bv_entcount == NULL);
 
 			brt_vdev_load(brt, brtvd);
 		}
 	}
 
 	if (brt->brt_rangesize == 0) {
 		brt->brt_rangesize = BRT_RANGESIZE;
 	}
 
 	brt_unlock(brt);
 }
 
 static void
 brt_vdevs_free(brt_t *brt)
 {
 	brt_vdev_t *brtvd;
 	uint64_t vdevid;
 
 	brt_wlock(brt);
 
 	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
 		brtvd = &brt->brt_vdevs[vdevid];
 		if (brtvd->bv_initiated)
 			brt_vdev_dealloc(brt, brtvd);
 	}
 	kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs);
 
 	brt_unlock(brt);
 }
 
 static void
 brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
 {
 
 	bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]);
 	bre->bre_refcount = 0;
 
 	*vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
 }
 
 static int
 brt_entry_compare(const void *x1, const void *x2)
 {
 	const brt_entry_t *bre1 = x1;
 	const brt_entry_t *bre2 = x2;
 
 	return (TREE_CMP(bre1->bre_offset, bre2->bre_offset));
 }
 
 static int
 brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
 {
 	uint64_t mos_entries;
 	uint64_t one, physsize;
 	int error;
 
 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
 
 	if (!brt_vdev_lookup(brt, brtvd, bre))
 		return (SET_ERROR(ENOENT));
 
 	/*
 	 * Remember mos_entries object number. After we reacquire the BRT lock,
 	 * the brtvd pointer may be invalid.
 	 */
 	mos_entries = brtvd->bv_mos_entries;
 	if (mos_entries == 0)
 		return (SET_ERROR(ENOENT));
 
 	brt_unlock(brt);
 
 	error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
 	    BRT_KEY_WORDS, &one, &physsize);
 	if (error == 0) {
 		ASSERT3U(one, ==, 1);
 		ASSERT3U(physsize, ==, sizeof (bre->bre_refcount));
 
 		error = zap_lookup_uint64(brt->brt_mos, mos_entries,
 		    &bre->bre_offset, BRT_KEY_WORDS, 1,
 		    sizeof (bre->bre_refcount), &bre->bre_refcount);
 		BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu "
 		    "count=%llu error=%d", (u_longlong_t)mos_entries,
 		    (u_longlong_t)brtvd->bv_vdevid,
 		    (u_longlong_t)bre->bre_offset,
 		    error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error);
 	}
 
 	brt_wlock(brt);
 
 	return (error);
 }
 
 static void
 brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
 {
 	brt_vdev_t *brtvd;
 	uint64_t mos_entries = 0;
 
 	brt_rlock(brt);
 	brtvd = brt_vdev(brt, vdevid);
 	if (brtvd != NULL)
 		mos_entries = brtvd->bv_mos_entries;
 	brt_unlock(brt);
 
 	if (mos_entries == 0)
 		return;
 
 	(void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
 	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
 }
 
 /*
  * Return TRUE if we _can_ have BRT entry for this bp. It might be false
  * positive, but gives us quick answer if we should look into BRT, which
  * may require reads and thus will be more expensive.
  */
 boolean_t
 brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
 {
 	brt_t *brt = spa->spa_brt;
 	brt_vdev_t *brtvd;
 	brt_entry_t bre_search;
 	boolean_t mayexists = FALSE;
 	uint64_t vdevid;
 
 	brt_entry_fill(bp, &bre_search, &vdevid);
 
 	brt_rlock(brt);
 
 	brtvd = brt_vdev(brt, vdevid);
 	if (brtvd != NULL && brtvd->bv_initiated) {
 		if (!avl_is_empty(&brtvd->bv_tree) ||
 		    brt_vdev_lookup(brt, brtvd, &bre_search)) {
 			mayexists = TRUE;
 		}
 	}
 
 	brt_unlock(brt);
 
 	return (mayexists);
 }
 
 uint64_t
 brt_get_dspace(spa_t *spa)
 {
 	brt_t *brt = spa->spa_brt;
 
 	if (brt == NULL)
 		return (0);
 
 	return (brt->brt_savedspace);
 }
 
 uint64_t
 brt_get_used(spa_t *spa)
 {
 	brt_t *brt = spa->spa_brt;
 
 	if (brt == NULL)
 		return (0);
 
 	return (brt->brt_usedspace);
 }
 
 uint64_t
 brt_get_saved(spa_t *spa)
 {
 	brt_t *brt = spa->spa_brt;
 
 	if (brt == NULL)
 		return (0);
 
 	return (brt->brt_savedspace);
 }
 
 uint64_t
 brt_get_ratio(spa_t *spa)
 {
 	brt_t *brt = spa->spa_brt;
 
 	if (brt->brt_usedspace == 0)
 		return (100);
 
 	return ((brt->brt_usedspace + brt->brt_savedspace) * 100 /
 	    brt->brt_usedspace);
 }
 
 static int
 brt_kstats_update(kstat_t *ksp, int rw)
 {
 	brt_stats_t *bs = ksp->ks_data;
 
 	if (rw == KSTAT_WRITE)
 		return (EACCES);
 
 	bs->brt_addref_entry_in_memory.value.ui64 =
 	    wmsum_value(&brt_sums.brt_addref_entry_in_memory);
 	bs->brt_addref_entry_not_on_disk.value.ui64 =
 	    wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
 	bs->brt_addref_entry_on_disk.value.ui64 =
 	    wmsum_value(&brt_sums.brt_addref_entry_on_disk);
 	bs->brt_addref_entry_read_lost_race.value.ui64 =
 	    wmsum_value(&brt_sums.brt_addref_entry_read_lost_race);
 	bs->brt_decref_entry_in_memory.value.ui64 =
 	    wmsum_value(&brt_sums.brt_decref_entry_in_memory);
 	bs->brt_decref_entry_loaded_from_disk.value.ui64 =
 	    wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
 	bs->brt_decref_entry_not_in_memory.value.ui64 =
 	    wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
 	bs->brt_decref_entry_not_on_disk.value.ui64 =
 	    wmsum_value(&brt_sums.brt_decref_entry_not_on_disk);
 	bs->brt_decref_entry_read_lost_race.value.ui64 =
 	    wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
 	bs->brt_decref_entry_still_referenced.value.ui64 =
 	    wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
 	bs->brt_decref_free_data_later.value.ui64 =
 	    wmsum_value(&brt_sums.brt_decref_free_data_later);
 	bs->brt_decref_free_data_now.value.ui64 =
 	    wmsum_value(&brt_sums.brt_decref_free_data_now);
 	bs->brt_decref_no_entry.value.ui64 =
 	    wmsum_value(&brt_sums.brt_decref_no_entry);
 
 	return (0);
 }
 
 static void
 brt_stat_init(void)
 {
 
 	wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0);
 	wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
 	wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
 	wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0);
 	wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
 	wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
 	wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
 	wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0);
 	wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
 	wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
 	wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
 	wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
 	wmsum_init(&brt_sums.brt_decref_no_entry, 0);
 
 	brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 	if (brt_ksp != NULL) {
 		brt_ksp->ks_data = &brt_stats;
 		brt_ksp->ks_update = brt_kstats_update;
 		kstat_install(brt_ksp);
 	}
 }
 
 static void
 brt_stat_fini(void)
 {
 	if (brt_ksp != NULL) {
 		kstat_delete(brt_ksp);
 		brt_ksp = NULL;
 	}
 
 	wmsum_fini(&brt_sums.brt_addref_entry_in_memory);
 	wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
 	wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
 	wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race);
 	wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
 	wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
 	wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
 	wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk);
 	wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
 	wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
 	wmsum_fini(&brt_sums.brt_decref_free_data_later);
 	wmsum_fini(&brt_sums.brt_decref_free_data_now);
 	wmsum_fini(&brt_sums.brt_decref_no_entry);
 }
 
 void
 brt_init(void)
 {
 	brt_entry_cache = kmem_cache_create("brt_entry_cache",
 	    sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 	brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache",
 	    sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 
 	brt_stat_init();
 }
 
 void
 brt_fini(void)
 {
 	brt_stat_fini();
 
 	kmem_cache_destroy(brt_entry_cache);
 	kmem_cache_destroy(brt_pending_entry_cache);
 }
 
 static brt_entry_t *
 brt_entry_alloc(const brt_entry_t *bre_init)
 {
 	brt_entry_t *bre;
 
 	bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
 	bre->bre_offset = bre_init->bre_offset;
 	bre->bre_refcount = bre_init->bre_refcount;
 
 	return (bre);
 }
 
 static void
 brt_entry_free(brt_entry_t *bre)
 {
 
 	kmem_cache_free(brt_entry_cache, bre);
 }
 
 static void
 brt_entry_addref(brt_t *brt, const blkptr_t *bp)
 {
 	brt_vdev_t *brtvd;
 	brt_entry_t *bre, *racebre;
 	brt_entry_t bre_search;
 	avl_index_t where;
 	uint64_t vdevid;
 	int error;
 
 	ASSERT(!RW_WRITE_HELD(&brt->brt_lock));
 
 	brt_entry_fill(bp, &bre_search, &vdevid);
 
 	brt_wlock(brt);
 
 	brtvd = brt_vdev(brt, vdevid);
 	if (brtvd == NULL) {
 		ASSERT3U(vdevid, >=, brt->brt_nvdevs);
 
 		/* New VDEV was added. */
 		brt_vdevs_expand(brt, vdevid + 1);
 		brtvd = brt_vdev(brt, vdevid);
 	}
 	ASSERT(brtvd != NULL);
 	if (!brtvd->bv_initiated)
 		brt_vdev_realloc(brt, brtvd);
 
 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
 	if (bre != NULL) {
 		BRTSTAT_BUMP(brt_addref_entry_in_memory);
 	} else {
 		/*
 		 * brt_entry_lookup() may drop the BRT (read) lock and
 		 * reacquire it (write).
 		 */
 		error = brt_entry_lookup(brt, brtvd, &bre_search);
 		/* bre_search now contains correct bre_refcount */
 		ASSERT(error == 0 || error == ENOENT);
 		if (error == 0)
 			BRTSTAT_BUMP(brt_addref_entry_on_disk);
 		else
 			BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
 		/*
 		 * When the BRT lock was dropped, brt_vdevs[] may have been
 		 * expanded and reallocated, we need to update brtvd's pointer.
 		 */
 		brtvd = brt_vdev(brt, vdevid);
 		ASSERT(brtvd != NULL);
 
 		racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
 		if (racebre == NULL) {
 			bre = brt_entry_alloc(&bre_search);
 			ASSERT(RW_WRITE_HELD(&brt->brt_lock));
 			avl_insert(&brtvd->bv_tree, bre, where);
 			brt->brt_nentries++;
 		} else {
 			/*
 			 * The entry was added when the BRT lock was dropped in
 			 * brt_entry_lookup().
 			 */
 			BRTSTAT_BUMP(brt_addref_entry_read_lost_race);
 			bre = racebre;
 		}
 	}
 	bre->bre_refcount++;
 	brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
 
 	brt_unlock(brt);
 }
 
 /* Return TRUE if block should be freed immediately. */
 boolean_t
 brt_entry_decref(spa_t *spa, const blkptr_t *bp)
 {
 	brt_t *brt = spa->spa_brt;
 	brt_vdev_t *brtvd;
 	brt_entry_t *bre, *racebre;
 	brt_entry_t bre_search;
 	avl_index_t where;
 	uint64_t vdevid;
 	int error;
 
 	brt_entry_fill(bp, &bre_search, &vdevid);
 
 	brt_wlock(brt);
 
 	brtvd = brt_vdev(brt, vdevid);
 	ASSERT(brtvd != NULL);
 
 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
 	if (bre != NULL) {
 		BRTSTAT_BUMP(brt_decref_entry_in_memory);
 		goto out;
 	} else {
 		BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
 	}
 
 	/*
 	 * brt_entry_lookup() may drop the BRT lock and reacquire it.
 	 */
 	error = brt_entry_lookup(brt, brtvd, &bre_search);
 	/* bre_search now contains correct bre_refcount */
 	ASSERT(error == 0 || error == ENOENT);
 	/*
 	 * When the BRT lock was dropped, brt_vdevs[] may have been expanded
 	 * and reallocated, we need to update brtvd's pointer.
 	 */
 	brtvd = brt_vdev(brt, vdevid);
 	ASSERT(brtvd != NULL);
 
 	if (error == ENOENT) {
 		BRTSTAT_BUMP(brt_decref_entry_not_on_disk);
 		bre = NULL;
 		goto out;
 	}
 
 	racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
 	if (racebre != NULL) {
 		/*
 		 * The entry was added when the BRT lock was dropped in
 		 * brt_entry_lookup().
 		 */
 		BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
 		bre = racebre;
 		goto out;
 	}
 
 	BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
 	bre = brt_entry_alloc(&bre_search);
 	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
 	avl_insert(&brtvd->bv_tree, bre, where);
 	brt->brt_nentries++;
 
 out:
 	if (bre == NULL) {
 		/*
 		 * This is a free of a regular (not cloned) block.
 		 */
 		brt_unlock(brt);
 		BRTSTAT_BUMP(brt_decref_no_entry);
 		return (B_TRUE);
 	}
 	if (bre->bre_refcount == 0) {
 		brt_unlock(brt);
 		BRTSTAT_BUMP(brt_decref_free_data_now);
 		return (B_TRUE);
 	}
 
 	ASSERT(bre->bre_refcount > 0);
 	bre->bre_refcount--;
 	if (bre->bre_refcount == 0)
 		BRTSTAT_BUMP(brt_decref_free_data_later);
 	else
 		BRTSTAT_BUMP(brt_decref_entry_still_referenced);
 	brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
 
 	brt_unlock(brt);
 
 	return (B_FALSE);
 }
 
 uint64_t
 brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
 {
 	brt_t *brt = spa->spa_brt;
 	brt_vdev_t *brtvd;
 	brt_entry_t bre_search, *bre;
 	uint64_t vdevid, refcnt;
 	int error;
 
 	brt_entry_fill(bp, &bre_search, &vdevid);
 
 	brt_rlock(brt);
 
 	brtvd = brt_vdev(brt, vdevid);
 	ASSERT(brtvd != NULL);
 
 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
 	if (bre == NULL) {
 		error = brt_entry_lookup(brt, brtvd, &bre_search);
 		ASSERT(error == 0 || error == ENOENT);
 		if (error == ENOENT)
 			refcnt = 0;
 		else
 			refcnt = bre_search.bre_refcount;
 	} else
 		refcnt = bre->bre_refcount;
 
 	brt_unlock(brt);
 	return (refcnt);
 }
 
 static void
 brt_prefetch(brt_t *brt, const blkptr_t *bp)
 {
 	brt_entry_t bre;
 	uint64_t vdevid;
 
 	ASSERT(bp != NULL);
 
 	if (!zfs_brt_prefetch)
 		return;
 
 	brt_entry_fill(bp, &bre, &vdevid);
 
 	brt_entry_prefetch(brt, vdevid, &bre);
 }
 
 static int
 brt_pending_entry_compare(const void *x1, const void *x2)
 {
 	const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2;
 	const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp;
 	int cmp;
 
 	cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
 	    DVA_GET_VDEV(&bp2->blk_dva[0]));
 	if (cmp == 0) {
 		cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
 		    DVA_GET_OFFSET(&bp2->blk_dva[0]));
 		if (unlikely(cmp == 0)) {
-			cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1),
-			    BP_PHYSICAL_BIRTH(bp2));
+			cmp = TREE_CMP(BP_GET_BIRTH(bp1), BP_GET_BIRTH(bp2));
 		}
 	}
 
 	return (cmp);
 }
 
 void
 brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	brt_t *brt;
 	avl_tree_t *pending_tree;
 	kmutex_t *pending_lock;
 	brt_pending_entry_t *bpe, *newbpe;
 	avl_index_t where;
 	uint64_t txg;
 
 	brt = spa->spa_brt;
 	txg = dmu_tx_get_txg(tx);
 	ASSERT3U(txg, !=, 0);
 	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
 	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
 
 	newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP);
 	newbpe->bpe_bp = *bp;
 	newbpe->bpe_count = 1;
 
 	mutex_enter(pending_lock);
 
 	bpe = avl_find(pending_tree, newbpe, &where);
 	if (bpe == NULL) {
 		avl_insert(pending_tree, newbpe, where);
 		newbpe = NULL;
 	} else {
 		bpe->bpe_count++;
 	}
 
 	mutex_exit(pending_lock);
 
 	if (newbpe != NULL) {
 		ASSERT(bpe != NULL);
 		ASSERT(bpe != newbpe);
 		kmem_cache_free(brt_pending_entry_cache, newbpe);
 	} else {
 		ASSERT(bpe == NULL);
 
 		/* Prefetch BRT entry for the syncing context. */
 		brt_prefetch(brt, bp);
 	}
 }
 
 void
 brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	brt_t *brt;
 	avl_tree_t *pending_tree;
 	kmutex_t *pending_lock;
 	brt_pending_entry_t *bpe, bpe_search;
 	uint64_t txg;
 
 	brt = spa->spa_brt;
 	txg = dmu_tx_get_txg(tx);
 	ASSERT3U(txg, !=, 0);
 	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
 	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
 
 	bpe_search.bpe_bp = *bp;
 
 	mutex_enter(pending_lock);
 
 	bpe = avl_find(pending_tree, &bpe_search, NULL);
 	/* I believe we should always find bpe when this function is called. */
 	if (bpe != NULL) {
 		ASSERT(bpe->bpe_count > 0);
 
 		bpe->bpe_count--;
 		if (bpe->bpe_count == 0) {
 			avl_remove(pending_tree, bpe);
 			kmem_cache_free(brt_pending_entry_cache, bpe);
 		}
 	}
 
 	mutex_exit(pending_lock);
 }
 
 void
 brt_pending_apply(spa_t *spa, uint64_t txg)
 {
 	brt_t *brt = spa->spa_brt;
 	brt_pending_entry_t *bpe;
 	avl_tree_t *pending_tree;
 	void *c;
 
 	ASSERT3U(txg, !=, 0);
 
 	/*
 	 * We are in syncing context, so no other brt_pending_tree accesses
 	 * are possible for the TXG. Don't need to acquire brt_pending_lock.
 	 */
 	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
 
 	c = NULL;
 	while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
 		boolean_t added_to_ddt;
 
 		for (int i = 0; i < bpe->bpe_count; i++) {
 			/*
 			 * If the block has DEDUP bit set, it means that it
 			 * already exists in the DEDUP table, so we can just
 			 * use that instead of creating new entry in
 			 * the BRT table.
 			 */
 			if (BP_GET_DEDUP(&bpe->bpe_bp)) {
 				added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
 			} else {
 				added_to_ddt = B_FALSE;
 			}
 			if (!added_to_ddt)
 				brt_entry_addref(brt, &bpe->bpe_bp);
 		}
 
 		kmem_cache_free(brt_pending_entry_cache, bpe);
 	}
 }
 
 static void
 brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
 {
 	if (bre->bre_refcount == 0) {
 		int error = zap_remove_uint64_by_dnode(dn, &bre->bre_offset,
 		    BRT_KEY_WORDS, tx);
 		VERIFY(error == 0 || error == ENOENT);
 	} else {
 		VERIFY0(zap_update_uint64_by_dnode(dn, &bre->bre_offset,
 		    BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount),
 		    &bre->bre_refcount, tx));
 	}
 }
 
 static void
 brt_sync_table(brt_t *brt, dmu_tx_t *tx)
 {
 	brt_vdev_t *brtvd;
 	brt_entry_t *bre;
 	dnode_t *dn;
 	uint64_t vdevid;
 	void *c;
 
 	brt_wlock(brt);
 
 	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
 		brtvd = &brt->brt_vdevs[vdevid];
 
 		if (!brtvd->bv_initiated)
 			continue;
 
 		if (!brtvd->bv_meta_dirty) {
 			ASSERT(!brtvd->bv_entcount_dirty);
 			ASSERT0(avl_numnodes(&brtvd->bv_tree));
 			continue;
 		}
 
 		ASSERT(!brtvd->bv_entcount_dirty ||
 		    avl_numnodes(&brtvd->bv_tree) != 0);
 
 		if (brtvd->bv_mos_brtvdev == 0)
 			brt_vdev_create(brt, brtvd, tx);
 
 		VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries,
 		    FTAG, &dn));
 
 		c = NULL;
 		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
 			brt_sync_entry(dn, bre, tx);
 			brt_entry_free(bre);
 			ASSERT(brt->brt_nentries > 0);
 			brt->brt_nentries--;
 		}
 
 		dnode_rele(dn, FTAG);
 
 		brt_vdev_sync(brt, brtvd, tx);
 
 		if (brtvd->bv_totalcount == 0)
 			brt_vdev_destroy(brt, brtvd, tx);
 	}
 
 	ASSERT0(brt->brt_nentries);
 
 	brt_unlock(brt);
 }
 
 void
 brt_sync(spa_t *spa, uint64_t txg)
 {
 	dmu_tx_t *tx;
 	brt_t *brt;
 
 	ASSERT(spa_syncing_txg(spa) == txg);
 
 	brt = spa->spa_brt;
 	brt_rlock(brt);
 	if (brt->brt_nentries == 0) {
 		/* No changes. */
 		brt_unlock(brt);
 		return;
 	}
 	brt_unlock(brt);
 
 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 
 	brt_sync_table(brt, tx);
 
 	dmu_tx_commit(tx);
 }
 
 static void
 brt_table_alloc(brt_t *brt)
 {
 
 	for (int i = 0; i < TXG_SIZE; i++) {
 		avl_create(&brt->brt_pending_tree[i],
 		    brt_pending_entry_compare,
 		    sizeof (brt_pending_entry_t),
 		    offsetof(brt_pending_entry_t, bpe_node));
 		mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT,
 		    NULL);
 	}
 }
 
 static void
 brt_table_free(brt_t *brt)
 {
 
 	for (int i = 0; i < TXG_SIZE; i++) {
 		ASSERT(avl_is_empty(&brt->brt_pending_tree[i]));
 
 		avl_destroy(&brt->brt_pending_tree[i]);
 		mutex_destroy(&brt->brt_pending_lock[i]);
 	}
 }
 
 static void
 brt_alloc(spa_t *spa)
 {
 	brt_t *brt;
 
 	ASSERT(spa->spa_brt == NULL);
 
 	brt = kmem_zalloc(sizeof (*brt), KM_SLEEP);
 	rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL);
 	brt->brt_spa = spa;
 	brt->brt_rangesize = 0;
 	brt->brt_nentries = 0;
 	brt->brt_vdevs = NULL;
 	brt->brt_nvdevs = 0;
 	brt_table_alloc(brt);
 
 	spa->spa_brt = brt;
 }
 
 void
 brt_create(spa_t *spa)
 {
 
 	brt_alloc(spa);
 	brt_vdevs_alloc(spa->spa_brt, B_FALSE);
 }
 
 int
 brt_load(spa_t *spa)
 {
 
 	brt_alloc(spa);
 	brt_vdevs_alloc(spa->spa_brt, B_TRUE);
 
 	return (0);
 }
 
 void
 brt_unload(spa_t *spa)
 {
 	brt_t *brt = spa->spa_brt;
 
 	if (brt == NULL)
 		return;
 
 	brt_vdevs_free(brt);
 	brt_table_free(brt);
 	rw_destroy(&brt->brt_lock);
 	kmem_free(brt, sizeof (*brt));
 	spa->spa_brt = NULL;
 }
 
 /* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW,
     "Enable prefetching of BRT entries");
 #ifdef ZFS_BRT_DEBUG
 ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug");
 #endif
 /* END CSTYLED */
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 6798fc2d5bdc..4e190c131e1d 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -1,5265 +1,5267 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright (c) 2019, Klara Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
  */
 
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/dmu.h>
 #include <sys/dmu_send.h>
 #include <sys/dmu_impl.h>
 #include <sys/dbuf.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dmu_tx.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_zfetch.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/zfeature.h>
 #include <sys/blkptr.h>
 #include <sys/range_tree.h>
 #include <sys/trace_zfs.h>
 #include <sys/callb.h>
 #include <sys/abd.h>
 #include <sys/brt.h>
 #include <sys/vdev.h>
 #include <cityhash.h>
 #include <sys/spa_impl.h>
 #include <sys/wmsum.h>
 #include <sys/vdev_impl.h>
 
 static kstat_t *dbuf_ksp;
 
 typedef struct dbuf_stats {
 	/*
 	 * Various statistics about the size of the dbuf cache.
 	 */
 	kstat_named_t cache_count;
 	kstat_named_t cache_size_bytes;
 	kstat_named_t cache_size_bytes_max;
 	/*
 	 * Statistics regarding the bounds on the dbuf cache size.
 	 */
 	kstat_named_t cache_target_bytes;
 	kstat_named_t cache_lowater_bytes;
 	kstat_named_t cache_hiwater_bytes;
 	/*
 	 * Total number of dbuf cache evictions that have occurred.
 	 */
 	kstat_named_t cache_total_evicts;
 	/*
 	 * The distribution of dbuf levels in the dbuf cache and
 	 * the total size of all dbufs at each level.
 	 */
 	kstat_named_t cache_levels[DN_MAX_LEVELS];
 	kstat_named_t cache_levels_bytes[DN_MAX_LEVELS];
 	/*
 	 * Statistics about the dbuf hash table.
 	 */
 	kstat_named_t hash_hits;
 	kstat_named_t hash_misses;
 	kstat_named_t hash_collisions;
 	kstat_named_t hash_elements;
 	kstat_named_t hash_elements_max;
 	/*
 	 * Number of sublists containing more than one dbuf in the dbuf
 	 * hash table. Keep track of the longest hash chain.
 	 */
 	kstat_named_t hash_chains;
 	kstat_named_t hash_chain_max;
 	/*
 	 * Number of times a dbuf_create() discovers that a dbuf was
 	 * already created and in the dbuf hash table.
 	 */
 	kstat_named_t hash_insert_race;
 	/*
 	 * Number of entries in the hash table dbuf and mutex arrays.
 	 */
 	kstat_named_t hash_table_count;
 	kstat_named_t hash_mutex_count;
 	/*
 	 * Statistics about the size of the metadata dbuf cache.
 	 */
 	kstat_named_t metadata_cache_count;
 	kstat_named_t metadata_cache_size_bytes;
 	kstat_named_t metadata_cache_size_bytes_max;
 	/*
 	 * For diagnostic purposes, this is incremented whenever we can't add
 	 * something to the metadata cache because it's full, and instead put
 	 * the data in the regular dbuf cache.
 	 */
 	kstat_named_t metadata_cache_overflow;
 } dbuf_stats_t;
 
 dbuf_stats_t dbuf_stats = {
 	{ "cache_count",			KSTAT_DATA_UINT64 },
 	{ "cache_size_bytes",			KSTAT_DATA_UINT64 },
 	{ "cache_size_bytes_max",		KSTAT_DATA_UINT64 },
 	{ "cache_target_bytes",			KSTAT_DATA_UINT64 },
 	{ "cache_lowater_bytes",		KSTAT_DATA_UINT64 },
 	{ "cache_hiwater_bytes",		KSTAT_DATA_UINT64 },
 	{ "cache_total_evicts",			KSTAT_DATA_UINT64 },
 	{ { "cache_levels_N",			KSTAT_DATA_UINT64 } },
 	{ { "cache_levels_bytes_N",		KSTAT_DATA_UINT64 } },
 	{ "hash_hits",				KSTAT_DATA_UINT64 },
 	{ "hash_misses",			KSTAT_DATA_UINT64 },
 	{ "hash_collisions",			KSTAT_DATA_UINT64 },
 	{ "hash_elements",			KSTAT_DATA_UINT64 },
 	{ "hash_elements_max",			KSTAT_DATA_UINT64 },
 	{ "hash_chains",			KSTAT_DATA_UINT64 },
 	{ "hash_chain_max",			KSTAT_DATA_UINT64 },
 	{ "hash_insert_race",			KSTAT_DATA_UINT64 },
 	{ "hash_table_count",			KSTAT_DATA_UINT64 },
 	{ "hash_mutex_count",			KSTAT_DATA_UINT64 },
 	{ "metadata_cache_count",		KSTAT_DATA_UINT64 },
 	{ "metadata_cache_size_bytes",		KSTAT_DATA_UINT64 },
 	{ "metadata_cache_size_bytes_max",	KSTAT_DATA_UINT64 },
 	{ "metadata_cache_overflow",		KSTAT_DATA_UINT64 }
 };
 
 struct {
 	wmsum_t cache_count;
 	wmsum_t cache_total_evicts;
 	wmsum_t cache_levels[DN_MAX_LEVELS];
 	wmsum_t cache_levels_bytes[DN_MAX_LEVELS];
 	wmsum_t hash_hits;
 	wmsum_t hash_misses;
 	wmsum_t hash_collisions;
 	wmsum_t hash_chains;
 	wmsum_t hash_insert_race;
 	wmsum_t metadata_cache_count;
 	wmsum_t metadata_cache_overflow;
 } dbuf_sums;
 
 #define	DBUF_STAT_INCR(stat, val)	\
 	wmsum_add(&dbuf_sums.stat, val);
 #define	DBUF_STAT_DECR(stat, val)	\
 	DBUF_STAT_INCR(stat, -(val));
 #define	DBUF_STAT_BUMP(stat)		\
 	DBUF_STAT_INCR(stat, 1);
 #define	DBUF_STAT_BUMPDOWN(stat)	\
 	DBUF_STAT_INCR(stat, -1);
 #define	DBUF_STAT_MAX(stat, v) {					\
 	uint64_t _m;							\
 	while ((v) > (_m = dbuf_stats.stat.value.ui64) &&		\
 	    (_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\
 		continue;						\
 }
 
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
 static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags);
 
 /*
  * Global data structures and functions for the dbuf cache.
  */
 static kmem_cache_t *dbuf_kmem_cache;
 static taskq_t *dbu_evict_taskq;
 
 static kthread_t *dbuf_cache_evict_thread;
 static kmutex_t dbuf_evict_lock;
 static kcondvar_t dbuf_evict_cv;
 static boolean_t dbuf_evict_thread_exit;
 
 /*
  * There are two dbuf caches; each dbuf can only be in one of them at a time.
  *
  * 1. Cache of metadata dbufs, to help make read-heavy administrative commands
  *    from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs
  *    that represent the metadata that describes filesystems/snapshots/
  *    bookmarks/properties/etc. We only evict from this cache when we export a
  *    pool, to short-circuit as much I/O as possible for all administrative
  *    commands that need the metadata. There is no eviction policy for this
  *    cache, because we try to only include types in it which would occupy a
  *    very small amount of space per object but create a large impact on the
  *    performance of these commands. Instead, after it reaches a maximum size
  *    (which should only happen on very small memory systems with a very large
  *    number of filesystem objects), we stop taking new dbufs into the
  *    metadata cache, instead putting them in the normal dbuf cache.
  *
  * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
  *    are not currently held but have been recently released. These dbufs
  *    are not eligible for arc eviction until they are aged out of the cache.
  *    Dbufs that are aged out of the cache will be immediately destroyed and
  *    become eligible for arc eviction.
  *
  * Dbufs are added to these caches once the last hold is released. If a dbuf is
  * later accessed and still exists in the dbuf cache, then it will be removed
  * from the cache and later re-added to the head of the cache.
  *
  * If a given dbuf meets the requirements for the metadata cache, it will go
  * there, otherwise it will be considered for the generic LRU dbuf cache. The
  * caches and the refcounts tracking their sizes are stored in an array indexed
  * by those caches' matching enum values (from dbuf_cached_state_t).
  */
 typedef struct dbuf_cache {
 	multilist_t cache;
 	zfs_refcount_t size ____cacheline_aligned;
 } dbuf_cache_t;
 dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
 
 /* Size limits for the caches */
 static uint64_t dbuf_cache_max_bytes = UINT64_MAX;
 static uint64_t dbuf_metadata_cache_max_bytes = UINT64_MAX;
 
 /* Set the default sizes of the caches to log2 fraction of arc size */
 static uint_t dbuf_cache_shift = 5;
 static uint_t dbuf_metadata_cache_shift = 6;
 
 /* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
 static uint_t dbuf_mutex_cache_shift = 0;
 
 static unsigned long dbuf_cache_target_bytes(void);
 static unsigned long dbuf_metadata_cache_target_bytes(void);
 
 /*
  * The LRU dbuf cache uses a three-stage eviction policy:
  *	- A low water marker designates when the dbuf eviction thread
  *	should stop evicting from the dbuf cache.
  *	- When we reach the maximum size (aka mid water mark), we
  *	signal the eviction thread to run.
  *	- The high water mark indicates when the eviction thread
  *	is unable to keep up with the incoming load and eviction must
  *	happen in the context of the calling thread.
  *
  * The dbuf cache:
  *                                                 (max size)
  *                                      low water   mid water   hi water
  * +----------------------------------------+----------+----------+
  * |                                        |          |          |
  * |                                        |          |          |
  * |                                        |          |          |
  * |                                        |          |          |
  * +----------------------------------------+----------+----------+
  *                                        stop        signal     evict
  *                                      evicting     eviction   directly
  *                                                    thread
  *
  * The high and low water marks indicate the operating range for the eviction
  * thread. The low water mark is, by default, 90% of the total size of the
  * cache and the high water mark is at 110% (both of these percentages can be
  * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
  * respectively). The eviction thread will try to ensure that the cache remains
  * within this range by waking up every second and checking if the cache is
  * above the low water mark. The thread can also be woken up by callers adding
  * elements into the cache if the cache is larger than the mid water (i.e max
  * cache size). Once the eviction thread is woken up and eviction is required,
  * it will continue evicting buffers until it's able to reduce the cache size
  * to the low water mark. If the cache size continues to grow and hits the high
  * water mark, then callers adding elements to the cache will begin to evict
  * directly from the cache until the cache is no longer above the high water
  * mark.
  */
 
 /*
  * The percentage above and below the maximum cache size.
  */
 static uint_t dbuf_cache_hiwater_pct = 10;
 static uint_t dbuf_cache_lowater_pct = 10;
 
 static int
 dbuf_cons(void *vdb, void *unused, int kmflag)
 {
 	(void) unused, (void) kmflag;
 	dmu_buf_impl_t *db = vdb;
 	memset(db, 0, sizeof (dmu_buf_impl_t));
 
 	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
 	rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL);
 	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
 	multilist_link_init(&db->db_cache_link);
 	zfs_refcount_create(&db->db_holds);
 
 	return (0);
 }
 
 static void
 dbuf_dest(void *vdb, void *unused)
 {
 	(void) unused;
 	dmu_buf_impl_t *db = vdb;
 	mutex_destroy(&db->db_mtx);
 	rw_destroy(&db->db_rwlock);
 	cv_destroy(&db->db_changed);
 	ASSERT(!multilist_link_active(&db->db_cache_link));
 	zfs_refcount_destroy(&db->db_holds);
 }
 
 /*
  * dbuf hash table routines
  */
 static dbuf_hash_table_t dbuf_hash_table;
 
 /*
  * We use Cityhash for this. It's fast, and has good hash properties without
  * requiring any large static buffers.
  */
 static uint64_t
 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
 {
 	return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid));
 }
 
 #define	DTRACE_SET_STATE(db, why) \
 	DTRACE_PROBE2(dbuf__state_change, dmu_buf_impl_t *, db,	\
 	    const char *, why)
 
 #define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
 	((dbuf)->db.db_object == (obj) &&		\
 	(dbuf)->db_objset == (os) &&			\
 	(dbuf)->db_level == (level) &&			\
 	(dbuf)->db_blkid == (blkid))
 
 dmu_buf_impl_t *
 dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid,
     uint64_t *hash_out)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	uint64_t hv;
 	uint64_t idx;
 	dmu_buf_impl_t *db;
 
 	hv = dbuf_hash(os, obj, level, blkid);
 	idx = hv & h->hash_table_mask;
 
 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
 	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
 		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
 			mutex_enter(&db->db_mtx);
 			if (db->db_state != DB_EVICTING) {
 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
 				return (db);
 			}
 			mutex_exit(&db->db_mtx);
 		}
 	}
 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
 	if (hash_out != NULL)
 		*hash_out = hv;
 	return (NULL);
 }
 
 static dmu_buf_impl_t *
 dbuf_find_bonus(objset_t *os, uint64_t object)
 {
 	dnode_t *dn;
 	dmu_buf_impl_t *db = NULL;
 
 	if (dnode_hold(os, object, FTAG, &dn) == 0) {
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		if (dn->dn_bonus != NULL) {
 			db = dn->dn_bonus;
 			mutex_enter(&db->db_mtx);
 		}
 		rw_exit(&dn->dn_struct_rwlock);
 		dnode_rele(dn, FTAG);
 	}
 	return (db);
 }
 
 /*
  * Insert an entry into the hash table.  If there is already an element
  * equal to elem in the hash table, then the already existing element
  * will be returned and the new element will not be inserted.
  * Otherwise returns NULL.
  */
 static dmu_buf_impl_t *
 dbuf_hash_insert(dmu_buf_impl_t *db)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	objset_t *os = db->db_objset;
 	uint64_t obj = db->db.db_object;
 	int level = db->db_level;
 	uint64_t blkid, idx;
 	dmu_buf_impl_t *dbf;
 	uint32_t i;
 
 	blkid = db->db_blkid;
 	ASSERT3U(dbuf_hash(os, obj, level, blkid), ==, db->db_hash);
 	idx = db->db_hash & h->hash_table_mask;
 
 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
 	for (dbf = h->hash_table[idx], i = 0; dbf != NULL;
 	    dbf = dbf->db_hash_next, i++) {
 		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
 			mutex_enter(&dbf->db_mtx);
 			if (dbf->db_state != DB_EVICTING) {
 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
 				return (dbf);
 			}
 			mutex_exit(&dbf->db_mtx);
 		}
 	}
 
 	if (i > 0) {
 		DBUF_STAT_BUMP(hash_collisions);
 		if (i == 1)
 			DBUF_STAT_BUMP(hash_chains);
 
 		DBUF_STAT_MAX(hash_chain_max, i);
 	}
 
 	mutex_enter(&db->db_mtx);
 	db->db_hash_next = h->hash_table[idx];
 	h->hash_table[idx] = db;
 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
 	uint64_t he = atomic_inc_64_nv(&dbuf_stats.hash_elements.value.ui64);
 	DBUF_STAT_MAX(hash_elements_max, he);
 
 	return (NULL);
 }
 
 /*
  * This returns whether this dbuf should be stored in the metadata cache, which
  * is based on whether it's from one of the dnode types that store data related
  * to traversing dataset hierarchies.
  */
 static boolean_t
 dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
 {
 	DB_DNODE_ENTER(db);
 	dmu_object_type_t type = DB_DNODE(db)->dn_type;
 	DB_DNODE_EXIT(db);
 
 	/* Check if this dbuf is one of the types we care about */
 	if (DMU_OT_IS_METADATA_CACHED(type)) {
 		/* If we hit this, then we set something up wrong in dmu_ot */
 		ASSERT(DMU_OT_IS_METADATA(type));
 
 		/*
 		 * Sanity check for small-memory systems: don't allocate too
 		 * much memory for this purpose.
 		 */
 		if (zfs_refcount_count(
 		    &dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
 		    dbuf_metadata_cache_target_bytes()) {
 			DBUF_STAT_BUMP(metadata_cache_overflow);
 			return (B_FALSE);
 		}
 
 		return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Remove an entry from the hash table.  It must be in the EVICTING state.
  */
 static void
 dbuf_hash_remove(dmu_buf_impl_t *db)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	uint64_t idx;
 	dmu_buf_impl_t *dbf, **dbp;
 
 	ASSERT3U(dbuf_hash(db->db_objset, db->db.db_object, db->db_level,
 	    db->db_blkid), ==, db->db_hash);
 	idx = db->db_hash & h->hash_table_mask;
 
 	/*
 	 * We mustn't hold db_mtx to maintain lock ordering:
 	 * DBUF_HASH_MUTEX > db_mtx.
 	 */
 	ASSERT(zfs_refcount_is_zero(&db->db_holds));
 	ASSERT(db->db_state == DB_EVICTING);
 	ASSERT(!MUTEX_HELD(&db->db_mtx));
 
 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
 	dbp = &h->hash_table[idx];
 	while ((dbf = *dbp) != db) {
 		dbp = &dbf->db_hash_next;
 		ASSERT(dbf != NULL);
 	}
 	*dbp = db->db_hash_next;
 	db->db_hash_next = NULL;
 	if (h->hash_table[idx] &&
 	    h->hash_table[idx]->db_hash_next == NULL)
 		DBUF_STAT_BUMPDOWN(hash_chains);
 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
 	atomic_dec_64(&dbuf_stats.hash_elements.value.ui64);
 }
 
 typedef enum {
 	DBVU_EVICTING,
 	DBVU_NOT_EVICTING
 } dbvu_verify_type_t;
 
 static void
 dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
 {
 #ifdef ZFS_DEBUG
 	int64_t holds;
 
 	if (db->db_user == NULL)
 		return;
 
 	/* Only data blocks support the attachment of user data. */
 	ASSERT(db->db_level == 0);
 
 	/* Clients must resolve a dbuf before attaching user data. */
 	ASSERT(db->db.db_data != NULL);
 	ASSERT3U(db->db_state, ==, DB_CACHED);
 
 	holds = zfs_refcount_count(&db->db_holds);
 	if (verify_type == DBVU_EVICTING) {
 		/*
 		 * Immediate eviction occurs when holds == dirtycnt.
 		 * For normal eviction buffers, holds is zero on
 		 * eviction, except when dbuf_fix_old_data() calls
 		 * dbuf_clear_data().  However, the hold count can grow
 		 * during eviction even though db_mtx is held (see
 		 * dmu_bonus_hold() for an example), so we can only
 		 * test the generic invariant that holds >= dirtycnt.
 		 */
 		ASSERT3U(holds, >=, db->db_dirtycnt);
 	} else {
 		if (db->db_user_immediate_evict == TRUE)
 			ASSERT3U(holds, >=, db->db_dirtycnt);
 		else
 			ASSERT3U(holds, >, 0);
 	}
 #endif
 }
 
 static void
 dbuf_evict_user(dmu_buf_impl_t *db)
 {
 	dmu_buf_user_t *dbu = db->db_user;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	if (dbu == NULL)
 		return;
 
 	dbuf_verify_user(db, DBVU_EVICTING);
 	db->db_user = NULL;
 
 #ifdef ZFS_DEBUG
 	if (dbu->dbu_clear_on_evict_dbufp != NULL)
 		*dbu->dbu_clear_on_evict_dbufp = NULL;
 #endif
 
 	if (db->db_caching_status != DB_NO_CACHE) {
 		/*
 		 * This is a cached dbuf, so the size of the user data is
 		 * included in its cached amount. We adjust it here because the
 		 * user data has already been detached from the dbuf, and the
 		 * sync functions are not supposed to touch it (the dbuf might
 		 * not exist anymore by the time the sync functions run.
 		 */
 		uint64_t size = dbu->dbu_size;
 		(void) zfs_refcount_remove_many(
 		    &dbuf_caches[db->db_caching_status].size, size, db);
 		if (db->db_caching_status == DB_DBUF_CACHE)
 			DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
 	}
 
 	/*
 	 * There are two eviction callbacks - one that we call synchronously
 	 * and one that we invoke via a taskq.  The async one is useful for
 	 * avoiding lock order reversals and limiting stack depth.
 	 *
 	 * Note that if we have a sync callback but no async callback,
 	 * it's likely that the sync callback will free the structure
 	 * containing the dbu.  In that case we need to take care to not
 	 * dereference dbu after calling the sync evict func.
 	 */
 	boolean_t has_async = (dbu->dbu_evict_func_async != NULL);
 
 	if (dbu->dbu_evict_func_sync != NULL)
 		dbu->dbu_evict_func_sync(dbu);
 
 	if (has_async) {
 		taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async,
 		    dbu, 0, &dbu->dbu_tqent);
 	}
 }
 
 boolean_t
 dbuf_is_metadata(dmu_buf_impl_t *db)
 {
 	/*
 	 * Consider indirect blocks and spill blocks to be meta data.
 	 */
 	if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) {
 		return (B_TRUE);
 	} else {
 		boolean_t is_metadata;
 
 		DB_DNODE_ENTER(db);
 		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
 		DB_DNODE_EXIT(db);
 
 		return (is_metadata);
 	}
 }
 
 /*
  * We want to exclude buffers that are on a special allocation class from
  * L2ARC.
  */
 boolean_t
 dbuf_is_l2cacheable(dmu_buf_impl_t *db)
 {
 	if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||
 	    (db->db_objset->os_secondary_cache ==
 	    ZFS_CACHE_METADATA && dbuf_is_metadata(db))) {
 		if (l2arc_exclude_special == 0)
 			return (B_TRUE);
 
 		blkptr_t *bp = db->db_blkptr;
 		if (bp == NULL || BP_IS_HOLE(bp))
 			return (B_FALSE);
 		uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
 		vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev;
 		vdev_t *vd = NULL;
 
 		if (vdev < rvd->vdev_children)
 			vd = rvd->vdev_child[vdev];
 
 		if (vd == NULL)
 			return (B_TRUE);
 
 		if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
 		    vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
 			return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 static inline boolean_t
 dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level)
 {
 	if (dn->dn_objset->os_secondary_cache == ZFS_CACHE_ALL ||
 	    (dn->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA &&
 	    (level > 0 ||
 	    DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)))) {
 		if (l2arc_exclude_special == 0)
 			return (B_TRUE);
 
 		if (bp == NULL || BP_IS_HOLE(bp))
 			return (B_FALSE);
 		uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
 		vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev;
 		vdev_t *vd = NULL;
 
 		if (vdev < rvd->vdev_children)
 			vd = rvd->vdev_child[vdev];
 
 		if (vd == NULL)
 			return (B_TRUE);
 
 		if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
 		    vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
 			return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 
 /*
  * This function *must* return indices evenly distributed between all
  * sublists of the multilist. This is needed due to how the dbuf eviction
  * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
  * distributed between all sublists and uses this assumption when
  * deciding which sublist to evict from and how much to evict from it.
  */
 static unsigned int
 dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
 {
 	dmu_buf_impl_t *db = obj;
 
 	/*
 	 * The assumption here, is the hash value for a given
 	 * dmu_buf_impl_t will remain constant throughout it's lifetime
 	 * (i.e. it's objset, object, level and blkid fields don't change).
 	 * Thus, we don't need to store the dbuf's sublist index
 	 * on insertion, as this index can be recalculated on removal.
 	 *
 	 * Also, the low order bits of the hash value are thought to be
 	 * distributed evenly. Otherwise, in the case that the multilist
 	 * has a power of two number of sublists, each sublists' usage
 	 * would not be evenly distributed. In this context full 64bit
 	 * division would be a waste of time, so limit it to 32 bits.
 	 */
 	return ((unsigned int)dbuf_hash(db->db_objset, db->db.db_object,
 	    db->db_level, db->db_blkid) %
 	    multilist_get_num_sublists(ml));
 }
 
 /*
  * The target size of the dbuf cache can grow with the ARC target,
  * unless limited by the tunable dbuf_cache_max_bytes.
  */
 static inline unsigned long
 dbuf_cache_target_bytes(void)
 {
 	return (MIN(dbuf_cache_max_bytes,
 	    arc_target_bytes() >> dbuf_cache_shift));
 }
 
 /*
  * The target size of the dbuf metadata cache can grow with the ARC target,
  * unless limited by the tunable dbuf_metadata_cache_max_bytes.
  */
 static inline unsigned long
 dbuf_metadata_cache_target_bytes(void)
 {
 	return (MIN(dbuf_metadata_cache_max_bytes,
 	    arc_target_bytes() >> dbuf_metadata_cache_shift));
 }
 
 static inline uint64_t
 dbuf_cache_hiwater_bytes(void)
 {
 	uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
 	return (dbuf_cache_target +
 	    (dbuf_cache_target * dbuf_cache_hiwater_pct) / 100);
 }
 
 static inline uint64_t
 dbuf_cache_lowater_bytes(void)
 {
 	uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
 	return (dbuf_cache_target -
 	    (dbuf_cache_target * dbuf_cache_lowater_pct) / 100);
 }
 
 static inline boolean_t
 dbuf_cache_above_lowater(void)
 {
 	return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
 	    dbuf_cache_lowater_bytes());
 }
 
 /*
  * Evict the oldest eligible dbuf from the dbuf cache.
  */
 static void
 dbuf_evict_one(void)
 {
 	int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
 	multilist_sublist_t *mls = multilist_sublist_lock(
 	    &dbuf_caches[DB_DBUF_CACHE].cache, idx);
 
 	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
 
 	dmu_buf_impl_t *db = multilist_sublist_tail(mls);
 	while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
 		db = multilist_sublist_prev(mls, db);
 	}
 
 	DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
 	    multilist_sublist_t *, mls);
 
 	if (db != NULL) {
 		multilist_sublist_remove(mls, db);
 		multilist_sublist_unlock(mls);
 		uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
 		(void) zfs_refcount_remove_many(
 		    &dbuf_caches[DB_DBUF_CACHE].size, size, db);
 		DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
 		DBUF_STAT_BUMPDOWN(cache_count);
 		DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
 		ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
 		db->db_caching_status = DB_NO_CACHE;
 		dbuf_destroy(db);
 		DBUF_STAT_BUMP(cache_total_evicts);
 	} else {
 		multilist_sublist_unlock(mls);
 	}
 }
 
 /*
  * The dbuf evict thread is responsible for aging out dbufs from the
  * cache. Once the cache has reached it's maximum size, dbufs are removed
  * and destroyed. The eviction thread will continue running until the size
  * of the dbuf cache is at or below the maximum size. Once the dbuf is aged
  * out of the cache it is destroyed and becomes eligible for arc eviction.
  */
 static __attribute__((noreturn)) void
 dbuf_evict_thread(void *unused)
 {
 	(void) unused;
 	callb_cpr_t cpr;
 
 	CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
 
 	mutex_enter(&dbuf_evict_lock);
 	while (!dbuf_evict_thread_exit) {
 		while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
 			CALLB_CPR_SAFE_BEGIN(&cpr);
 			(void) cv_timedwait_idle_hires(&dbuf_evict_cv,
 			    &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
 			CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
 		}
 		mutex_exit(&dbuf_evict_lock);
 
 		/*
 		 * Keep evicting as long as we're above the low water mark
 		 * for the cache. We do this without holding the locks to
 		 * minimize lock contention.
 		 */
 		while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
 			dbuf_evict_one();
 		}
 
 		mutex_enter(&dbuf_evict_lock);
 	}
 
 	dbuf_evict_thread_exit = B_FALSE;
 	cv_broadcast(&dbuf_evict_cv);
 	CALLB_CPR_EXIT(&cpr);	/* drops dbuf_evict_lock */
 	thread_exit();
 }
 
 /*
  * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
  * If the dbuf cache is at its high water mark, then evict a dbuf from the
  * dbuf cache using the caller's context.
  */
 static void
 dbuf_evict_notify(uint64_t size)
 {
 	/*
 	 * We check if we should evict without holding the dbuf_evict_lock,
 	 * because it's OK to occasionally make the wrong decision here,
 	 * and grabbing the lock results in massive lock contention.
 	 */
 	if (size > dbuf_cache_target_bytes()) {
 		if (size > dbuf_cache_hiwater_bytes())
 			dbuf_evict_one();
 		cv_signal(&dbuf_evict_cv);
 	}
 }
 
 static int
 dbuf_kstat_update(kstat_t *ksp, int rw)
 {
 	dbuf_stats_t *ds = ksp->ks_data;
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 
 	if (rw == KSTAT_WRITE)
 		return (SET_ERROR(EACCES));
 
 	ds->cache_count.value.ui64 =
 	    wmsum_value(&dbuf_sums.cache_count);
 	ds->cache_size_bytes.value.ui64 =
 	    zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);
 	ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes();
 	ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes();
 	ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes();
 	ds->cache_total_evicts.value.ui64 =
 	    wmsum_value(&dbuf_sums.cache_total_evicts);
 	for (int i = 0; i < DN_MAX_LEVELS; i++) {
 		ds->cache_levels[i].value.ui64 =
 		    wmsum_value(&dbuf_sums.cache_levels[i]);
 		ds->cache_levels_bytes[i].value.ui64 =
 		    wmsum_value(&dbuf_sums.cache_levels_bytes[i]);
 	}
 	ds->hash_hits.value.ui64 =
 	    wmsum_value(&dbuf_sums.hash_hits);
 	ds->hash_misses.value.ui64 =
 	    wmsum_value(&dbuf_sums.hash_misses);
 	ds->hash_collisions.value.ui64 =
 	    wmsum_value(&dbuf_sums.hash_collisions);
 	ds->hash_chains.value.ui64 =
 	    wmsum_value(&dbuf_sums.hash_chains);
 	ds->hash_insert_race.value.ui64 =
 	    wmsum_value(&dbuf_sums.hash_insert_race);
 	ds->hash_table_count.value.ui64 = h->hash_table_mask + 1;
 	ds->hash_mutex_count.value.ui64 = h->hash_mutex_mask + 1;
 	ds->metadata_cache_count.value.ui64 =
 	    wmsum_value(&dbuf_sums.metadata_cache_count);
 	ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count(
 	    &dbuf_caches[DB_DBUF_METADATA_CACHE].size);
 	ds->metadata_cache_overflow.value.ui64 =
 	    wmsum_value(&dbuf_sums.metadata_cache_overflow);
 	return (0);
 }
 
 void
 dbuf_init(void)
 {
 	uint64_t hmsize, hsize = 1ULL << 16;
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 
 	/*
 	 * The hash table is big enough to fill one eighth of physical memory
 	 * with an average block size of zfs_arc_average_blocksize (default 8K).
 	 * By default, the table will take up
 	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
 	 */
 	while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8)
 		hsize <<= 1;
 
 	h->hash_table = NULL;
 	while (h->hash_table == NULL) {
 		h->hash_table_mask = hsize - 1;
 
 		h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
 		if (h->hash_table == NULL)
 			hsize >>= 1;
 
 		ASSERT3U(hsize, >=, 1ULL << 10);
 	}
 
 	/*
 	 * The hash table buckets are protected by an array of mutexes where
 	 * each mutex is reponsible for protecting 128 buckets.  A minimum
 	 * array size of 8192 is targeted to avoid contention.
 	 */
 	if (dbuf_mutex_cache_shift == 0)
 		hmsize = MAX(hsize >> 7, 1ULL << 13);
 	else
 		hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24);
 
 	h->hash_mutexes = NULL;
 	while (h->hash_mutexes == NULL) {
 		h->hash_mutex_mask = hmsize - 1;
 
 		h->hash_mutexes = vmem_zalloc(hmsize * sizeof (kmutex_t),
 		    KM_SLEEP);
 		if (h->hash_mutexes == NULL)
 			hmsize >>= 1;
 	}
 
 	dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
 	    sizeof (dmu_buf_impl_t),
 	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
 
 	for (int i = 0; i < hmsize; i++)
 		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
 
 	dbuf_stats_init(h);
 
 	/*
 	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
 	 * configuration is not required.
 	 */
 	dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
 
 	for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
 		multilist_create(&dbuf_caches[dcs].cache,
 		    sizeof (dmu_buf_impl_t),
 		    offsetof(dmu_buf_impl_t, db_cache_link),
 		    dbuf_cache_multilist_index_func);
 		zfs_refcount_create(&dbuf_caches[dcs].size);
 	}
 
 	dbuf_evict_thread_exit = B_FALSE;
 	mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
 	dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
 	    NULL, 0, &p0, TS_RUN, minclsyspri);
 
 	wmsum_init(&dbuf_sums.cache_count, 0);
 	wmsum_init(&dbuf_sums.cache_total_evicts, 0);
 	for (int i = 0; i < DN_MAX_LEVELS; i++) {
 		wmsum_init(&dbuf_sums.cache_levels[i], 0);
 		wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0);
 	}
 	wmsum_init(&dbuf_sums.hash_hits, 0);
 	wmsum_init(&dbuf_sums.hash_misses, 0);
 	wmsum_init(&dbuf_sums.hash_collisions, 0);
 	wmsum_init(&dbuf_sums.hash_chains, 0);
 	wmsum_init(&dbuf_sums.hash_insert_race, 0);
 	wmsum_init(&dbuf_sums.metadata_cache_count, 0);
 	wmsum_init(&dbuf_sums.metadata_cache_overflow, 0);
 
 	dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc",
 	    KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
 	if (dbuf_ksp != NULL) {
 		for (int i = 0; i < DN_MAX_LEVELS; i++) {
 			snprintf(dbuf_stats.cache_levels[i].name,
 			    KSTAT_STRLEN, "cache_level_%d", i);
 			dbuf_stats.cache_levels[i].data_type =
 			    KSTAT_DATA_UINT64;
 			snprintf(dbuf_stats.cache_levels_bytes[i].name,
 			    KSTAT_STRLEN, "cache_level_%d_bytes", i);
 			dbuf_stats.cache_levels_bytes[i].data_type =
 			    KSTAT_DATA_UINT64;
 		}
 		dbuf_ksp->ks_data = &dbuf_stats;
 		dbuf_ksp->ks_update = dbuf_kstat_update;
 		kstat_install(dbuf_ksp);
 	}
 }
 
 void
 dbuf_fini(void)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 
 	dbuf_stats_destroy();
 
 	for (int i = 0; i < (h->hash_mutex_mask + 1); i++)
 		mutex_destroy(&h->hash_mutexes[i]);
 
 	vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
 	vmem_free(h->hash_mutexes, (h->hash_mutex_mask + 1) *
 	    sizeof (kmutex_t));
 
 	kmem_cache_destroy(dbuf_kmem_cache);
 	taskq_destroy(dbu_evict_taskq);
 
 	mutex_enter(&dbuf_evict_lock);
 	dbuf_evict_thread_exit = B_TRUE;
 	while (dbuf_evict_thread_exit) {
 		cv_signal(&dbuf_evict_cv);
 		cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
 	}
 	mutex_exit(&dbuf_evict_lock);
 
 	mutex_destroy(&dbuf_evict_lock);
 	cv_destroy(&dbuf_evict_cv);
 
 	for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
 		zfs_refcount_destroy(&dbuf_caches[dcs].size);
 		multilist_destroy(&dbuf_caches[dcs].cache);
 	}
 
 	if (dbuf_ksp != NULL) {
 		kstat_delete(dbuf_ksp);
 		dbuf_ksp = NULL;
 	}
 
 	wmsum_fini(&dbuf_sums.cache_count);
 	wmsum_fini(&dbuf_sums.cache_total_evicts);
 	for (int i = 0; i < DN_MAX_LEVELS; i++) {
 		wmsum_fini(&dbuf_sums.cache_levels[i]);
 		wmsum_fini(&dbuf_sums.cache_levels_bytes[i]);
 	}
 	wmsum_fini(&dbuf_sums.hash_hits);
 	wmsum_fini(&dbuf_sums.hash_misses);
 	wmsum_fini(&dbuf_sums.hash_collisions);
 	wmsum_fini(&dbuf_sums.hash_chains);
 	wmsum_fini(&dbuf_sums.hash_insert_race);
 	wmsum_fini(&dbuf_sums.metadata_cache_count);
 	wmsum_fini(&dbuf_sums.metadata_cache_overflow);
 }
 
 /*
  * Other stuff.
  */
 
 #ifdef ZFS_DEBUG
 static void
 dbuf_verify(dmu_buf_impl_t *db)
 {
 	dnode_t *dn;
 	dbuf_dirty_record_t *dr;
 	uint32_t txg_prev;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
 		return;
 
 	ASSERT(db->db_objset != NULL);
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	if (dn == NULL) {
 		ASSERT(db->db_parent == NULL);
 		ASSERT(db->db_blkptr == NULL);
 	} else {
 		ASSERT3U(db->db.db_object, ==, dn->dn_object);
 		ASSERT3P(db->db_objset, ==, dn->dn_objset);
 		ASSERT3U(db->db_level, <, dn->dn_nlevels);
 		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
 		    db->db_blkid == DMU_SPILL_BLKID ||
 		    !avl_is_empty(&dn->dn_dbufs));
 	}
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		ASSERT(dn != NULL);
 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
 	} else if (db->db_blkid == DMU_SPILL_BLKID) {
 		ASSERT(dn != NULL);
 		ASSERT0(db->db.db_offset);
 	} else {
 		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
 	}
 
 	if ((dr = list_head(&db->db_dirty_records)) != NULL) {
 		ASSERT(dr->dr_dbuf == db);
 		txg_prev = dr->dr_txg;
 		for (dr = list_next(&db->db_dirty_records, dr); dr != NULL;
 		    dr = list_next(&db->db_dirty_records, dr)) {
 			ASSERT(dr->dr_dbuf == db);
 			ASSERT(txg_prev > dr->dr_txg);
 			txg_prev = dr->dr_txg;
 		}
 	}
 
 	/*
 	 * We can't assert that db_size matches dn_datablksz because it
 	 * can be momentarily different when another thread is doing
 	 * dnode_set_blksz().
 	 */
 	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
 		dr = db->db_data_pending;
 		/*
 		 * It should only be modified in syncing context, so
 		 * make sure we only have one copy of the data.
 		 */
 		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
 	}
 
 	/* verify db->db_blkptr */
 	if (db->db_blkptr) {
 		if (db->db_parent == dn->dn_dbuf) {
 			/* db is pointed to by the dnode */
 			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
 			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
 				ASSERT(db->db_parent == NULL);
 			else
 				ASSERT(db->db_parent != NULL);
 			if (db->db_blkid != DMU_SPILL_BLKID)
 				ASSERT3P(db->db_blkptr, ==,
 				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
 		} else {
 			/* db is pointed to by an indirect block */
 			int epb __maybe_unused = db->db_parent->db.db_size >>
 			    SPA_BLKPTRSHIFT;
 			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
 			ASSERT3U(db->db_parent->db.db_object, ==,
 			    db->db.db_object);
 			/*
 			 * dnode_grow_indblksz() can make this fail if we don't
 			 * have the parent's rwlock.  XXX indblksz no longer
 			 * grows.  safe to do this now?
 			 */
 			if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) {
 				ASSERT3P(db->db_blkptr, ==,
 				    ((blkptr_t *)db->db_parent->db.db_data +
 				    db->db_blkid % epb));
 			}
 		}
 	}
 	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
 	    (db->db_buf == NULL || db->db_buf->b_data) &&
 	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
 	    db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) {
 		/*
 		 * If the blkptr isn't set but they have nonzero data,
 		 * it had better be dirty, otherwise we'll lose that
 		 * data when we evict this buffer.
 		 *
 		 * There is an exception to this rule for indirect blocks; in
 		 * this case, if the indirect block is a hole, we fill in a few
 		 * fields on each of the child blocks (importantly, birth time)
 		 * to prevent hole birth times from being lost when you
 		 * partially fill in a hole.
 		 */
 		if (db->db_dirtycnt == 0) {
 			if (db->db_level == 0) {
 				uint64_t *buf = db->db.db_data;
 				int i;
 
 				for (i = 0; i < db->db.db_size >> 3; i++) {
 					ASSERT(buf[i] == 0);
 				}
 			} else {
 				blkptr_t *bps = db->db.db_data;
 				ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
 				    db->db.db_size);
 				/*
 				 * We want to verify that all the blkptrs in the
 				 * indirect block are holes, but we may have
 				 * automatically set up a few fields for them.
 				 * We iterate through each blkptr and verify
 				 * they only have those fields set.
 				 */
 				for (int i = 0;
 				    i < db->db.db_size / sizeof (blkptr_t);
 				    i++) {
 					blkptr_t *bp = &bps[i];
 					ASSERT(ZIO_CHECKSUM_IS_ZERO(
 					    &bp->blk_cksum));
 					ASSERT(
 					    DVA_IS_EMPTY(&bp->blk_dva[0]) &&
 					    DVA_IS_EMPTY(&bp->blk_dva[1]) &&
 					    DVA_IS_EMPTY(&bp->blk_dva[2]));
 					ASSERT0(bp->blk_fill);
 					ASSERT0(bp->blk_pad[0]);
 					ASSERT0(bp->blk_pad[1]);
 					ASSERT(!BP_IS_EMBEDDED(bp));
 					ASSERT(BP_IS_HOLE(bp));
-					ASSERT0(bp->blk_phys_birth);
+					ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
 				}
 			}
 		}
 	}
 	DB_DNODE_EXIT(db);
 }
 #endif
 
 static void
 dbuf_clear_data(dmu_buf_impl_t *db)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	dbuf_evict_user(db);
 	ASSERT3P(db->db_buf, ==, NULL);
 	db->db.db_data = NULL;
 	if (db->db_state != DB_NOFILL) {
 		db->db_state = DB_UNCACHED;
 		DTRACE_SET_STATE(db, "clear data");
 	}
 }
 
 static void
 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(buf != NULL);
 
 	db->db_buf = buf;
 	ASSERT(buf->b_data != NULL);
 	db->db.db_data = buf->b_data;
 }
 
 static arc_buf_t *
 dbuf_alloc_arcbuf(dmu_buf_impl_t *db)
 {
 	spa_t *spa = db->db_objset->os_spa;
 
 	return (arc_alloc_buf(spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size));
 }
 
 /*
  * Loan out an arc_buf for read.  Return the loaned arc_buf.
  */
 arc_buf_t *
 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
 {
 	arc_buf_t *abuf;
 
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	mutex_enter(&db->db_mtx);
 	if (arc_released(db->db_buf) || zfs_refcount_count(&db->db_holds) > 1) {
 		int blksz = db->db.db_size;
 		spa_t *spa = db->db_objset->os_spa;
 
 		mutex_exit(&db->db_mtx);
 		abuf = arc_loan_buf(spa, B_FALSE, blksz);
 		memcpy(abuf->b_data, db->db.db_data, blksz);
 	} else {
 		abuf = db->db_buf;
 		arc_loan_inuse_buf(abuf, db);
 		db->db_buf = NULL;
 		dbuf_clear_data(db);
 		mutex_exit(&db->db_mtx);
 	}
 	return (abuf);
 }
 
 /*
  * Calculate which level n block references the data at the level 0 offset
  * provided.
  */
 uint64_t
 dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
 {
 	if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
 		/*
 		 * The level n blkid is equal to the level 0 blkid divided by
 		 * the number of level 0s in a level n block.
 		 *
 		 * The level 0 blkid is offset >> datablkshift =
 		 * offset / 2^datablkshift.
 		 *
 		 * The number of level 0s in a level n is the number of block
 		 * pointers in an indirect block, raised to the power of level.
 		 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
 		 * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
 		 *
 		 * Thus, the level n blkid is: offset /
 		 * ((2^datablkshift)*(2^(level*(indblkshift-SPA_BLKPTRSHIFT))))
 		 * = offset / 2^(datablkshift + level *
 		 *   (indblkshift - SPA_BLKPTRSHIFT))
 		 * = offset >> (datablkshift + level *
 		 *   (indblkshift - SPA_BLKPTRSHIFT))
 		 */
 
 		const unsigned exp = dn->dn_datablkshift +
 		    level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
 
 		if (exp >= 8 * sizeof (offset)) {
 			/* This only happens on the highest indirection level */
 			ASSERT3U(level, ==, dn->dn_nlevels - 1);
 			return (0);
 		}
 
 		ASSERT3U(exp, <, 8 * sizeof (offset));
 
 		return (offset >> exp);
 	} else {
 		ASSERT3U(offset, <, dn->dn_datablksz);
 		return (0);
 	}
 }
 
 /*
  * This function is used to lock the parent of the provided dbuf. This should be
  * used when modifying or reading db_blkptr.
  */
 db_lock_type_t
 dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, const void *tag)
 {
 	enum db_lock_type ret = DLT_NONE;
 	if (db->db_parent != NULL) {
 		rw_enter(&db->db_parent->db_rwlock, rw);
 		ret = DLT_PARENT;
 	} else if (dmu_objset_ds(db->db_objset) != NULL) {
 		rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw,
 		    tag);
 		ret = DLT_OBJSET;
 	}
 	/*
 	 * We only return a DLT_NONE lock when it's the top-most indirect block
 	 * of the meta-dnode of the MOS.
 	 */
 	return (ret);
 }
 
 /*
  * We need to pass the lock type in because it's possible that the block will
  * move from being the topmost indirect block in a dnode (and thus, have no
  * parent) to not the top-most via an indirection increase. This would cause a
  * panic if we didn't pass the lock type in.
  */
 void
 dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, const void *tag)
 {
 	if (type == DLT_PARENT)
 		rw_exit(&db->db_parent->db_rwlock);
 	else if (type == DLT_OBJSET)
 		rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag);
 }
 
 static void
 dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
     arc_buf_t *buf, void *vdb)
 {
 	(void) zb, (void) bp;
 	dmu_buf_impl_t *db = vdb;
 
 	mutex_enter(&db->db_mtx);
 	ASSERT3U(db->db_state, ==, DB_READ);
 	/*
 	 * All reads are synchronous, so we must have a hold on the dbuf
 	 */
 	ASSERT(zfs_refcount_count(&db->db_holds) > 0);
 	ASSERT(db->db_buf == NULL);
 	ASSERT(db->db.db_data == NULL);
 	if (buf == NULL) {
 		/* i/o error */
 		ASSERT(zio == NULL || zio->io_error != 0);
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT3P(db->db_buf, ==, NULL);
 		db->db_state = DB_UNCACHED;
 		DTRACE_SET_STATE(db, "i/o error");
 	} else if (db->db_level == 0 && db->db_freed_in_flight) {
 		/* freed in flight */
 		ASSERT(zio == NULL || zio->io_error == 0);
 		arc_release(buf, db);
 		memset(buf->b_data, 0, db->db.db_size);
 		arc_buf_freeze(buf);
 		db->db_freed_in_flight = FALSE;
 		dbuf_set_data(db, buf);
 		db->db_state = DB_CACHED;
 		DTRACE_SET_STATE(db, "freed in flight");
 	} else {
 		/* success */
 		ASSERT(zio == NULL || zio->io_error == 0);
 		dbuf_set_data(db, buf);
 		db->db_state = DB_CACHED;
 		DTRACE_SET_STATE(db, "successful read");
 	}
 	cv_broadcast(&db->db_changed);
 	dbuf_rele_and_unlock(db, NULL, B_FALSE);
 }
 
 /*
  * Shortcut for performing reads on bonus dbufs.  Returns
  * an error if we fail to verify the dnode associated with
  * a decrypted block. Otherwise success.
  */
 static int
 dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
 {
 	int bonuslen, max_bonuslen, err;
 
 	err = dbuf_read_verify_dnode_crypt(db, flags);
 	if (err)
 		return (err);
 
 	bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
 	max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(DB_DNODE_HELD(db));
 	ASSERT3U(bonuslen, <=, db->db.db_size);
 	db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP);
 	arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);
 	if (bonuslen < max_bonuslen)
 		memset(db->db.db_data, 0, max_bonuslen);
 	if (bonuslen)
 		memcpy(db->db.db_data, DN_BONUS(dn->dn_phys), bonuslen);
 	db->db_state = DB_CACHED;
 	DTRACE_SET_STATE(db, "bonus buffer filled");
 	return (0);
 }
 
 static void
 dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp)
 {
 	blkptr_t *bps = db->db.db_data;
 	uint32_t indbs = 1ULL << dn->dn_indblkshift;
 	int n_bps = indbs >> SPA_BLKPTRSHIFT;
 
 	for (int i = 0; i < n_bps; i++) {
 		blkptr_t *bp = &bps[i];
 
 		ASSERT3U(BP_GET_LSIZE(dbbp), ==, indbs);
 		BP_SET_LSIZE(bp, BP_GET_LEVEL(dbbp) == 1 ?
 		    dn->dn_datablksz : BP_GET_LSIZE(dbbp));
 		BP_SET_TYPE(bp, BP_GET_TYPE(dbbp));
 		BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1);
-		BP_SET_BIRTH(bp, dbbp->blk_birth, 0);
+		BP_SET_BIRTH(bp, BP_GET_LOGICAL_BIRTH(dbbp), 0);
 	}
 }
 
 /*
  * Handle reads on dbufs that are holes, if necessary.  This function
  * requires that the dbuf's mutex is held. Returns success (0) if action
  * was taken, ENOENT if no action was taken.
  */
 static int
 dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	int is_hole = bp == NULL || BP_IS_HOLE(bp);
 	/*
 	 * For level 0 blocks only, if the above check fails:
 	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
 	 * processes the delete record and clears the bp while we are waiting
 	 * for the dn_mtx (resulting in a "no" from block_freed).
 	 */
 	if (!is_hole && db->db_level == 0)
 		is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp);
 
 	if (is_hole) {
 		dbuf_set_data(db, dbuf_alloc_arcbuf(db));
 		memset(db->db.db_data, 0, db->db.db_size);
 
 		if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) &&
-		    bp->blk_birth != 0) {
+		    BP_GET_LOGICAL_BIRTH(bp) != 0) {
 			dbuf_handle_indirect_hole(db, dn, bp);
 		}
 		db->db_state = DB_CACHED;
 		DTRACE_SET_STATE(db, "hole read satisfied");
 		return (0);
 	}
 	return (ENOENT);
 }
 
 /*
  * This function ensures that, when doing a decrypting read of a block,
  * we make sure we have decrypted the dnode associated with it. We must do
  * this so that we ensure we are fully authenticating the checksum-of-MACs
  * tree from the root of the objset down to this block. Indirect blocks are
  * always verified against their secure checksum-of-MACs assuming that the
  * dnode containing them is correct. Now that we are doing a decrypting read,
  * we can be sure that the key is loaded and verify that assumption. This is
  * especially important considering that we always read encrypted dnode
  * blocks as raw data (without verifying their MACs) to start, and
  * decrypt / authenticate them when we need to read an encrypted bonus buffer.
  */
 static int
 dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
 {
 	int err = 0;
 	objset_t *os = db->db_objset;
 	arc_buf_t *dnode_abuf;
 	dnode_t *dn;
 	zbookmark_phys_t zb;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	if ((flags & DB_RF_NO_DECRYPT) != 0 ||
 	    !os->os_encrypted || os->os_raw_receive)
 		return (0);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL;
 
 	if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) {
 		DB_DNODE_EXIT(db);
 		return (0);
 	}
 
 	SET_BOOKMARK(&zb, dmu_objset_id(os),
 	    DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid);
 	err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE);
 
 	/*
 	 * An error code of EACCES tells us that the key is still not
 	 * available. This is ok if we are only reading authenticated
 	 * (and therefore non-encrypted) blocks.
 	 */
 	if (err == EACCES && ((db->db_blkid != DMU_BONUS_BLKID &&
 	    !DMU_OT_IS_ENCRYPTED(dn->dn_type)) ||
 	    (db->db_blkid == DMU_BONUS_BLKID &&
 	    !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))
 		err = 0;
 
 	DB_DNODE_EXIT(db);
 
 	return (err);
 }
 
 /*
  * Drops db_mtx and the parent lock specified by dblt and tag before
  * returning.
  */
 static int
 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
     db_lock_type_t dblt, const void *tag)
 {
 	dnode_t *dn;
 	zbookmark_phys_t zb;
 	uint32_t aflags = ARC_FLAG_NOWAIT;
 	int err, zio_flags;
 	blkptr_t bp, *bpp;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
 	ASSERT(db->db_buf == NULL);
 	ASSERT(db->db_parent == NULL ||
 	    RW_LOCK_HELD(&db->db_parent->db_rwlock));
 
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		err = dbuf_read_bonus(db, dn, flags);
 		goto early_unlock;
 	}
 
 	if (db->db_state == DB_UNCACHED) {
 		if (db->db_blkptr == NULL) {
 			bpp = NULL;
 		} else {
 			bp = *db->db_blkptr;
 			bpp = &bp;
 		}
 	} else {
 		dbuf_dirty_record_t *dr;
 
 		ASSERT3S(db->db_state, ==, DB_NOFILL);
 
 		/*
 		 * Block cloning: If we have a pending block clone,
 		 * we don't want to read the underlying block, but the content
 		 * of the block being cloned, so we have the most recent data.
 		 */
 		dr = list_head(&db->db_dirty_records);
 		if (dr == NULL || !dr->dt.dl.dr_brtwrite) {
 			err = EIO;
 			goto early_unlock;
 		}
 		bp = dr->dt.dl.dr_overridden_by;
 		bpp = &bp;
 	}
 
 	err = dbuf_read_hole(db, dn, bpp);
 	if (err == 0)
 		goto early_unlock;
 
 	ASSERT(bpp != NULL);
 
 	/*
 	 * Any attempt to read a redacted block should result in an error. This
 	 * will never happen under normal conditions, but can be useful for
 	 * debugging purposes.
 	 */
 	if (BP_IS_REDACTED(bpp)) {
 		ASSERT(dsl_dataset_feature_is_active(
 		    db->db_objset->os_dsl_dataset,
 		    SPA_FEATURE_REDACTED_DATASETS));
 		err = SET_ERROR(EIO);
 		goto early_unlock;
 	}
 
 	SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
 	    db->db.db_object, db->db_level, db->db_blkid);
 
 	/*
 	 * All bps of an encrypted os should have the encryption bit set.
 	 * If this is not true it indicates tampering and we report an error.
 	 */
 	if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) {
-		spa_log_error(db->db_objset->os_spa, &zb, &bpp->blk_birth);
+		spa_log_error(db->db_objset->os_spa, &zb,
+		    BP_GET_LOGICAL_BIRTH(bpp));
 		err = SET_ERROR(EIO);
 		goto early_unlock;
 	}
 
 	err = dbuf_read_verify_dnode_crypt(db, flags);
 	if (err != 0)
 		goto early_unlock;
 
 	DB_DNODE_EXIT(db);
 
 	db->db_state = DB_READ;
 	DTRACE_SET_STATE(db, "read issued");
 	mutex_exit(&db->db_mtx);
 
 	if (!DBUF_IS_CACHEABLE(db))
 		aflags |= ARC_FLAG_UNCACHED;
 	else if (dbuf_is_l2cacheable(db))
 		aflags |= ARC_FLAG_L2CACHE;
 
 	dbuf_add_ref(db, NULL);
 
 	zio_flags = (flags & DB_RF_CANFAIL) ?
 	    ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
 
 	if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
 		zio_flags |= ZIO_FLAG_RAW;
 	/*
 	 * The zio layer will copy the provided blkptr later, but we have our
 	 * own copy so that we can release the parent's rwlock. We have to
 	 * do that so that if dbuf_read_done is called synchronously (on
 	 * an l1 cache hit) we don't acquire the db_mtx while holding the
 	 * parent's rwlock, which would be a lock ordering violation.
 	 */
 	dmu_buf_unlock_parent(db, dblt, tag);
 	(void) arc_read(zio, db->db_objset->os_spa, bpp,
 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
 	    &aflags, &zb);
 	return (err);
 early_unlock:
 	DB_DNODE_EXIT(db);
 	mutex_exit(&db->db_mtx);
 	dmu_buf_unlock_parent(db, dblt, tag);
 	return (err);
 }
 
 /*
  * This is our just-in-time copy function.  It makes a copy of buffers that
  * have been modified in a previous transaction group before we access them in
  * the current active group.
  *
  * This function is used in three places: when we are dirtying a buffer for the
  * first time in a txg, when we are freeing a range in a dnode that includes
  * this buffer, and when we are accessing a buffer which was received compressed
  * and later referenced in a WRITE_BYREF record.
  *
  * Note that when we are called from dbuf_free_range() we do not put a hold on
  * the buffer, we just traverse the active dbuf list for the dnode.
  */
 static void
 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 {
 	dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db.db_data != NULL);
 	ASSERT(db->db_level == 0);
 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
 
 	if (dr == NULL ||
 	    (dr->dt.dl.dr_data !=
 	    ((db->db_blkid  == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
 		return;
 
 	/*
 	 * If the last dirty record for this dbuf has not yet synced
 	 * and its referencing the dbuf data, either:
 	 *	reset the reference to point to a new copy,
 	 * or (if there a no active holders)
 	 *	just null out the current db_data pointer.
 	 */
 	ASSERT3U(dr->dr_txg, >=, txg - 2);
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		dnode_t *dn = DB_DNODE(db);
 		int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
 		dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
 		arc_space_consume(bonuslen, ARC_SPACE_BONUS);
 		memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen);
 	} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
 		dnode_t *dn = DB_DNODE(db);
 		int size = arc_buf_size(db->db_buf);
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 		spa_t *spa = db->db_objset->os_spa;
 		enum zio_compress compress_type =
 		    arc_get_compression(db->db_buf);
 		uint8_t complevel = arc_get_complevel(db->db_buf);
 
 		if (arc_is_encrypted(db->db_buf)) {
 			boolean_t byteorder;
 			uint8_t salt[ZIO_DATA_SALT_LEN];
 			uint8_t iv[ZIO_DATA_IV_LEN];
 			uint8_t mac[ZIO_DATA_MAC_LEN];
 
 			arc_get_raw_params(db->db_buf, &byteorder, salt,
 			    iv, mac);
 			dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db,
 			    dmu_objset_id(dn->dn_objset), byteorder, salt, iv,
 			    mac, dn->dn_type, size, arc_buf_lsize(db->db_buf),
 			    compress_type, complevel);
 		} else if (compress_type != ZIO_COMPRESS_OFF) {
 			ASSERT3U(type, ==, ARC_BUFC_DATA);
 			dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
 			    size, arc_buf_lsize(db->db_buf), compress_type,
 			    complevel);
 		} else {
 			dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
 		}
 		memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size);
 	} else {
 		db->db_buf = NULL;
 		dbuf_clear_data(db);
 	}
 }
 
 int
 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 {
 	int err = 0;
 	boolean_t prefetch;
 	dnode_t *dn;
 
 	/*
 	 * We don't have to hold the mutex to check db_state because it
 	 * can't be freed while we have a hold on the buffer.
 	 */
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL;
 
 	mutex_enter(&db->db_mtx);
 	if (flags & DB_RF_PARTIAL_FIRST)
 		db->db_partial_read = B_TRUE;
 	else if (!(flags & DB_RF_PARTIAL_MORE))
 		db->db_partial_read = B_FALSE;
 	if (db->db_state == DB_CACHED) {
 		/*
 		 * Ensure that this block's dnode has been decrypted if
 		 * the caller has requested decrypted data.
 		 */
 		err = dbuf_read_verify_dnode_crypt(db, flags);
 
 		/*
 		 * If the arc buf is compressed or encrypted and the caller
 		 * requested uncompressed data, we need to untransform it
 		 * before returning. We also call arc_untransform() on any
 		 * unauthenticated blocks, which will verify their MAC if
 		 * the key is now available.
 		 */
 		if (err == 0 && db->db_buf != NULL &&
 		    (flags & DB_RF_NO_DECRYPT) == 0 &&
 		    (arc_is_encrypted(db->db_buf) ||
 		    arc_is_unauthenticated(db->db_buf) ||
 		    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
 			spa_t *spa = dn->dn_objset->os_spa;
 			zbookmark_phys_t zb;
 
 			SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
 			    db->db.db_object, db->db_level, db->db_blkid);
 			dbuf_fix_old_data(db, spa_syncing_txg(spa));
 			err = arc_untransform(db->db_buf, spa, &zb, B_FALSE);
 			dbuf_set_data(db, db->db_buf);
 		}
 		mutex_exit(&db->db_mtx);
 		if (err == 0 && prefetch) {
 			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
 			    B_FALSE, flags & DB_RF_HAVESTRUCT);
 		}
 		DB_DNODE_EXIT(db);
 		DBUF_STAT_BUMP(hash_hits);
 	} else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) {
 		boolean_t need_wait = B_FALSE;
 
 		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
 
 		if (zio == NULL && (db->db_state == DB_NOFILL ||
 		    (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
 			spa_t *spa = dn->dn_objset->os_spa;
 			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 			need_wait = B_TRUE;
 		}
 		err = dbuf_read_impl(db, zio, flags, dblt, FTAG);
 		/*
 		 * dbuf_read_impl has dropped db_mtx and our parent's rwlock
 		 * for us
 		 */
 		if (!err && prefetch) {
 			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
 			    db->db_state != DB_CACHED,
 			    flags & DB_RF_HAVESTRUCT);
 		}
 
 		DB_DNODE_EXIT(db);
 		DBUF_STAT_BUMP(hash_misses);
 
 		/*
 		 * If we created a zio_root we must execute it to avoid
 		 * leaking it, even if it isn't attached to any work due
 		 * to an error in dbuf_read_impl().
 		 */
 		if (need_wait) {
 			if (err == 0)
 				err = zio_wait(zio);
 			else
 				VERIFY0(zio_wait(zio));
 		}
 	} else {
 		/*
 		 * Another reader came in while the dbuf was in flight
 		 * between UNCACHED and CACHED.  Either a writer will finish
 		 * writing the buffer (sending the dbuf to CACHED) or the
 		 * first reader's request will reach the read_done callback
 		 * and send the dbuf to CACHED.  Otherwise, a failure
 		 * occurred and the dbuf went to UNCACHED.
 		 */
 		mutex_exit(&db->db_mtx);
 		if (prefetch) {
 			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
 			    B_TRUE, flags & DB_RF_HAVESTRUCT);
 		}
 		DB_DNODE_EXIT(db);
 		DBUF_STAT_BUMP(hash_misses);
 
 		/* Skip the wait per the caller's request. */
 		if ((flags & DB_RF_NEVERWAIT) == 0) {
 			mutex_enter(&db->db_mtx);
 			while (db->db_state == DB_READ ||
 			    db->db_state == DB_FILL) {
 				ASSERT(db->db_state == DB_READ ||
 				    (flags & DB_RF_HAVESTRUCT) == 0);
 				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
 				    db, zio_t *, zio);
 				cv_wait(&db->db_changed, &db->db_mtx);
 			}
 			if (db->db_state == DB_UNCACHED)
 				err = SET_ERROR(EIO);
 			mutex_exit(&db->db_mtx);
 		}
 	}
 
 	return (err);
 }
 
 static void
 dbuf_noread(dmu_buf_impl_t *db)
 {
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	mutex_enter(&db->db_mtx);
 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
 		cv_wait(&db->db_changed, &db->db_mtx);
 	if (db->db_state == DB_UNCACHED) {
 		ASSERT(db->db_buf == NULL);
 		ASSERT(db->db.db_data == NULL);
 		dbuf_set_data(db, dbuf_alloc_arcbuf(db));
 		db->db_state = DB_FILL;
 		DTRACE_SET_STATE(db, "assigning filled buffer");
 	} else if (db->db_state == DB_NOFILL) {
 		dbuf_clear_data(db);
 	} else {
 		ASSERT3U(db->db_state, ==, DB_CACHED);
 	}
 	mutex_exit(&db->db_mtx);
 }
 
 void
 dbuf_unoverride(dbuf_dirty_record_t *dr)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
 	uint64_t txg = dr->dr_txg;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	/*
 	 * This assert is valid because dmu_sync() expects to be called by
 	 * a zilog's get_data while holding a range lock.  This call only
 	 * comes from dbuf_dirty() callers who must also hold a range lock.
 	 */
 	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
 	ASSERT(db->db_level == 0);
 
 	if (db->db_blkid == DMU_BONUS_BLKID ||
 	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
 		return;
 
 	ASSERT(db->db_data_pending != dr);
 
 	/* free this block */
 	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
 		zio_free(db->db_objset->os_spa, txg, bp);
 
 	if (dr->dt.dl.dr_brtwrite) {
 		ASSERT0P(dr->dt.dl.dr_data);
 		dr->dt.dl.dr_data = db->db_buf;
 	}
 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 	dr->dt.dl.dr_nopwrite = B_FALSE;
 	dr->dt.dl.dr_brtwrite = B_FALSE;
 	dr->dt.dl.dr_has_raw_params = B_FALSE;
 
 	/*
 	 * Release the already-written buffer, so we leave it in
 	 * a consistent dirty state.  Note that all callers are
 	 * modifying the buffer, so they will immediately do
 	 * another (redundant) arc_release().  Therefore, leave
 	 * the buf thawed to save the effort of freezing &
 	 * immediately re-thawing it.
 	 */
 	if (dr->dt.dl.dr_data)
 		arc_release(dr->dt.dl.dr_data, db);
 }
 
 /*
  * Evict (if its unreferenced) or clear (if its referenced) any level-0
  * data blocks in the free range, so that any future readers will find
  * empty blocks.
  */
 void
 dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db_search;
 	dmu_buf_impl_t *db, *db_next;
 	uint64_t txg = tx->tx_txg;
 	avl_index_t where;
 	dbuf_dirty_record_t *dr;
 
 	if (end_blkid > dn->dn_maxblkid &&
 	    !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))
 		end_blkid = dn->dn_maxblkid;
 	dprintf_dnode(dn, "start=%llu end=%llu\n", (u_longlong_t)start_blkid,
 	    (u_longlong_t)end_blkid);
 
 	db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
 	db_search->db_level = 0;
 	db_search->db_blkid = start_blkid;
 	db_search->db_state = DB_SEARCH;
 
 	mutex_enter(&dn->dn_dbufs_mtx);
 	db = avl_find(&dn->dn_dbufs, db_search, &where);
 	ASSERT3P(db, ==, NULL);
 
 	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
 
 	for (; db != NULL; db = db_next) {
 		db_next = AVL_NEXT(&dn->dn_dbufs, db);
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
 		if (db->db_level != 0 || db->db_blkid > end_blkid) {
 			break;
 		}
 		ASSERT3U(db->db_blkid, >=, start_blkid);
 
 		/* found a level 0 buffer in the range */
 		mutex_enter(&db->db_mtx);
 		if (dbuf_undirty(db, tx)) {
 			/* mutex has been dropped and dbuf destroyed */
 			continue;
 		}
 
 		if (db->db_state == DB_UNCACHED ||
 		    db->db_state == DB_NOFILL ||
 		    db->db_state == DB_EVICTING) {
 			ASSERT(db->db.db_data == NULL);
 			mutex_exit(&db->db_mtx);
 			continue;
 		}
 		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
 			/* will be handled in dbuf_read_done or dbuf_rele */
 			db->db_freed_in_flight = TRUE;
 			mutex_exit(&db->db_mtx);
 			continue;
 		}
 		if (zfs_refcount_count(&db->db_holds) == 0) {
 			ASSERT(db->db_buf);
 			dbuf_destroy(db);
 			continue;
 		}
 		/* The dbuf is referenced */
 
 		dr = list_head(&db->db_dirty_records);
 		if (dr != NULL) {
 			if (dr->dr_txg == txg) {
 				/*
 				 * This buffer is "in-use", re-adjust the file
 				 * size to reflect that this buffer may
 				 * contain new data when we sync.
 				 */
 				if (db->db_blkid != DMU_SPILL_BLKID &&
 				    db->db_blkid > dn->dn_maxblkid)
 					dn->dn_maxblkid = db->db_blkid;
 				dbuf_unoverride(dr);
 			} else {
 				/*
 				 * This dbuf is not dirty in the open context.
 				 * Either uncache it (if its not referenced in
 				 * the open context) or reset its contents to
 				 * empty.
 				 */
 				dbuf_fix_old_data(db, txg);
 			}
 		}
 		/* clear the contents if its cached */
 		if (db->db_state == DB_CACHED) {
 			ASSERT(db->db.db_data != NULL);
 			arc_release(db->db_buf, db);
 			rw_enter(&db->db_rwlock, RW_WRITER);
 			memset(db->db.db_data, 0, db->db.db_size);
 			rw_exit(&db->db_rwlock);
 			arc_buf_freeze(db->db_buf);
 		}
 
 		mutex_exit(&db->db_mtx);
 	}
 
 	mutex_exit(&dn->dn_dbufs_mtx);
 	kmem_free(db_search, sizeof (dmu_buf_impl_t));
 }
 
 void
 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
 {
 	arc_buf_t *buf, *old_buf;
 	dbuf_dirty_record_t *dr;
 	int osize = db->db.db_size;
 	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 	dnode_t *dn;
 
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	/*
 	 * XXX we should be doing a dbuf_read, checking the return
 	 * value and returning that up to our callers
 	 */
 	dmu_buf_will_dirty(&db->db, tx);
 
 	/* create the data buffer for the new block */
 	buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);
 
 	/* copy old block data to the new block */
 	old_buf = db->db_buf;
 	memcpy(buf->b_data, old_buf->b_data, MIN(osize, size));
 	/* zero the remainder */
 	if (size > osize)
 		memset((uint8_t *)buf->b_data + osize, 0, size - osize);
 
 	mutex_enter(&db->db_mtx);
 	dbuf_set_data(db, buf);
 	arc_buf_destroy(old_buf, db);
 	db->db.db_size = size;
 
 	dr = list_head(&db->db_dirty_records);
 	/* dirty record added by dmu_buf_will_dirty() */
 	VERIFY(dr != NULL);
 	if (db->db_level == 0)
 		dr->dt.dl.dr_data = buf;
 	ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
 	ASSERT3U(dr->dr_accounted, ==, osize);
 	dr->dr_accounted = size;
 	mutex_exit(&db->db_mtx);
 
 	dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
 	DB_DNODE_EXIT(db);
 }
 
 void
 dbuf_release_bp(dmu_buf_impl_t *db)
 {
 	objset_t *os __maybe_unused = db->db_objset;
 
 	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
 	ASSERT(arc_released(os->os_phys_buf) ||
 	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
 	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
 
 	(void) arc_release(db->db_buf, db);
 }
 
 /*
  * We already have a dirty record for this TXG, and we are being
  * dirtied again.
  */
 static void
 dbuf_redirty(dbuf_dirty_record_t *dr)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
 		/*
 		 * If this buffer has already been written out,
 		 * we now need to reset its state.
 		 */
 		dbuf_unoverride(dr);
 		if (db->db.db_object != DMU_META_DNODE_OBJECT &&
 		    db->db_state != DB_NOFILL) {
 			/* Already released on initial dirty, so just thaw. */
 			ASSERT(arc_released(db->db_buf));
 			arc_buf_thaw(db->db_buf);
 		}
 	}
 }
 
 dbuf_dirty_record_t *
 dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
 {
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	IMPLY(dn->dn_objset->os_raw_receive, dn->dn_maxblkid >= blkid);
 	dnode_new_blkid(dn, blkid, tx, B_TRUE, B_FALSE);
 	ASSERT(dn->dn_maxblkid >= blkid);
 
 	dbuf_dirty_record_t *dr = kmem_zalloc(sizeof (*dr), KM_SLEEP);
 	list_link_init(&dr->dr_dirty_node);
 	list_link_init(&dr->dr_dbuf_node);
 	dr->dr_dnode = dn;
 	dr->dr_txg = tx->tx_txg;
 	dr->dt.dll.dr_blkid = blkid;
 	dr->dr_accounted = dn->dn_datablksz;
 
 	/*
 	 * There should not be any dbuf for the block that we're dirtying.
 	 * Otherwise the buffer contents could be inconsistent between the
 	 * dbuf and the lightweight dirty record.
 	 */
 	ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid,
 	    NULL));
 
 	mutex_enter(&dn->dn_mtx);
 	int txgoff = tx->tx_txg & TXG_MASK;
 	if (dn->dn_free_ranges[txgoff] != NULL) {
 		range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1);
 	}
 
 	if (dn->dn_nlevels == 1) {
 		ASSERT3U(blkid, <, dn->dn_nblkptr);
 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
 		mutex_exit(&dn->dn_mtx);
 		rw_exit(&dn->dn_struct_rwlock);
 		dnode_setdirty(dn, tx);
 	} else {
 		mutex_exit(&dn->dn_mtx);
 
 		int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 		dmu_buf_impl_t *parent_db = dbuf_hold_level(dn,
 		    1, blkid >> epbs, FTAG);
 		rw_exit(&dn->dn_struct_rwlock);
 		if (parent_db == NULL) {
 			kmem_free(dr, sizeof (*dr));
 			return (NULL);
 		}
 		int err = dbuf_read(parent_db, NULL,
 		    (DB_RF_NOPREFETCH | DB_RF_CANFAIL));
 		if (err != 0) {
 			dbuf_rele(parent_db, FTAG);
 			kmem_free(dr, sizeof (*dr));
 			return (NULL);
 		}
 
 		dbuf_dirty_record_t *parent_dr = dbuf_dirty(parent_db, tx);
 		dbuf_rele(parent_db, FTAG);
 		mutex_enter(&parent_dr->dt.di.dr_mtx);
 		ASSERT3U(parent_dr->dr_txg, ==, tx->tx_txg);
 		list_insert_tail(&parent_dr->dt.di.dr_children, dr);
 		mutex_exit(&parent_dr->dt.di.dr_mtx);
 		dr->dr_parent = parent_dr;
 	}
 
 	dmu_objset_willuse_space(dn->dn_objset, dr->dr_accounted, tx);
 
 	return (dr);
 }
 
 dbuf_dirty_record_t *
 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	objset_t *os;
 	dbuf_dirty_record_t *dr, *dr_next, *dr_head;
 	int txgoff = tx->tx_txg & TXG_MASK;
 	boolean_t drop_struct_rwlock = B_FALSE;
 
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 	DMU_TX_DIRTY_BUF(tx, db);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	/*
 	 * Shouldn't dirty a regular buffer in syncing context.  Private
 	 * objects may be dirtied in syncing context, but only if they
 	 * were already pre-dirtied in open context.
 	 */
 #ifdef ZFS_DEBUG
 	if (dn->dn_objset->os_dsl_dataset != NULL) {
 		rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
 		    RW_READER, FTAG);
 	}
 	ASSERT(!dmu_tx_is_syncing(tx) ||
 	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
 	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
 	    dn->dn_objset->os_dsl_dataset == NULL);
 	if (dn->dn_objset->os_dsl_dataset != NULL)
 		rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);
 #endif
 	/*
 	 * We make this assert for private objects as well, but after we
 	 * check if we're already dirty.  They are allowed to re-dirty
 	 * in syncing context.
 	 */
 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
 
 	mutex_enter(&db->db_mtx);
 	/*
 	 * XXX make this true for indirects too?  The problem is that
 	 * transactions created with dmu_tx_create_assigned() from
 	 * syncing context don't bother holding ahead.
 	 */
 	ASSERT(db->db_level != 0 ||
 	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
 	    db->db_state == DB_NOFILL);
 
 	mutex_enter(&dn->dn_mtx);
 	dnode_set_dirtyctx(dn, tx, db);
 	if (tx->tx_txg > dn->dn_dirty_txg)
 		dn->dn_dirty_txg = tx->tx_txg;
 	mutex_exit(&dn->dn_mtx);
 
 	if (db->db_blkid == DMU_SPILL_BLKID)
 		dn->dn_have_spill = B_TRUE;
 
 	/*
 	 * If this buffer is already dirty, we're done.
 	 */
 	dr_head = list_head(&db->db_dirty_records);
 	ASSERT(dr_head == NULL || dr_head->dr_txg <= tx->tx_txg ||
 	    db->db.db_object == DMU_META_DNODE_OBJECT);
 	dr_next = dbuf_find_dirty_lte(db, tx->tx_txg);
 	if (dr_next && dr_next->dr_txg == tx->tx_txg) {
 		DB_DNODE_EXIT(db);
 
 		dbuf_redirty(dr_next);
 		mutex_exit(&db->db_mtx);
 		return (dr_next);
 	}
 
 	/*
 	 * Only valid if not already dirty.
 	 */
 	ASSERT(dn->dn_object == 0 ||
 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
 
 	ASSERT3U(dn->dn_nlevels, >, db->db_level);
 
 	/*
 	 * We should only be dirtying in syncing context if it's the
 	 * mos or we're initializing the os or it's a special object.
 	 * However, we are allowed to dirty in syncing context provided
 	 * we already dirtied it in open context.  Hence we must make
 	 * this assertion only if we're not already dirty.
 	 */
 	os = dn->dn_objset;
 	VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa));
 #ifdef ZFS_DEBUG
 	if (dn->dn_objset->os_dsl_dataset != NULL)
 		rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
 	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
 	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
 	if (dn->dn_objset->os_dsl_dataset != NULL)
 		rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
 #endif
 	ASSERT(db->db.db_size != 0);
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
 	if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
 		dmu_objset_willuse_space(os, db->db.db_size, tx);
 	}
 
 	/*
 	 * If this buffer is dirty in an old transaction group we need
 	 * to make a copy of it so that the changes we make in this
 	 * transaction group won't leak out when we sync the older txg.
 	 */
 	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
 	list_link_init(&dr->dr_dirty_node);
 	list_link_init(&dr->dr_dbuf_node);
 	dr->dr_dnode = dn;
 	if (db->db_level == 0) {
 		void *data_old = db->db_buf;
 
 		if (db->db_state != DB_NOFILL) {
 			if (db->db_blkid == DMU_BONUS_BLKID) {
 				dbuf_fix_old_data(db, tx->tx_txg);
 				data_old = db->db.db_data;
 			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
 				/*
 				 * Release the data buffer from the cache so
 				 * that we can modify it without impacting
 				 * possible other users of this cached data
 				 * block.  Note that indirect blocks and
 				 * private objects are not released until the
 				 * syncing state (since they are only modified
 				 * then).
 				 */
 				arc_release(db->db_buf, db);
 				dbuf_fix_old_data(db, tx->tx_txg);
 				data_old = db->db_buf;
 			}
 			ASSERT(data_old != NULL);
 		}
 		dr->dt.dl.dr_data = data_old;
 	} else {
 		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_NOLOCKDEP, NULL);
 		list_create(&dr->dt.di.dr_children,
 		    sizeof (dbuf_dirty_record_t),
 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
 	}
 	if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
 		dr->dr_accounted = db->db.db_size;
 	}
 	dr->dr_dbuf = db;
 	dr->dr_txg = tx->tx_txg;
 	list_insert_before(&db->db_dirty_records, dr_next, dr);
 
 	/*
 	 * We could have been freed_in_flight between the dbuf_noread
 	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
 	 * happened after the free.
 	 */
 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    db->db_blkid != DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		if (dn->dn_free_ranges[txgoff] != NULL) {
 			range_tree_clear(dn->dn_free_ranges[txgoff],
 			    db->db_blkid, 1);
 		}
 		mutex_exit(&dn->dn_mtx);
 		db->db_freed_in_flight = FALSE;
 	}
 
 	/*
 	 * This buffer is now part of this txg
 	 */
 	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
 	db->db_dirtycnt += 1;
 	ASSERT3U(db->db_dirtycnt, <=, 3);
 
 	mutex_exit(&db->db_mtx);
 
 	if (db->db_blkid == DMU_BONUS_BLKID ||
 	    db->db_blkid == DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		ASSERT(!list_link_active(&dr->dr_dirty_node));
 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
 		mutex_exit(&dn->dn_mtx);
 		dnode_setdirty(dn, tx);
 		DB_DNODE_EXIT(db);
 		return (dr);
 	}
 
 	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		drop_struct_rwlock = B_TRUE;
 	}
 
 	/*
 	 * If we are overwriting a dedup BP, then unless it is snapshotted,
 	 * when we get to syncing context we will need to decrement its
 	 * refcount in the DDT.  Prefetch the relevant DDT block so that
 	 * syncing context won't have to wait for the i/o.
 	 */
 	if (db->db_blkptr != NULL) {
 		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
 		ddt_prefetch(os->os_spa, db->db_blkptr);
 		dmu_buf_unlock_parent(db, dblt, FTAG);
 	}
 
 	/*
 	 * We need to hold the dn_struct_rwlock to make this assertion,
 	 * because it protects dn_phys / dn_next_nlevels from changing.
 	 */
 	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
 	    dn->dn_phys->dn_nlevels > db->db_level ||
 	    dn->dn_next_nlevels[txgoff] > db->db_level ||
 	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
 	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
 
 
 	if (db->db_level == 0) {
 		ASSERT(!db->db_objset->os_raw_receive ||
 		    dn->dn_maxblkid >= db->db_blkid);
 		dnode_new_blkid(dn, db->db_blkid, tx,
 		    drop_struct_rwlock, B_FALSE);
 		ASSERT(dn->dn_maxblkid >= db->db_blkid);
 	}
 
 	if (db->db_level+1 < dn->dn_nlevels) {
 		dmu_buf_impl_t *parent = db->db_parent;
 		dbuf_dirty_record_t *di;
 		int parent_held = FALSE;
 
 		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
 			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 			parent = dbuf_hold_level(dn, db->db_level + 1,
 			    db->db_blkid >> epbs, FTAG);
 			ASSERT(parent != NULL);
 			parent_held = TRUE;
 		}
 		if (drop_struct_rwlock)
 			rw_exit(&dn->dn_struct_rwlock);
 		ASSERT3U(db->db_level + 1, ==, parent->db_level);
 		di = dbuf_dirty(parent, tx);
 		if (parent_held)
 			dbuf_rele(parent, FTAG);
 
 		mutex_enter(&db->db_mtx);
 		/*
 		 * Since we've dropped the mutex, it's possible that
 		 * dbuf_undirty() might have changed this out from under us.
 		 */
 		if (list_head(&db->db_dirty_records) == dr ||
 		    dn->dn_object == DMU_META_DNODE_OBJECT) {
 			mutex_enter(&di->dt.di.dr_mtx);
 			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
 			ASSERT(!list_link_active(&dr->dr_dirty_node));
 			list_insert_tail(&di->dt.di.dr_children, dr);
 			mutex_exit(&di->dt.di.dr_mtx);
 			dr->dr_parent = di;
 		}
 		mutex_exit(&db->db_mtx);
 	} else {
 		ASSERT(db->db_level + 1 == dn->dn_nlevels);
 		ASSERT(db->db_blkid < dn->dn_nblkptr);
 		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		ASSERT(!list_link_active(&dr->dr_dirty_node));
 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
 		mutex_exit(&dn->dn_mtx);
 		if (drop_struct_rwlock)
 			rw_exit(&dn->dn_struct_rwlock);
 	}
 
 	dnode_setdirty(dn, tx);
 	DB_DNODE_EXIT(db);
 	return (dr);
 }
 
 static void
 dbuf_undirty_bonus(dbuf_dirty_record_t *dr)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 
 	if (dr->dt.dl.dr_data != db->db.db_data) {
 		struct dnode *dn = dr->dr_dnode;
 		int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
 
 		kmem_free(dr->dt.dl.dr_data, max_bonuslen);
 		arc_space_return(max_bonuslen, ARC_SPACE_BONUS);
 	}
 	db->db_data_pending = NULL;
 	ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
 	list_remove(&db->db_dirty_records, dr);
 	if (dr->dr_dbuf->db_level != 0) {
 		mutex_destroy(&dr->dt.di.dr_mtx);
 		list_destroy(&dr->dt.di.dr_children);
 	}
 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
 	ASSERT3U(db->db_dirtycnt, >, 0);
 	db->db_dirtycnt -= 1;
 }
 
 /*
  * Undirty a buffer in the transaction group referenced by the given
  * transaction.  Return whether this evicted the dbuf.
  */
 boolean_t
 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	uint64_t txg = tx->tx_txg;
 	boolean_t brtwrite;
 
 	ASSERT(txg != 0);
 
 	/*
 	 * Due to our use of dn_nlevels below, this can only be called
 	 * in open context, unless we are operating on the MOS.
 	 * From syncing context, dn_nlevels may be different from the
 	 * dn_nlevels used when dbuf was dirtied.
 	 */
 	ASSERT(db->db_objset ==
 	    dmu_objset_pool(db->db_objset)->dp_meta_objset ||
 	    txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT0(db->db_level);
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	/*
 	 * If this buffer is not dirty, we're done.
 	 */
 	dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg);
 	if (dr == NULL)
 		return (B_FALSE);
 	ASSERT(dr->dr_dbuf == db);
 
 	brtwrite = dr->dt.dl.dr_brtwrite;
 	if (brtwrite) {
 		/*
 		 * We are freeing a block that we cloned in the same
 		 * transaction group.
 		 */
 		brt_pending_remove(dmu_objset_spa(db->db_objset),
 		    &dr->dt.dl.dr_overridden_by, tx);
 	}
 
 	dnode_t *dn = dr->dr_dnode;
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
 	ASSERT(db->db.db_size != 0);
 
 	dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
 	    dr->dr_accounted, txg);
 
 	list_remove(&db->db_dirty_records, dr);
 
 	/*
 	 * Note that there are three places in dbuf_dirty()
 	 * where this dirty record may be put on a list.
 	 * Make sure to do a list_remove corresponding to
 	 * every one of those list_insert calls.
 	 */
 	if (dr->dr_parent) {
 		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
 		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
 		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
 	} else if (db->db_blkid == DMU_SPILL_BLKID ||
 	    db->db_level + 1 == dn->dn_nlevels) {
 		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
 		mutex_exit(&dn->dn_mtx);
 	}
 
 	if (db->db_state != DB_NOFILL && !brtwrite) {
 		dbuf_unoverride(dr);
 
 		ASSERT(db->db_buf != NULL);
 		ASSERT(dr->dt.dl.dr_data != NULL);
 		if (dr->dt.dl.dr_data != db->db_buf)
 			arc_buf_destroy(dr->dt.dl.dr_data, db);
 	}
 
 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
 
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 
 	if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
 		ASSERT(db->db_state == DB_NOFILL || brtwrite ||
 		    arc_released(db->db_buf));
 		dbuf_destroy(db);
 		return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 static void
 dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	boolean_t undirty = B_FALSE;
 
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 
 	/*
 	 * Quick check for dirtiness.  For already dirty blocks, this
 	 * reduces runtime of this function by >90%, and overall performance
 	 * by 50% for some workloads (e.g. file deletion with indirect blocks
 	 * cached).
 	 */
 	mutex_enter(&db->db_mtx);
 
 	if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) {
 		dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
 		/*
 		 * It's possible that it is already dirty but not cached,
 		 * because there are some calls to dbuf_dirty() that don't
 		 * go through dmu_buf_will_dirty().
 		 */
 		if (dr != NULL) {
 			if (dr->dt.dl.dr_brtwrite) {
 				/*
 				 * Block cloning: If we are dirtying a cloned
 				 * block, we cannot simply redirty it, because
 				 * this dr has no data associated with it.
 				 * We will go through a full undirtying below,
 				 * before dirtying it again.
 				 */
 				undirty = B_TRUE;
 			} else {
 				/* This dbuf is already dirty and cached. */
 				dbuf_redirty(dr);
 				mutex_exit(&db->db_mtx);
 				return;
 			}
 		}
 	}
 	mutex_exit(&db->db_mtx);
 
 	DB_DNODE_ENTER(db);
 	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
 		flags |= DB_RF_HAVESTRUCT;
 	DB_DNODE_EXIT(db);
 
 	/*
 	 * Block cloning: Do the dbuf_read() before undirtying the dbuf, as we
 	 * want to make sure dbuf_read() will read the pending cloned block and
 	 * not the uderlying block that is being replaced. dbuf_undirty() will
 	 * do dbuf_unoverride(), so we will end up with cloned block content,
 	 * without overridden BP.
 	 */
 	(void) dbuf_read(db, NULL, flags);
 	if (undirty) {
 		mutex_enter(&db->db_mtx);
 		VERIFY(!dbuf_undirty(db, tx));
 		mutex_exit(&db->db_mtx);
 	}
 	(void) dbuf_dirty(db, tx);
 }
 
 void
 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	dmu_buf_will_dirty_impl(db_fake,
 	    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
 }
 
 boolean_t
 dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dbuf_dirty_record_t *dr;
 
 	mutex_enter(&db->db_mtx);
 	dr = dbuf_find_dirty_eq(db, tx->tx_txg);
 	mutex_exit(&db->db_mtx);
 	return (dr != NULL);
 }
 
 void
 dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	/*
 	 * Block cloning: We are going to clone into this block, so undirty
 	 * modifications done to this block so far in this txg. This includes
 	 * writes and clones into this block.
 	 */
 	mutex_enter(&db->db_mtx);
 	DBUF_VERIFY(db);
 	VERIFY(!dbuf_undirty(db, tx));
 	ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg));
 	if (db->db_buf != NULL) {
 		arc_buf_destroy(db->db_buf, db);
 		db->db_buf = NULL;
 		dbuf_clear_data(db);
 	}
 
 	db->db_state = DB_NOFILL;
 	DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone");
 
 	DBUF_VERIFY(db);
 	mutex_exit(&db->db_mtx);
 
 	dbuf_noread(db);
 	(void) dbuf_dirty(db, tx);
 }
 
 void
 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	mutex_enter(&db->db_mtx);
 	db->db_state = DB_NOFILL;
 	DTRACE_SET_STATE(db, "allocating NOFILL buffer");
 	mutex_exit(&db->db_mtx);
 
 	dbuf_noread(db);
 	(void) dbuf_dirty(db, tx);
 }
 
 void
 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(db->db_level == 0);
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 
 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
 	    dmu_tx_private_ok(tx));
 
 	mutex_enter(&db->db_mtx);
 	if (db->db_state == DB_NOFILL) {
 		/*
 		 * Block cloning: We will be completely overwriting a block
 		 * cloned in this transaction group, so let's undirty the
 		 * pending clone and mark the block as uncached. This will be
 		 * as if the clone was never done.  But if the fill can fail
 		 * we should have a way to return back to the cloned data.
 		 */
 		if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
 			mutex_exit(&db->db_mtx);
 			dmu_buf_will_dirty(db_fake, tx);
 			return;
 		}
 		VERIFY(!dbuf_undirty(db, tx));
 		db->db_state = DB_UNCACHED;
 	}
 	mutex_exit(&db->db_mtx);
 
 	dbuf_noread(db);
 	(void) dbuf_dirty(db, tx);
 }
 
 /*
  * This function is effectively the same as dmu_buf_will_dirty(), but
  * indicates the caller expects raw encrypted data in the db, and provides
  * the crypt params (byteorder, salt, iv, mac) which should be stored in the
  * blkptr_t when this dbuf is written.  This is only used for blocks of
  * dnodes, during raw receive.
  */
 void
 dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dbuf_dirty_record_t *dr;
 
 	/*
 	 * dr_has_raw_params is only processed for blocks of dnodes
 	 * (see dbuf_sync_dnode_leaf_crypt()).
 	 */
 	ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
 	ASSERT3U(db->db_level, ==, 0);
 	ASSERT(db->db_objset->os_raw_receive);
 
 	dmu_buf_will_dirty_impl(db_fake,
 	    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx);
 
 	dr = dbuf_find_dirty_eq(db, tx->tx_txg);
 
 	ASSERT3P(dr, !=, NULL);
 
 	dr->dt.dl.dr_has_raw_params = B_TRUE;
 	dr->dt.dl.dr_byteorder = byteorder;
 	memcpy(dr->dt.dl.dr_salt, salt, ZIO_DATA_SALT_LEN);
 	memcpy(dr->dt.dl.dr_iv, iv, ZIO_DATA_IV_LEN);
 	memcpy(dr->dt.dl.dr_mac, mac, ZIO_DATA_MAC_LEN);
 }
 
 static void
 dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	struct dirty_leaf *dl;
 	dbuf_dirty_record_t *dr;
 
 	dr = list_head(&db->db_dirty_records);
 	ASSERT3P(dr, !=, NULL);
 	ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
 	dl = &dr->dt.dl;
 	dl->dr_overridden_by = *bp;
 	dl->dr_override_state = DR_OVERRIDDEN;
-	dl->dr_overridden_by.blk_birth = dr->dr_txg;
+	BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
 }
 
 boolean_t
 dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
 {
 	(void) tx;
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
 	mutex_enter(&db->db_mtx);
 	DBUF_VERIFY(db);
 
 	if (db->db_state == DB_FILL) {
 		if (db->db_level == 0 && db->db_freed_in_flight) {
 			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 			/* we were freed while filling */
 			/* XXX dbuf_undirty? */
 			memset(db->db.db_data, 0, db->db.db_size);
 			db->db_freed_in_flight = FALSE;
 			db->db_state = DB_CACHED;
 			DTRACE_SET_STATE(db,
 			    "fill done handling freed in flight");
 			failed = B_FALSE;
 		} else if (failed) {
 			VERIFY(!dbuf_undirty(db, tx));
 			db->db_buf = NULL;
 			dbuf_clear_data(db);
 			DTRACE_SET_STATE(db, "fill failed");
 		} else {
 			db->db_state = DB_CACHED;
 			DTRACE_SET_STATE(db, "fill done");
 		}
 		cv_broadcast(&db->db_changed);
 	} else {
 		db->db_state = DB_CACHED;
 		failed = B_FALSE;
 	}
 	mutex_exit(&db->db_mtx);
 	return (failed);
 }
 
 void
 dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
     bp_embedded_type_t etype, enum zio_compress comp,
     int uncompressed_size, int compressed_size, int byteorder,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
 	struct dirty_leaf *dl;
 	dmu_object_type_t type;
 	dbuf_dirty_record_t *dr;
 
 	if (etype == BP_EMBEDDED_TYPE_DATA) {
 		ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
 		    SPA_FEATURE_EMBEDDED_DATA));
 	}
 
 	DB_DNODE_ENTER(db);
 	type = DB_DNODE(db)->dn_type;
 	DB_DNODE_EXIT(db);
 
 	ASSERT0(db->db_level);
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
 	dmu_buf_will_not_fill(dbuf, tx);
 
 	dr = list_head(&db->db_dirty_records);
 	ASSERT3P(dr, !=, NULL);
 	ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
 	dl = &dr->dt.dl;
 	encode_embedded_bp_compressed(&dl->dr_overridden_by,
 	    data, comp, uncompressed_size, compressed_size);
 	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
 	BP_SET_TYPE(&dl->dr_overridden_by, type);
 	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
 	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
 
 	dl->dr_override_state = DR_OVERRIDDEN;
-	dl->dr_overridden_by.blk_birth = dr->dr_txg;
+	BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
 }
 
 void
 dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
 	dmu_object_type_t type;
 	ASSERT(dsl_dataset_feature_is_active(db->db_objset->os_dsl_dataset,
 	    SPA_FEATURE_REDACTED_DATASETS));
 
 	DB_DNODE_ENTER(db);
 	type = DB_DNODE(db)->dn_type;
 	DB_DNODE_EXIT(db);
 
 	ASSERT0(db->db_level);
 	dmu_buf_will_not_fill(dbuf, tx);
 
 	blkptr_t bp = { { { {0} } } };
 	BP_SET_TYPE(&bp, type);
 	BP_SET_LEVEL(&bp, 0);
 	BP_SET_BIRTH(&bp, tx->tx_txg, 0);
 	BP_SET_REDACTED(&bp);
 	BPE_SET_LSIZE(&bp, dbuf->db_size);
 
 	dbuf_override_impl(db, &bp, tx);
 }
 
 /*
  * Directly assign a provided arc buf to a given dbuf if it's not referenced
  * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
  */
 void
 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
 {
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT(db->db_level == 0);
 	ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
 	ASSERT(buf != NULL);
 	ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size);
 	ASSERT(tx->tx_txg != 0);
 
 	arc_return_buf(buf, db);
 	ASSERT(arc_released(buf));
 
 	mutex_enter(&db->db_mtx);
 
 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
 		cv_wait(&db->db_changed, &db->db_mtx);
 
 	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED ||
 	    db->db_state == DB_NOFILL);
 
 	if (db->db_state == DB_CACHED &&
 	    zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
 		/*
 		 * In practice, we will never have a case where we have an
 		 * encrypted arc buffer while additional holds exist on the
 		 * dbuf. We don't handle this here so we simply assert that
 		 * fact instead.
 		 */
 		ASSERT(!arc_is_encrypted(buf));
 		mutex_exit(&db->db_mtx);
 		(void) dbuf_dirty(db, tx);
 		memcpy(db->db.db_data, buf->b_data, db->db.db_size);
 		arc_buf_destroy(buf, db);
 		return;
 	}
 
 	if (db->db_state == DB_CACHED) {
 		dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
 
 		ASSERT(db->db_buf != NULL);
 		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
 			ASSERT(dr->dt.dl.dr_data == db->db_buf);
 
 			if (!arc_released(db->db_buf)) {
 				ASSERT(dr->dt.dl.dr_override_state ==
 				    DR_OVERRIDDEN);
 				arc_release(db->db_buf, db);
 			}
 			dr->dt.dl.dr_data = buf;
 			arc_buf_destroy(db->db_buf, db);
 		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
 			arc_release(db->db_buf, db);
 			arc_buf_destroy(db->db_buf, db);
 		}
 		db->db_buf = NULL;
 	} else if (db->db_state == DB_NOFILL) {
 		/*
 		 * We will be completely replacing the cloned block.  In case
 		 * it was cloned in this transaction group, let's undirty the
 		 * pending clone and mark the block as uncached. This will be
 		 * as if the clone was never done.
 		 */
 		VERIFY(!dbuf_undirty(db, tx));
 		db->db_state = DB_UNCACHED;
 	}
 	ASSERT(db->db_buf == NULL);
 	dbuf_set_data(db, buf);
 	db->db_state = DB_FILL;
 	DTRACE_SET_STATE(db, "filling assigned arcbuf");
 	mutex_exit(&db->db_mtx);
 	(void) dbuf_dirty(db, tx);
 	dmu_buf_fill_done(&db->db, tx, B_FALSE);
 }
 
 void
 dbuf_destroy(dmu_buf_impl_t *db)
 {
 	dnode_t *dn;
 	dmu_buf_impl_t *parent = db->db_parent;
 	dmu_buf_impl_t *dndb;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(zfs_refcount_is_zero(&db->db_holds));
 
 	if (db->db_buf != NULL) {
 		arc_buf_destroy(db->db_buf, db);
 		db->db_buf = NULL;
 	}
 
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		int slots = DB_DNODE(db)->dn_num_slots;
 		int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
 		if (db->db.db_data != NULL) {
 			kmem_free(db->db.db_data, bonuslen);
 			arc_space_return(bonuslen, ARC_SPACE_BONUS);
 			db->db_state = DB_UNCACHED;
 			DTRACE_SET_STATE(db, "buffer cleared");
 		}
 	}
 
 	dbuf_clear_data(db);
 
 	if (multilist_link_active(&db->db_cache_link)) {
 		ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
 		    db->db_caching_status == DB_DBUF_METADATA_CACHE);
 
 		multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
 
 		ASSERT0(dmu_buf_user_size(&db->db));
 		(void) zfs_refcount_remove_many(
 		    &dbuf_caches[db->db_caching_status].size,
 		    db->db.db_size, db);
 
 		if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
 			DBUF_STAT_BUMPDOWN(metadata_cache_count);
 		} else {
 			DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
 			DBUF_STAT_BUMPDOWN(cache_count);
 			DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
 			    db->db.db_size);
 		}
 		db->db_caching_status = DB_NO_CACHE;
 	}
 
 	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
 	ASSERT(db->db_data_pending == NULL);
 	ASSERT(list_is_empty(&db->db_dirty_records));
 
 	db->db_state = DB_EVICTING;
 	DTRACE_SET_STATE(db, "buffer eviction started");
 	db->db_blkptr = NULL;
 
 	/*
 	 * Now that db_state is DB_EVICTING, nobody else can find this via
 	 * the hash table.  We can now drop db_mtx, which allows us to
 	 * acquire the dn_dbufs_mtx.
 	 */
 	mutex_exit(&db->db_mtx);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	dndb = dn->dn_dbuf;
 	if (db->db_blkid != DMU_BONUS_BLKID) {
 		boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
 		if (needlock)
 			mutex_enter_nested(&dn->dn_dbufs_mtx,
 			    NESTED_SINGLE);
 		avl_remove(&dn->dn_dbufs, db);
 		membar_producer();
 		DB_DNODE_EXIT(db);
 		if (needlock)
 			mutex_exit(&dn->dn_dbufs_mtx);
 		/*
 		 * Decrementing the dbuf count means that the hold corresponding
 		 * to the removed dbuf is no longer discounted in dnode_move(),
 		 * so the dnode cannot be moved until after we release the hold.
 		 * The membar_producer() ensures visibility of the decremented
 		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
 		 * release any lock.
 		 */
 		mutex_enter(&dn->dn_mtx);
 		dnode_rele_and_unlock(dn, db, B_TRUE);
 		db->db_dnode_handle = NULL;
 
 		dbuf_hash_remove(db);
 	} else {
 		DB_DNODE_EXIT(db);
 	}
 
 	ASSERT(zfs_refcount_is_zero(&db->db_holds));
 
 	db->db_parent = NULL;
 
 	ASSERT(db->db_buf == NULL);
 	ASSERT(db->db.db_data == NULL);
 	ASSERT(db->db_hash_next == NULL);
 	ASSERT(db->db_blkptr == NULL);
 	ASSERT(db->db_data_pending == NULL);
 	ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
 	ASSERT(!multilist_link_active(&db->db_cache_link));
 
 	/*
 	 * If this dbuf is referenced from an indirect dbuf,
 	 * decrement the ref count on the indirect dbuf.
 	 */
 	if (parent && parent != dndb) {
 		mutex_enter(&parent->db_mtx);
 		dbuf_rele_and_unlock(parent, db, B_TRUE);
 	}
 
 	kmem_cache_free(dbuf_kmem_cache, db);
 	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
 }
 
 /*
  * Note: While bpp will always be updated if the function returns success,
  * parentp will not be updated if the dnode does not have dn_dbuf filled in;
  * this happens when the dnode is the meta-dnode, or {user|group|project}used
  * object.
  */
 __attribute__((always_inline))
 static inline int
 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
     dmu_buf_impl_t **parentp, blkptr_t **bpp)
 {
 	*parentp = NULL;
 	*bpp = NULL;
 
 	ASSERT(blkid != DMU_BONUS_BLKID);
 
 	if (blkid == DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		if (dn->dn_have_spill &&
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
 			*bpp = DN_SPILL_BLKPTR(dn->dn_phys);
 		else
 			*bpp = NULL;
 		dbuf_add_ref(dn->dn_dbuf, NULL);
 		*parentp = dn->dn_dbuf;
 		mutex_exit(&dn->dn_mtx);
 		return (0);
 	}
 
 	int nlevels =
 	    (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels;
 	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 
 	ASSERT3U(level * epbs, <, 64);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	/*
 	 * This assertion shouldn't trip as long as the max indirect block size
 	 * is less than 1M.  The reason for this is that up to that point,
 	 * the number of levels required to address an entire object with blocks
 	 * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64.	 In
 	 * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55
 	 * (i.e. we can address the entire object), objects will all use at most
 	 * N-1 levels and the assertion won't overflow.	 However, once epbs is
 	 * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66.  Then, 4 levels will not be
 	 * enough to address an entire object, so objects will have 5 levels,
 	 * but then this assertion will overflow.
 	 *
 	 * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we
 	 * need to redo this logic to handle overflows.
 	 */
 	ASSERT(level >= nlevels ||
 	    ((nlevels - level - 1) * epbs) +
 	    highbit64(dn->dn_phys->dn_nblkptr) <= 64);
 	if (level >= nlevels ||
 	    blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr <<
 	    ((nlevels - level - 1) * epbs)) ||
 	    (fail_sparse &&
 	    blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
 		/* the buffer has no parent yet */
 		return (SET_ERROR(ENOENT));
 	} else if (level < nlevels-1) {
 		/* this block is referenced from an indirect block */
 		int err;
 
 		err = dbuf_hold_impl(dn, level + 1,
 		    blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
 
 		if (err)
 			return (err);
 		err = dbuf_read(*parentp, NULL,
 		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
 		if (err) {
 			dbuf_rele(*parentp, NULL);
 			*parentp = NULL;
 			return (err);
 		}
 		rw_enter(&(*parentp)->db_rwlock, RW_READER);
 		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
 		    (blkid & ((1ULL << epbs) - 1));
 		if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))
 			ASSERT(BP_IS_HOLE(*bpp));
 		rw_exit(&(*parentp)->db_rwlock);
 		return (0);
 	} else {
 		/* the block is referenced from the dnode */
 		ASSERT3U(level, ==, nlevels-1);
 		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
 		    blkid < dn->dn_phys->dn_nblkptr);
 		if (dn->dn_dbuf) {
 			dbuf_add_ref(dn->dn_dbuf, NULL);
 			*parentp = dn->dn_dbuf;
 		}
 		*bpp = &dn->dn_phys->dn_blkptr[blkid];
 		return (0);
 	}
 }
 
 static dmu_buf_impl_t *
 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
     dmu_buf_impl_t *parent, blkptr_t *blkptr, uint64_t hash)
 {
 	objset_t *os = dn->dn_objset;
 	dmu_buf_impl_t *db, *odb;
 
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT(dn->dn_type != DMU_OT_NONE);
 
 	db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
 
 	list_create(&db->db_dirty_records, sizeof (dbuf_dirty_record_t),
 	    offsetof(dbuf_dirty_record_t, dr_dbuf_node));
 
 	db->db_objset = os;
 	db->db.db_object = dn->dn_object;
 	db->db_level = level;
 	db->db_blkid = blkid;
 	db->db_dirtycnt = 0;
 	db->db_dnode_handle = dn->dn_handle;
 	db->db_parent = parent;
 	db->db_blkptr = blkptr;
 	db->db_hash = hash;
 
 	db->db_user = NULL;
 	db->db_user_immediate_evict = FALSE;
 	db->db_freed_in_flight = FALSE;
 	db->db_pending_evict = FALSE;
 
 	if (blkid == DMU_BONUS_BLKID) {
 		ASSERT3P(parent, ==, dn->dn_dbuf);
 		db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
 		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 		db->db.db_offset = DMU_BONUS_BLKID;
 		db->db_state = DB_UNCACHED;
 		DTRACE_SET_STATE(db, "bonus buffer created");
 		db->db_caching_status = DB_NO_CACHE;
 		/* the bonus dbuf is not placed in the hash table */
 		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
 		return (db);
 	} else if (blkid == DMU_SPILL_BLKID) {
 		db->db.db_size = (blkptr != NULL) ?
 		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
 		db->db.db_offset = 0;
 	} else {
 		int blocksize =
 		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
 		db->db.db_size = blocksize;
 		db->db.db_offset = db->db_blkid * blocksize;
 	}
 
 	/*
 	 * Hold the dn_dbufs_mtx while we get the new dbuf
 	 * in the hash table *and* added to the dbufs list.
 	 * This prevents a possible deadlock with someone
 	 * trying to look up this dbuf before it's added to the
 	 * dn_dbufs list.
 	 */
 	mutex_enter(&dn->dn_dbufs_mtx);
 	db->db_state = DB_EVICTING; /* not worth logging this state change */
 	if ((odb = dbuf_hash_insert(db)) != NULL) {
 		/* someone else inserted it first */
 		mutex_exit(&dn->dn_dbufs_mtx);
 		kmem_cache_free(dbuf_kmem_cache, db);
 		DBUF_STAT_BUMP(hash_insert_race);
 		return (odb);
 	}
 	avl_add(&dn->dn_dbufs, db);
 
 	db->db_state = DB_UNCACHED;
 	DTRACE_SET_STATE(db, "regular buffer created");
 	db->db_caching_status = DB_NO_CACHE;
 	mutex_exit(&dn->dn_dbufs_mtx);
 	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
 
 	if (parent && parent != dn->dn_dbuf)
 		dbuf_add_ref(parent, db);
 
 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
 	    zfs_refcount_count(&dn->dn_holds) > 0);
 	(void) zfs_refcount_add(&dn->dn_holds, db);
 
 	dprintf_dbuf(db, "db=%p\n", db);
 
 	return (db);
 }
 
 /*
  * This function returns a block pointer and information about the object,
  * given a dnode and a block.  This is a publicly accessible version of
  * dbuf_findbp that only returns some information, rather than the
  * dbuf.  Note that the dnode passed in must be held, and the dn_struct_rwlock
  * should be locked as (at least) a reader.
  */
 int
 dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid,
     blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift)
 {
 	dmu_buf_impl_t *dbp = NULL;
 	blkptr_t *bp2;
 	int err = 0;
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 
 	err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2);
 	if (err == 0) {
 		ASSERT3P(bp2, !=, NULL);
 		*bp = *bp2;
 		if (dbp != NULL)
 			dbuf_rele(dbp, NULL);
 		if (datablkszsec != NULL)
 			*datablkszsec = dn->dn_phys->dn_datablkszsec;
 		if (indblkshift != NULL)
 			*indblkshift = dn->dn_phys->dn_indblkshift;
 	}
 
 	return (err);
 }
 
 typedef struct dbuf_prefetch_arg {
 	spa_t *dpa_spa;	/* The spa to issue the prefetch in. */
 	zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
 	int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
 	int dpa_curlevel; /* The current level that we're reading */
 	dnode_t *dpa_dnode; /* The dnode associated with the prefetch */
 	zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
 	zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
 	arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
 	dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */
 	void *dpa_arg; /* prefetch completion arg */
 } dbuf_prefetch_arg_t;
 
 static void
 dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done)
 {
 	if (dpa->dpa_cb != NULL) {
 		dpa->dpa_cb(dpa->dpa_arg, dpa->dpa_zb.zb_level,
 		    dpa->dpa_zb.zb_blkid, io_done);
 	}
 	kmem_free(dpa, sizeof (*dpa));
 }
 
 static void
 dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb,
     const blkptr_t *iobp, arc_buf_t *abuf, void *private)
 {
 	(void) zio, (void) zb, (void) iobp;
 	dbuf_prefetch_arg_t *dpa = private;
 
 	if (abuf != NULL)
 		arc_buf_destroy(abuf, private);
 
 	dbuf_prefetch_fini(dpa, B_TRUE);
 }
 
 /*
  * Actually issue the prefetch read for the block given.
  */
 static void
 dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
 {
 	ASSERT(!BP_IS_REDACTED(bp) ||
 	    dsl_dataset_feature_is_active(
 	    dpa->dpa_dnode->dn_objset->os_dsl_dataset,
 	    SPA_FEATURE_REDACTED_DATASETS));
 
 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
 		return (dbuf_prefetch_fini(dpa, B_FALSE));
 
 	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
 	arc_flags_t aflags =
 	    dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
 	    ARC_FLAG_NO_BUF;
 
 	/* dnodes are always read as raw and then converted later */
 	if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) &&
 	    dpa->dpa_curlevel == 0)
 		zio_flags |= ZIO_FLAG_RAW;
 
 	ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
 	ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
 	ASSERT(dpa->dpa_zio != NULL);
 	(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp,
 	    dbuf_issue_final_prefetch_done, dpa,
 	    dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb);
 }
 
 /*
  * Called when an indirect block above our prefetch target is read in.  This
  * will either read in the next indirect block down the tree or issue the actual
  * prefetch if the next block down is our target.
  */
 static void
 dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
     const blkptr_t *iobp, arc_buf_t *abuf, void *private)
 {
 	(void) zb, (void) iobp;
 	dbuf_prefetch_arg_t *dpa = private;
 
 	ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
 	ASSERT3S(dpa->dpa_curlevel, >, 0);
 
 	if (abuf == NULL) {
 		ASSERT(zio == NULL || zio->io_error != 0);
 		dbuf_prefetch_fini(dpa, B_TRUE);
 		return;
 	}
 	ASSERT(zio == NULL || zio->io_error == 0);
 
 	/*
 	 * The dpa_dnode is only valid if we are called with a NULL
 	 * zio. This indicates that the arc_read() returned without
 	 * first calling zio_read() to issue a physical read. Once
 	 * a physical read is made the dpa_dnode must be invalidated
 	 * as the locks guarding it may have been dropped. If the
 	 * dpa_dnode is still valid, then we want to add it to the dbuf
 	 * cache. To do so, we must hold the dbuf associated with the block
 	 * we just prefetched, read its contents so that we associate it
 	 * with an arc_buf_t, and then release it.
 	 */
 	if (zio != NULL) {
 		ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
 		if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) {
 			ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
 		} else {
 			ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
 		}
 		ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
 
 		dpa->dpa_dnode = NULL;
 	} else if (dpa->dpa_dnode != NULL) {
 		uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
 		    (dpa->dpa_epbs * (dpa->dpa_curlevel -
 		    dpa->dpa_zb.zb_level));
 		dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
 		    dpa->dpa_curlevel, curblkid, FTAG);
 		if (db == NULL) {
 			arc_buf_destroy(abuf, private);
 			dbuf_prefetch_fini(dpa, B_TRUE);
 			return;
 		}
 		(void) dbuf_read(db, NULL,
 		    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
 		dbuf_rele(db, FTAG);
 	}
 
 	dpa->dpa_curlevel--;
 	uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
 	    (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
 	blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
 	    P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
 
 	ASSERT(!BP_IS_REDACTED(bp) || (dpa->dpa_dnode &&
 	    dsl_dataset_feature_is_active(
 	    dpa->dpa_dnode->dn_objset->os_dsl_dataset,
 	    SPA_FEATURE_REDACTED_DATASETS)));
 	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
 		arc_buf_destroy(abuf, private);
 		dbuf_prefetch_fini(dpa, B_TRUE);
 		return;
 	} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
 		ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
 		dbuf_issue_final_prefetch(dpa, bp);
 	} else {
 		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
 		zbookmark_phys_t zb;
 
 		/* flag if L2ARC eligible, l2arc_noprefetch then decides */
 		if (dpa->dpa_aflags & ARC_FLAG_L2CACHE)
 			iter_aflags |= ARC_FLAG_L2CACHE;
 
 		ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
 
 		SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
 		    dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
 
 		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
 		    bp, dbuf_prefetch_indirect_done, dpa,
 		    ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 		    &iter_aflags, &zb);
 	}
 
 	arc_buf_destroy(abuf, private);
 }
 
 /*
  * Issue prefetch reads for the given block on the given level.  If the indirect
  * blocks above that block are not in memory, we will read them in
  * asynchronously.  As a result, this call never blocks waiting for a read to
  * complete. Note that the prefetch might fail if the dataset is encrypted and
  * the encryption key is unmapped before the IO completes.
  */
 int
 dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
     zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,
     void *arg)
 {
 	blkptr_t bp;
 	int epbs, nlevels, curlevel;
 	uint64_t curblkid;
 
 	ASSERT(blkid != DMU_BONUS_BLKID);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 
 	if (blkid > dn->dn_maxblkid)
 		goto no_issue;
 
 	if (level == 0 && dnode_block_freed(dn, blkid))
 		goto no_issue;
 
 	/*
 	 * This dnode hasn't been written to disk yet, so there's nothing to
 	 * prefetch.
 	 */
 	nlevels = dn->dn_phys->dn_nlevels;
 	if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
 		goto no_issue;
 
 	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 	if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
 		goto no_issue;
 
 	dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
 	    level, blkid, NULL);
 	if (db != NULL) {
 		mutex_exit(&db->db_mtx);
 		/*
 		 * This dbuf already exists.  It is either CACHED, or
 		 * (we assume) about to be read or filled.
 		 */
 		goto no_issue;
 	}
 
 	/*
 	 * Find the closest ancestor (indirect block) of the target block
 	 * that is present in the cache.  In this indirect block, we will
 	 * find the bp that is at curlevel, curblkid.
 	 */
 	curlevel = level;
 	curblkid = blkid;
 	while (curlevel < nlevels - 1) {
 		int parent_level = curlevel + 1;
 		uint64_t parent_blkid = curblkid >> epbs;
 		dmu_buf_impl_t *db;
 
 		if (dbuf_hold_impl(dn, parent_level, parent_blkid,
 		    FALSE, TRUE, FTAG, &db) == 0) {
 			blkptr_t *bpp = db->db_buf->b_data;
 			bp = bpp[P2PHASE(curblkid, 1 << epbs)];
 			dbuf_rele(db, FTAG);
 			break;
 		}
 
 		curlevel = parent_level;
 		curblkid = parent_blkid;
 	}
 
 	if (curlevel == nlevels - 1) {
 		/* No cached indirect blocks found. */
 		ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
 		bp = dn->dn_phys->dn_blkptr[curblkid];
 	}
 	ASSERT(!BP_IS_REDACTED(&bp) ||
 	    dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset,
 	    SPA_FEATURE_REDACTED_DATASETS));
 	if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp))
 		goto no_issue;
 
 	ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
 
 	zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
 	    ZIO_FLAG_CANFAIL);
 
 	dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
 	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 	SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
 	    dn->dn_object, level, blkid);
 	dpa->dpa_curlevel = curlevel;
 	dpa->dpa_prio = prio;
 	dpa->dpa_aflags = aflags;
 	dpa->dpa_spa = dn->dn_objset->os_spa;
 	dpa->dpa_dnode = dn;
 	dpa->dpa_epbs = epbs;
 	dpa->dpa_zio = pio;
 	dpa->dpa_cb = cb;
 	dpa->dpa_arg = arg;
 
 	if (!DNODE_LEVEL_IS_CACHEABLE(dn, level))
 		dpa->dpa_aflags |= ARC_FLAG_UNCACHED;
 	else if (dnode_level_is_l2cacheable(&bp, dn, level))
 		dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
 
 	/*
 	 * If we have the indirect just above us, no need to do the asynchronous
 	 * prefetch chain; we'll just run the last step ourselves.  If we're at
 	 * a higher level, though, we want to issue the prefetches for all the
 	 * indirect blocks asynchronously, so we can go on with whatever we were
 	 * doing.
 	 */
 	if (curlevel == level) {
 		ASSERT3U(curblkid, ==, blkid);
 		dbuf_issue_final_prefetch(dpa, &bp);
 	} else {
 		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
 		zbookmark_phys_t zb;
 
 		/* flag if L2ARC eligible, l2arc_noprefetch then decides */
 		if (dnode_level_is_l2cacheable(&bp, dn, level))
 			iter_aflags |= ARC_FLAG_L2CACHE;
 
 		SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
 		    dn->dn_object, curlevel, curblkid);
 		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
 		    &bp, dbuf_prefetch_indirect_done, dpa,
 		    ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 		    &iter_aflags, &zb);
 	}
 	/*
 	 * We use pio here instead of dpa_zio since it's possible that
 	 * dpa may have already been freed.
 	 */
 	zio_nowait(pio);
 	return (1);
 no_issue:
 	if (cb != NULL)
 		cb(arg, level, blkid, B_FALSE);
 	return (0);
 }
 
 int
 dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
     arc_flags_t aflags)
 {
 
 	return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL));
 }
 
 /*
  * Helper function for dbuf_hold_impl() to copy a buffer. Handles
  * the case of encrypted, compressed and uncompressed buffers by
  * allocating the new buffer, respectively, with arc_alloc_raw_buf(),
  * arc_alloc_compressed_buf() or arc_alloc_buf().*
  *
  * NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl().
  */
 noinline static void
 dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db)
 {
 	dbuf_dirty_record_t *dr = db->db_data_pending;
 	arc_buf_t *data = dr->dt.dl.dr_data;
 	enum zio_compress compress_type = arc_get_compression(data);
 	uint8_t complevel = arc_get_complevel(data);
 
 	if (arc_is_encrypted(data)) {
 		boolean_t byteorder;
 		uint8_t salt[ZIO_DATA_SALT_LEN];
 		uint8_t iv[ZIO_DATA_IV_LEN];
 		uint8_t mac[ZIO_DATA_MAC_LEN];
 
 		arc_get_raw_params(data, &byteorder, salt, iv, mac);
 		dbuf_set_data(db, arc_alloc_raw_buf(dn->dn_objset->os_spa, db,
 		    dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac,
 		    dn->dn_type, arc_buf_size(data), arc_buf_lsize(data),
 		    compress_type, complevel));
 	} else if (compress_type != ZIO_COMPRESS_OFF) {
 		dbuf_set_data(db, arc_alloc_compressed_buf(
 		    dn->dn_objset->os_spa, db, arc_buf_size(data),
 		    arc_buf_lsize(data), compress_type, complevel));
 	} else {
 		dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db,
 		    DBUF_GET_BUFC_TYPE(db), db->db.db_size));
 	}
 
 	rw_enter(&db->db_rwlock, RW_WRITER);
 	memcpy(db->db.db_data, data->b_data, arc_buf_size(data));
 	rw_exit(&db->db_rwlock);
 }
 
 /*
  * Returns with db_holds incremented, and db_mtx not held.
  * Note: dn_struct_rwlock must be held.
  */
 int
 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
     boolean_t fail_sparse, boolean_t fail_uncached,
     const void *tag, dmu_buf_impl_t **dbp)
 {
 	dmu_buf_impl_t *db, *parent = NULL;
 	uint64_t hv;
 
 	/* If the pool has been created, verify the tx_sync_lock is not held */
 	spa_t *spa = dn->dn_objset->os_spa;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	if (dp != NULL) {
 		ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock));
 	}
 
 	ASSERT(blkid != DMU_BONUS_BLKID);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT3U(dn->dn_nlevels, >, level);
 
 	*dbp = NULL;
 
 	/* dbuf_find() returns with db_mtx held */
 	db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, &hv);
 
 	if (db == NULL) {
 		blkptr_t *bp = NULL;
 		int err;
 
 		if (fail_uncached)
 			return (SET_ERROR(ENOENT));
 
 		ASSERT3P(parent, ==, NULL);
 		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
 		if (fail_sparse) {
 			if (err == 0 && bp && BP_IS_HOLE(bp))
 				err = SET_ERROR(ENOENT);
 			if (err) {
 				if (parent)
 					dbuf_rele(parent, NULL);
 				return (err);
 			}
 		}
 		if (err && err != ENOENT)
 			return (err);
 		db = dbuf_create(dn, level, blkid, parent, bp, hv);
 	}
 
 	if (fail_uncached && db->db_state != DB_CACHED) {
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(ENOENT));
 	}
 
 	if (db->db_buf != NULL) {
 		arc_buf_access(db->db_buf);
 		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
 	}
 
 	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
 
 	/*
 	 * If this buffer is currently syncing out, and we are
 	 * still referencing it from db_data, we need to make a copy
 	 * of it in case we decide we want to dirty it again in this txg.
 	 */
 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
 	    db->db_state == DB_CACHED && db->db_data_pending) {
 		dbuf_dirty_record_t *dr = db->db_data_pending;
 		if (dr->dt.dl.dr_data == db->db_buf) {
 			ASSERT3P(db->db_buf, !=, NULL);
 			dbuf_hold_copy(dn, db);
 		}
 	}
 
 	if (multilist_link_active(&db->db_cache_link)) {
 		ASSERT(zfs_refcount_is_zero(&db->db_holds));
 		ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
 		    db->db_caching_status == DB_DBUF_METADATA_CACHE);
 
 		multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
 
 		uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
 		(void) zfs_refcount_remove_many(
 		    &dbuf_caches[db->db_caching_status].size, size, db);
 
 		if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
 			DBUF_STAT_BUMPDOWN(metadata_cache_count);
 		} else {
 			DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
 			DBUF_STAT_BUMPDOWN(cache_count);
 			DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
 		}
 		db->db_caching_status = DB_NO_CACHE;
 	}
 	(void) zfs_refcount_add(&db->db_holds, tag);
 	DBUF_VERIFY(db);
 	mutex_exit(&db->db_mtx);
 
 	/* NOTE: we can't rele the parent until after we drop the db_mtx */
 	if (parent)
 		dbuf_rele(parent, NULL);
 
 	ASSERT3P(DB_DNODE(db), ==, dn);
 	ASSERT3U(db->db_blkid, ==, blkid);
 	ASSERT3U(db->db_level, ==, level);
 	*dbp = db;
 
 	return (0);
 }
 
 dmu_buf_impl_t *
 dbuf_hold(dnode_t *dn, uint64_t blkid, const void *tag)
 {
 	return (dbuf_hold_level(dn, 0, blkid, tag));
 }
 
 dmu_buf_impl_t *
 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, const void *tag)
 {
 	dmu_buf_impl_t *db;
 	int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
 	return (err ? NULL : db);
 }
 
 void
 dbuf_create_bonus(dnode_t *dn)
 {
 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 
 	ASSERT(dn->dn_bonus == NULL);
 	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL,
 	    dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID));
 }
 
 int
 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	if (db->db_blkid != DMU_SPILL_BLKID)
 		return (SET_ERROR(ENOTSUP));
 	if (blksz == 0)
 		blksz = SPA_MINBLOCKSIZE;
 	ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
 	blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
 
 	dbuf_new_size(db, blksz, tx);
 
 	return (0);
 }
 
 void
 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
 {
 	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
 }
 
 #pragma weak dmu_buf_add_ref = dbuf_add_ref
 void
 dbuf_add_ref(dmu_buf_impl_t *db, const void *tag)
 {
 	int64_t holds = zfs_refcount_add(&db->db_holds, tag);
 	VERIFY3S(holds, >, 1);
 }
 
 #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
 boolean_t
 dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
     const void *tag)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dmu_buf_impl_t *found_db;
 	boolean_t result = B_FALSE;
 
 	if (blkid == DMU_BONUS_BLKID)
 		found_db = dbuf_find_bonus(os, obj);
 	else
 		found_db = dbuf_find(os, obj, 0, blkid, NULL);
 
 	if (found_db != NULL) {
 		if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
 			(void) zfs_refcount_add(&db->db_holds, tag);
 			result = B_TRUE;
 		}
 		mutex_exit(&found_db->db_mtx);
 	}
 	return (result);
 }
 
 /*
  * If you call dbuf_rele() you had better not be referencing the dnode handle
  * unless you have some other direct or indirect hold on the dnode. (An indirect
  * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
  * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
  * dnode's parent dbuf evicting its dnode handles.
  */
 void
 dbuf_rele(dmu_buf_impl_t *db, const void *tag)
 {
 	mutex_enter(&db->db_mtx);
 	dbuf_rele_and_unlock(db, tag, B_FALSE);
 }
 
 void
 dmu_buf_rele(dmu_buf_t *db, const void *tag)
 {
 	dbuf_rele((dmu_buf_impl_t *)db, tag);
 }
 
 /*
  * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
  * db_dirtycnt and db_holds to be updated atomically.  The 'evicting'
  * argument should be set if we are already in the dbuf-evicting code
  * path, in which case we don't want to recursively evict.  This allows us to
  * avoid deeply nested stacks that would have a call flow similar to this:
  *
  * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
  *	^						|
  *	|						|
  *	+-----dbuf_destroy()<--dbuf_evict_one()<--------+
  *
  */
 void
 dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
 {
 	int64_t holds;
 	uint64_t size;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	DBUF_VERIFY(db);
 
 	/*
 	 * Remove the reference to the dbuf before removing its hold on the
 	 * dnode so we can guarantee in dnode_move() that a referenced bonus
 	 * buffer has a corresponding dnode hold.
 	 */
 	holds = zfs_refcount_remove(&db->db_holds, tag);
 	ASSERT(holds >= 0);
 
 	/*
 	 * We can't freeze indirects if there is a possibility that they
 	 * may be modified in the current syncing context.
 	 */
 	if (db->db_buf != NULL &&
 	    holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
 		arc_buf_freeze(db->db_buf);
 	}
 
 	if (holds == db->db_dirtycnt &&
 	    db->db_level == 0 && db->db_user_immediate_evict)
 		dbuf_evict_user(db);
 
 	if (holds == 0) {
 		if (db->db_blkid == DMU_BONUS_BLKID) {
 			dnode_t *dn;
 			boolean_t evict_dbuf = db->db_pending_evict;
 
 			/*
 			 * If the dnode moves here, we cannot cross this
 			 * barrier until the move completes.
 			 */
 			DB_DNODE_ENTER(db);
 
 			dn = DB_DNODE(db);
 			atomic_dec_32(&dn->dn_dbufs_count);
 
 			/*
 			 * Decrementing the dbuf count means that the bonus
 			 * buffer's dnode hold is no longer discounted in
 			 * dnode_move(). The dnode cannot move until after
 			 * the dnode_rele() below.
 			 */
 			DB_DNODE_EXIT(db);
 
 			/*
 			 * Do not reference db after its lock is dropped.
 			 * Another thread may evict it.
 			 */
 			mutex_exit(&db->db_mtx);
 
 			if (evict_dbuf)
 				dnode_evict_bonus(dn);
 
 			dnode_rele(dn, db);
 		} else if (db->db_buf == NULL) {
 			/*
 			 * This is a special case: we never associated this
 			 * dbuf with any data allocated from the ARC.
 			 */
 			ASSERT(db->db_state == DB_UNCACHED ||
 			    db->db_state == DB_NOFILL);
 			dbuf_destroy(db);
 		} else if (arc_released(db->db_buf)) {
 			/*
 			 * This dbuf has anonymous data associated with it.
 			 */
 			dbuf_destroy(db);
 		} else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) ||
 		    db->db_pending_evict) {
 			dbuf_destroy(db);
 		} else if (!multilist_link_active(&db->db_cache_link)) {
 			ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
 
 			dbuf_cached_state_t dcs =
 			    dbuf_include_in_metadata_cache(db) ?
 			    DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
 			db->db_caching_status = dcs;
 
 			multilist_insert(&dbuf_caches[dcs].cache, db);
 			uint64_t db_size = db->db.db_size +
 			    dmu_buf_user_size(&db->db);
 			size = zfs_refcount_add_many(
 			    &dbuf_caches[dcs].size, db_size, db);
 			uint8_t db_level = db->db_level;
 			mutex_exit(&db->db_mtx);
 
 			if (dcs == DB_DBUF_METADATA_CACHE) {
 				DBUF_STAT_BUMP(metadata_cache_count);
 				DBUF_STAT_MAX(metadata_cache_size_bytes_max,
 				    size);
 			} else {
 				DBUF_STAT_BUMP(cache_count);
 				DBUF_STAT_MAX(cache_size_bytes_max, size);
 				DBUF_STAT_BUMP(cache_levels[db_level]);
 				DBUF_STAT_INCR(cache_levels_bytes[db_level],
 				    db_size);
 			}
 
 			if (dcs == DB_DBUF_CACHE && !evicting)
 				dbuf_evict_notify(size);
 		}
 	} else {
 		mutex_exit(&db->db_mtx);
 	}
 
 }
 
 #pragma weak dmu_buf_refcount = dbuf_refcount
 uint64_t
 dbuf_refcount(dmu_buf_impl_t *db)
 {
 	return (zfs_refcount_count(&db->db_holds));
 }
 
 uint64_t
 dmu_buf_user_refcount(dmu_buf_t *db_fake)
 {
 	uint64_t holds;
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	mutex_enter(&db->db_mtx);
 	ASSERT3U(zfs_refcount_count(&db->db_holds), >=, db->db_dirtycnt);
 	holds = zfs_refcount_count(&db->db_holds) - db->db_dirtycnt;
 	mutex_exit(&db->db_mtx);
 
 	return (holds);
 }
 
 void *
 dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
     dmu_buf_user_t *new_user)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	mutex_enter(&db->db_mtx);
 	dbuf_verify_user(db, DBVU_NOT_EVICTING);
 	if (db->db_user == old_user)
 		db->db_user = new_user;
 	else
 		old_user = db->db_user;
 	dbuf_verify_user(db, DBVU_NOT_EVICTING);
 	mutex_exit(&db->db_mtx);
 
 	return (old_user);
 }
 
 void *
 dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
 {
 	return (dmu_buf_replace_user(db_fake, NULL, user));
 }
 
 void *
 dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	db->db_user_immediate_evict = TRUE;
 	return (dmu_buf_set_user(db_fake, user));
 }
 
 void *
 dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
 {
 	return (dmu_buf_replace_user(db_fake, user, NULL));
 }
 
 void *
 dmu_buf_get_user(dmu_buf_t *db_fake)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	dbuf_verify_user(db, DBVU_NOT_EVICTING);
 	return (db->db_user);
 }
 
 uint64_t
 dmu_buf_user_size(dmu_buf_t *db_fake)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	if (db->db_user == NULL)
 		return (0);
 	return (atomic_load_64(&db->db_user->dbu_size));
 }
 
 void
 dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
 	ASSERT3P(db->db_user, !=, NULL);
 	ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd);
 	atomic_add_64(&db->db_user->dbu_size, nadd);
 }
 
 void
 dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
 	ASSERT3P(db->db_user, !=, NULL);
 	ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub);
 	atomic_sub_64(&db->db_user->dbu_size, nsub);
 }
 
 void
 dmu_buf_user_evict_wait(void)
 {
 	taskq_wait(dbu_evict_taskq);
 }
 
 blkptr_t *
 dmu_buf_get_blkptr(dmu_buf_t *db)
 {
 	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
 	return (dbi->db_blkptr);
 }
 
 objset_t *
 dmu_buf_get_objset(dmu_buf_t *db)
 {
 	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
 	return (dbi->db_objset);
 }
 
 static void
 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
 {
 	/* ASSERT(dmu_tx_is_syncing(tx) */
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	if (db->db_blkptr != NULL)
 		return;
 
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys);
 		BP_ZERO(db->db_blkptr);
 		return;
 	}
 	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
 		/*
 		 * This buffer was allocated at a time when there was
 		 * no available blkptrs from the dnode, or it was
 		 * inappropriate to hook it in (i.e., nlevels mismatch).
 		 */
 		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
 		ASSERT(db->db_parent == NULL);
 		db->db_parent = dn->dn_dbuf;
 		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
 		DBUF_VERIFY(db);
 	} else {
 		dmu_buf_impl_t *parent = db->db_parent;
 		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 
 		ASSERT(dn->dn_phys->dn_nlevels > 1);
 		if (parent == NULL) {
 			mutex_exit(&db->db_mtx);
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
 			parent = dbuf_hold_level(dn, db->db_level + 1,
 			    db->db_blkid >> epbs, db);
 			rw_exit(&dn->dn_struct_rwlock);
 			mutex_enter(&db->db_mtx);
 			db->db_parent = parent;
 		}
 		db->db_blkptr = (blkptr_t *)parent->db.db_data +
 		    (db->db_blkid & ((1ULL << epbs) - 1));
 		DBUF_VERIFY(db);
 	}
 }
 
 static void
 dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	void *data = dr->dt.dl.dr_data;
 
 	ASSERT0(db->db_level);
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db_blkid == DMU_BONUS_BLKID);
 	ASSERT(data != NULL);
 
 	dnode_t *dn = dr->dr_dnode;
 	ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
 	    DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
 	memcpy(DN_BONUS(dn->dn_phys), data, DN_MAX_BONUS_LEN(dn->dn_phys));
 
 	dbuf_sync_leaf_verify_bonus_dnode(dr);
 
 	dbuf_undirty_bonus(dr);
 	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
 }
 
 /*
  * When syncing out a blocks of dnodes, adjust the block to deal with
  * encryption.  Normally, we make sure the block is decrypted before writing
  * it.  If we have crypt params, then we are writing a raw (encrypted) block,
  * from a raw receive.  In this case, set the ARC buf's crypt params so
  * that the BP will be filled with the correct byteorder, salt, iv, and mac.
  */
 static void
 dbuf_prepare_encrypted_dnode_leaf(dbuf_dirty_record_t *dr)
 {
 	int err;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
 	ASSERT3U(db->db_level, ==, 0);
 
 	if (!db->db_objset->os_raw_receive && arc_is_encrypted(db->db_buf)) {
 		zbookmark_phys_t zb;
 
 		/*
 		 * Unfortunately, there is currently no mechanism for
 		 * syncing context to handle decryption errors. An error
 		 * here is only possible if an attacker maliciously
 		 * changed a dnode block and updated the associated
 		 * checksums going up the block tree.
 		 */
 		SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
 		    db->db.db_object, db->db_level, db->db_blkid);
 		err = arc_untransform(db->db_buf, db->db_objset->os_spa,
 		    &zb, B_TRUE);
 		if (err)
 			panic("Invalid dnode block MAC");
 	} else if (dr->dt.dl.dr_has_raw_params) {
 		(void) arc_release(dr->dt.dl.dr_data, db);
 		arc_convert_to_raw(dr->dt.dl.dr_data,
 		    dmu_objset_id(db->db_objset),
 		    dr->dt.dl.dr_byteorder, DMU_OT_DNODE,
 		    dr->dt.dl.dr_salt, dr->dt.dl.dr_iv, dr->dt.dl.dr_mac);
 	}
 }
 
 /*
  * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
  * is critical the we not allow the compiler to inline this function in to
  * dbuf_sync_list() thereby drastically bloating the stack usage.
  */
 noinline static void
 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	dnode_t *dn = dr->dr_dnode;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
 
 	mutex_enter(&db->db_mtx);
 
 	ASSERT(db->db_level > 0);
 	DBUF_VERIFY(db);
 
 	/* Read the block if it hasn't been read yet. */
 	if (db->db_buf == NULL) {
 		mutex_exit(&db->db_mtx);
 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
 		mutex_enter(&db->db_mtx);
 	}
 	ASSERT3U(db->db_state, ==, DB_CACHED);
 	ASSERT(db->db_buf != NULL);
 
 	/* Indirect block size must match what the dnode thinks it is. */
 	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 	dbuf_check_blkptr(dn, db);
 
 	/* Provide the pending dirty record to child dbufs */
 	db->db_data_pending = dr;
 
 	mutex_exit(&db->db_mtx);
 
 	dbuf_write(dr, db->db_buf, tx);
 
 	zio_t *zio = dr->dr_zio;
 	mutex_enter(&dr->dt.di.dr_mtx);
 	dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
 	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
 	mutex_exit(&dr->dt.di.dr_mtx);
 	zio_nowait(zio);
 }
 
 /*
  * Verify that the size of the data in our bonus buffer does not exceed
  * its recorded size.
  *
  * The purpose of this verification is to catch any cases in development
  * where the size of a phys structure (i.e space_map_phys_t) grows and,
  * due to incorrect feature management, older pools expect to read more
  * data even though they didn't actually write it to begin with.
  *
  * For a example, this would catch an error in the feature logic where we
  * open an older pool and we expect to write the space map histogram of
  * a space map with size SPACE_MAP_SIZE_V0.
  */
 static void
 dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr)
 {
 #ifdef ZFS_DEBUG
 	dnode_t *dn = dr->dr_dnode;
 
 	/*
 	 * Encrypted bonus buffers can have data past their bonuslen.
 	 * Skip the verification of these blocks.
 	 */
 	if (DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))
 		return;
 
 	uint16_t bonuslen = dn->dn_phys->dn_bonuslen;
 	uint16_t maxbonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
 	ASSERT3U(bonuslen, <=, maxbonuslen);
 
 	arc_buf_t *datap = dr->dt.dl.dr_data;
 	char *datap_end = ((char *)datap) + bonuslen;
 	char *datap_max = ((char *)datap) + maxbonuslen;
 
 	/* ensure that everything is zero after our data */
 	for (; datap_end < datap_max; datap_end++)
 		ASSERT(*datap_end == 0);
 #endif
 }
 
 static blkptr_t *
 dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
 {
 	/* This must be a lightweight dirty record. */
 	ASSERT3P(dr->dr_dbuf, ==, NULL);
 	dnode_t *dn = dr->dr_dnode;
 
 	if (dn->dn_phys->dn_nlevels == 1) {
 		VERIFY3U(dr->dt.dll.dr_blkid, <, dn->dn_phys->dn_nblkptr);
 		return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);
 	} else {
 		dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;
 		int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 		VERIFY3U(parent_db->db_level, ==, 1);
 		VERIFY3P(parent_db->db_dnode_handle->dnh_dnode, ==, dn);
 		VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);
 		blkptr_t *bp = parent_db->db.db_data;
 		return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);
 	}
 }
 
 static void
 dbuf_lightweight_ready(zio_t *zio)
 {
 	dbuf_dirty_record_t *dr = zio->io_private;
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio->io_error != 0)
 		return;
 
 	dnode_t *dn = dr->dr_dnode;
 
 	blkptr_t *bp_orig = dbuf_lightweight_bp(dr);
 	spa_t *spa = dmu_objset_spa(dn->dn_objset);
 	int64_t delta = bp_get_dsize_sync(spa, bp) -
 	    bp_get_dsize_sync(spa, bp_orig);
 	dnode_diduse_space(dn, delta);
 
 	uint64_t blkid = dr->dt.dll.dr_blkid;
 	mutex_enter(&dn->dn_mtx);
 	if (blkid > dn->dn_phys->dn_maxblkid) {
 		ASSERT0(dn->dn_objset->os_raw_receive);
 		dn->dn_phys->dn_maxblkid = blkid;
 	}
 	mutex_exit(&dn->dn_mtx);
 
 	if (!BP_IS_EMBEDDED(bp)) {
 		uint64_t fill = BP_IS_HOLE(bp) ? 0 : 1;
 		BP_SET_FILL(bp, fill);
 	}
 
 	dmu_buf_impl_t *parent_db;
 	EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
 	if (dr->dr_parent == NULL) {
 		parent_db = dn->dn_dbuf;
 	} else {
 		parent_db = dr->dr_parent->dr_dbuf;
 	}
 	rw_enter(&parent_db->db_rwlock, RW_WRITER);
 	*bp_orig = *bp;
 	rw_exit(&parent_db->db_rwlock);
 }
 
 static void
 dbuf_lightweight_done(zio_t *zio)
 {
 	dbuf_dirty_record_t *dr = zio->io_private;
 
 	VERIFY0(zio->io_error);
 
 	objset_t *os = dr->dr_dnode->dn_objset;
 	dmu_tx_t *tx = os->os_synctx;
 
 	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
 		ASSERT(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
 	} else {
 		dsl_dataset_t *ds = os->os_dsl_dataset;
 		(void) dsl_dataset_block_kill(ds, &zio->io_bp_orig, tx, B_TRUE);
 		dsl_dataset_block_born(ds, zio->io_bp, tx);
 	}
 
 	dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
 	    zio->io_txg);
 
 	abd_free(dr->dt.dll.dr_abd);
 	kmem_free(dr, sizeof (*dr));
 }
 
 noinline static void
 dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
 	dnode_t *dn = dr->dr_dnode;
 	zio_t *pio;
 	if (dn->dn_phys->dn_nlevels == 1) {
 		pio = dn->dn_zio;
 	} else {
 		pio = dr->dr_parent->dr_zio;
 	}
 
 	zbookmark_phys_t zb = {
 		.zb_objset = dmu_objset_id(dn->dn_objset),
 		.zb_object = dn->dn_object,
 		.zb_level = 0,
 		.zb_blkid = dr->dt.dll.dr_blkid,
 	};
 
 	/*
 	 * See comment in dbuf_write().  This is so that zio->io_bp_orig
 	 * will have the old BP in dbuf_lightweight_done().
 	 */
 	dr->dr_bp_copy = *dbuf_lightweight_bp(dr);
 
 	dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
 	    dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
 	    dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
 	    &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
 	    dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
 	    ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
 
 	zio_nowait(dr->dr_zio);
 }
 
 /*
  * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
  * critical the we not allow the compiler to inline this function in to
  * dbuf_sync_list() thereby drastically bloating the stack usage.
  */
 noinline static void
 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
 	arc_buf_t **datap = &dr->dt.dl.dr_data;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	dnode_t *dn = dr->dr_dnode;
 	objset_t *os;
 	uint64_t txg = tx->tx_txg;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
 
 	mutex_enter(&db->db_mtx);
 	/*
 	 * To be synced, we must be dirtied.  But we
 	 * might have been freed after the dirty.
 	 */
 	if (db->db_state == DB_UNCACHED) {
 		/* This buffer has been freed since it was dirtied */
 		ASSERT(db->db.db_data == NULL);
 	} else if (db->db_state == DB_FILL) {
 		/* This buffer was freed and is now being re-filled */
 		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
 	} else if (db->db_state == DB_READ) {
 		/*
 		 * This buffer has a clone we need to write, and an in-flight
 		 * read on the BP we're about to clone. Its safe to issue the
 		 * write here because the read has already been issued and the
 		 * contents won't change.
 		 */
 		ASSERT(dr->dt.dl.dr_brtwrite &&
 		    dr->dt.dl.dr_override_state == DR_OVERRIDDEN);
 	} else {
 		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
 	}
 	DBUF_VERIFY(db);
 
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
 			/*
 			 * In the previous transaction group, the bonus buffer
 			 * was entirely used to store the attributes for the
 			 * dnode which overrode the dn_spill field.  However,
 			 * when adding more attributes to the file a spill
 			 * block was required to hold the extra attributes.
 			 *
 			 * Make sure to clear the garbage left in the dn_spill
 			 * field from the previous attributes in the bonus
 			 * buffer.  Otherwise, after writing out the spill
 			 * block to the new allocated dva, it will free
 			 * the old block pointed to by the invalid dn_spill.
 			 */
 			db->db_blkptr = NULL;
 		}
 		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
 		mutex_exit(&dn->dn_mtx);
 	}
 
 	/*
 	 * If this is a bonus buffer, simply copy the bonus data into the
 	 * dnode.  It will be written out when the dnode is synced (and it
 	 * will be synced, since it must have been dirty for dbuf_sync to
 	 * be called).
 	 */
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		ASSERT(dr->dr_dbuf == db);
 		dbuf_sync_bonus(dr, tx);
 		return;
 	}
 
 	os = dn->dn_objset;
 
 	/*
 	 * This function may have dropped the db_mtx lock allowing a dmu_sync
 	 * operation to sneak in. As a result, we need to ensure that we
 	 * don't check the dr_override_state until we have returned from
 	 * dbuf_check_blkptr.
 	 */
 	dbuf_check_blkptr(dn, db);
 
 	/*
 	 * If this buffer is in the middle of an immediate write,
 	 * wait for the synchronous IO to complete.
 	 */
 	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
 		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
 		cv_wait(&db->db_changed, &db->db_mtx);
 	}
 
 	/*
 	 * If this is a dnode block, ensure it is appropriately encrypted
 	 * or decrypted, depending on what we are writing to it this txg.
 	 */
 	if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
 		dbuf_prepare_encrypted_dnode_leaf(dr);
 
 	if (db->db_state != DB_NOFILL &&
 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
 	    zfs_refcount_count(&db->db_holds) > 1 &&
 	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
 	    *datap == db->db_buf) {
 		/*
 		 * If this buffer is currently "in use" (i.e., there
 		 * are active holds and db_data still references it),
 		 * then make a copy before we start the write so that
 		 * any modifications from the open txg will not leak
 		 * into this write.
 		 *
 		 * NOTE: this copy does not need to be made for
 		 * objects only modified in the syncing context (e.g.
 		 * DNONE_DNODE blocks).
 		 */
 		int psize = arc_buf_size(*datap);
 		int lsize = arc_buf_lsize(*datap);
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 		enum zio_compress compress_type = arc_get_compression(*datap);
 		uint8_t complevel = arc_get_complevel(*datap);
 
 		if (arc_is_encrypted(*datap)) {
 			boolean_t byteorder;
 			uint8_t salt[ZIO_DATA_SALT_LEN];
 			uint8_t iv[ZIO_DATA_IV_LEN];
 			uint8_t mac[ZIO_DATA_MAC_LEN];
 
 			arc_get_raw_params(*datap, &byteorder, salt, iv, mac);
 			*datap = arc_alloc_raw_buf(os->os_spa, db,
 			    dmu_objset_id(os), byteorder, salt, iv, mac,
 			    dn->dn_type, psize, lsize, compress_type,
 			    complevel);
 		} else if (compress_type != ZIO_COMPRESS_OFF) {
 			ASSERT3U(type, ==, ARC_BUFC_DATA);
 			*datap = arc_alloc_compressed_buf(os->os_spa, db,
 			    psize, lsize, compress_type, complevel);
 		} else {
 			*datap = arc_alloc_buf(os->os_spa, db, type, psize);
 		}
 		memcpy((*datap)->b_data, db->db.db_data, psize);
 	}
 	db->db_data_pending = dr;
 
 	mutex_exit(&db->db_mtx);
 
 	dbuf_write(dr, *datap, tx);
 
 	ASSERT(!list_link_active(&dr->dr_dirty_node));
 	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
 		list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr);
 	} else {
 		zio_nowait(dr->dr_zio);
 	}
 }
 
 /*
  * Syncs out a range of dirty records for indirect or leaf dbufs.  May be
  * called recursively from dbuf_sync_indirect().
  */
 void
 dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
 {
 	dbuf_dirty_record_t *dr;
 
 	while ((dr = list_head(list))) {
 		if (dr->dr_zio != NULL) {
 			/*
 			 * If we find an already initialized zio then we
 			 * are processing the meta-dnode, and we have finished.
 			 * The dbufs for all dnodes are put back on the list
 			 * during processing, so that we can zio_wait()
 			 * these IOs after initiating all child IOs.
 			 */
 			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
 			    DMU_META_DNODE_OBJECT);
 			break;
 		}
 		list_remove(list, dr);
 		if (dr->dr_dbuf == NULL) {
 			dbuf_sync_lightweight(dr, tx);
 		} else {
 			if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
 			    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
 				VERIFY3U(dr->dr_dbuf->db_level, ==, level);
 			}
 			if (dr->dr_dbuf->db_level > 0)
 				dbuf_sync_indirect(dr, tx);
 			else
 				dbuf_sync_leaf(dr, tx);
 		}
 	}
 }
 
 static void
 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	(void) buf;
 	dmu_buf_impl_t *db = vdb;
 	dnode_t *dn;
 	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	spa_t *spa = zio->io_spa;
 	int64_t delta;
 	uint64_t fill = 0;
 	int i;
 
 	ASSERT3P(db->db_blkptr, !=, NULL);
 	ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
 	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
 	zio->io_prev_space_delta = delta;
 
-	if (bp->blk_birth != 0) {
+	if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
 		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
 		    BP_GET_TYPE(bp) == dn->dn_type) ||
 		    (db->db_blkid == DMU_SPILL_BLKID &&
 		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
 		    BP_IS_EMBEDDED(bp));
 		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
 	}
 
 	mutex_enter(&db->db_mtx);
 
 #ifdef ZFS_DEBUG
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
 		ASSERT(!(BP_IS_HOLE(bp)) &&
 		    db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
 	}
 #endif
 
 	if (db->db_level == 0) {
 		mutex_enter(&dn->dn_mtx);
 		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
 		    db->db_blkid != DMU_SPILL_BLKID) {
 			ASSERT0(db->db_objset->os_raw_receive);
 			dn->dn_phys->dn_maxblkid = db->db_blkid;
 		}
 		mutex_exit(&dn->dn_mtx);
 
 		if (dn->dn_type == DMU_OT_DNODE) {
 			i = 0;
 			while (i < db->db.db_size) {
 				dnode_phys_t *dnp =
 				    (void *)(((char *)db->db.db_data) + i);
 
 				i += DNODE_MIN_SIZE;
 				if (dnp->dn_type != DMU_OT_NONE) {
 					fill++;
 					for (int j = 0; j < dnp->dn_nblkptr;
 					    j++) {
 						(void) zfs_blkptr_verify(spa,
 						    &dnp->dn_blkptr[j],
 						    BLK_CONFIG_SKIP,
 						    BLK_VERIFY_HALT);
 					}
 					if (dnp->dn_flags &
 					    DNODE_FLAG_SPILL_BLKPTR) {
 						(void) zfs_blkptr_verify(spa,
 						    DN_SPILL_BLKPTR(dnp),
 						    BLK_CONFIG_SKIP,
 						    BLK_VERIFY_HALT);
 					}
 					i += dnp->dn_extra_slots *
 					    DNODE_MIN_SIZE;
 				}
 			}
 		} else {
 			if (BP_IS_HOLE(bp)) {
 				fill = 0;
 			} else {
 				fill = 1;
 			}
 		}
 	} else {
 		blkptr_t *ibp = db->db.db_data;
 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
 			if (BP_IS_HOLE(ibp))
 				continue;
 			(void) zfs_blkptr_verify(spa, ibp,
 			    BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
 			fill += BP_GET_FILL(ibp);
 		}
 	}
 	DB_DNODE_EXIT(db);
 
 	if (!BP_IS_EMBEDDED(bp))
 		BP_SET_FILL(bp, fill);
 
 	mutex_exit(&db->db_mtx);
 
 	db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, FTAG);
 	*db->db_blkptr = *bp;
 	dmu_buf_unlock_parent(db, dblt, FTAG);
 }
 
 /*
  * This function gets called just prior to running through the compression
  * stage of the zio pipeline. If we're an indirect block comprised of only
  * holes, then we want this indirect to be compressed away to a hole. In
  * order to do that we must zero out any information about the holes that
  * this indirect points to prior to before we try to compress it.
  */
 static void
 dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	(void) zio, (void) buf;
 	dmu_buf_impl_t *db = vdb;
 	dnode_t *dn;
 	blkptr_t *bp;
 	unsigned int epbs, i;
 
 	ASSERT3U(db->db_level, >, 0);
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 	ASSERT3U(epbs, <, 31);
 
 	/* Determine if all our children are holes */
 	for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {
 		if (!BP_IS_HOLE(bp))
 			break;
 	}
 
 	/*
 	 * If all the children are holes, then zero them all out so that
 	 * we may get compressed away.
 	 */
 	if (i == 1ULL << epbs) {
 		/*
 		 * We only found holes. Grab the rwlock to prevent
 		 * anybody from reading the blocks we're about to
 		 * zero out.
 		 */
 		rw_enter(&db->db_rwlock, RW_WRITER);
 		memset(db->db.db_data, 0, db->db.db_size);
 		rw_exit(&db->db_rwlock);
 	}
 	DB_DNODE_EXIT(db);
 }
 
 static void
 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	(void) buf;
 	dmu_buf_impl_t *db = vdb;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	blkptr_t *bp = db->db_blkptr;
 	objset_t *os = db->db_objset;
 	dmu_tx_t *tx = os->os_synctx;
 
 	ASSERT0(zio->io_error);
 	ASSERT(db->db_blkptr == bp);
 
 	/*
 	 * For nopwrites and rewrites we ensure that the bp matches our
 	 * original and bypass all the accounting.
 	 */
 	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
 		ASSERT(BP_EQUAL(bp, bp_orig));
 	} else {
 		dsl_dataset_t *ds = os->os_dsl_dataset;
 		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
 		dsl_dataset_block_born(ds, bp, tx);
 	}
 
 	mutex_enter(&db->db_mtx);
 
 	DBUF_VERIFY(db);
 
 	dbuf_dirty_record_t *dr = db->db_data_pending;
 	dnode_t *dn = dr->dr_dnode;
 	ASSERT(!list_link_active(&dr->dr_dirty_node));
 	ASSERT(dr->dr_dbuf == db);
 	ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
 	list_remove(&db->db_dirty_records, dr);
 
 #ifdef ZFS_DEBUG
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
 		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
 		    db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
 	}
 #endif
 
 	if (db->db_level == 0) {
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
 		if (db->db_state != DB_NOFILL) {
 			if (dr->dt.dl.dr_data != NULL &&
 			    dr->dt.dl.dr_data != db->db_buf) {
 				arc_buf_destroy(dr->dt.dl.dr_data, db);
 			}
 		}
 	} else {
 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
 		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
 		if (!BP_IS_HOLE(db->db_blkptr)) {
 			int epbs __maybe_unused = dn->dn_phys->dn_indblkshift -
 			    SPA_BLKPTRSHIFT;
 			ASSERT3U(db->db_blkid, <=,
 			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
 			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
 			    db->db.db_size);
 		}
 		mutex_destroy(&dr->dt.di.dr_mtx);
 		list_destroy(&dr->dt.di.dr_children);
 	}
 
 	cv_broadcast(&db->db_changed);
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 	db->db_data_pending = NULL;
 	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
 
 	dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
 	    zio->io_txg);
 
 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
 }
 
 static void
 dbuf_write_nofill_ready(zio_t *zio)
 {
 	dbuf_write_ready(zio, NULL, zio->io_private);
 }
 
 static void
 dbuf_write_nofill_done(zio_t *zio)
 {
 	dbuf_write_done(zio, NULL, zio->io_private);
 }
 
 static void
 dbuf_write_override_ready(zio_t *zio)
 {
 	dbuf_dirty_record_t *dr = zio->io_private;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 
 	dbuf_write_ready(zio, NULL, db);
 }
 
 static void
 dbuf_write_override_done(zio_t *zio)
 {
 	dbuf_dirty_record_t *dr = zio->io_private;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
 
 	mutex_enter(&db->db_mtx);
 	if (!BP_EQUAL(zio->io_bp, obp)) {
 		if (!BP_IS_HOLE(obp))
 			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
 		arc_release(dr->dt.dl.dr_data, db);
 	}
 	mutex_exit(&db->db_mtx);
 
 	dbuf_write_done(zio, NULL, db);
 
 	if (zio->io_abd != NULL)
 		abd_free(zio->io_abd);
 }
 
 typedef struct dbuf_remap_impl_callback_arg {
 	objset_t	*drica_os;
 	uint64_t	drica_blk_birth;
 	dmu_tx_t	*drica_tx;
 } dbuf_remap_impl_callback_arg_t;
 
 static void
 dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,
     void *arg)
 {
 	dbuf_remap_impl_callback_arg_t *drica = arg;
 	objset_t *os = drica->drica_os;
 	spa_t *spa = dmu_objset_spa(os);
 	dmu_tx_t *tx = drica->drica_tx;
 
 	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
 
 	if (os == spa_meta_objset(spa)) {
 		spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
 	} else {
 		dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset,
 		    size, drica->drica_blk_birth, tx);
 	}
 }
 
 static void
 dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
 {
 	blkptr_t bp_copy = *bp;
 	spa_t *spa = dmu_objset_spa(dn->dn_objset);
 	dbuf_remap_impl_callback_arg_t drica;
 
 	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
 
 	drica.drica_os = dn->dn_objset;
-	drica.drica_blk_birth = bp->blk_birth;
+	drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp);
 	drica.drica_tx = tx;
 	if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
 	    &drica)) {
 		/*
 		 * If the blkptr being remapped is tracked by a livelist,
 		 * then we need to make sure the livelist reflects the update.
 		 * First, cancel out the old blkptr by appending a 'FREE'
 		 * entry. Next, add an 'ALLOC' to track the new version. This
 		 * way we avoid trying to free an inaccurate blkptr at delete.
 		 * Note that embedded blkptrs are not tracked in livelists.
 		 */
 		if (dn->dn_objset != spa_meta_objset(spa)) {
 			dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
 			if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
-			    bp->blk_birth > ds->ds_dir->dd_origin_txg) {
+			    BP_GET_LOGICAL_BIRTH(bp) >
+			    ds->ds_dir->dd_origin_txg) {
 				ASSERT(!BP_IS_EMBEDDED(bp));
 				ASSERT(dsl_dir_is_clone(ds->ds_dir));
 				ASSERT(spa_feature_is_enabled(spa,
 				    SPA_FEATURE_LIVELIST));
 				bplist_append(&ds->ds_dir->dd_pending_frees,
 				    bp);
 				bplist_append(&ds->ds_dir->dd_pending_allocs,
 				    &bp_copy);
 			}
 		}
 
 		/*
 		 * The db_rwlock prevents dbuf_read_impl() from
 		 * dereferencing the BP while we are changing it.  To
 		 * avoid lock contention, only grab it when we are actually
 		 * changing the BP.
 		 */
 		if (rw != NULL)
 			rw_enter(rw, RW_WRITER);
 		*bp = bp_copy;
 		if (rw != NULL)
 			rw_exit(rw);
 	}
 }
 
 /*
  * Remap any existing BP's to concrete vdevs, if possible.
  */
 static void
 dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_objset_spa(db->db_objset);
 	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
 
 	if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
 		return;
 
 	if (db->db_level > 0) {
 		blkptr_t *bp = db->db.db_data;
 		for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
 			dbuf_remap_impl(dn, &bp[i], &db->db_rwlock, tx);
 		}
 	} else if (db->db.db_object == DMU_META_DNODE_OBJECT) {
 		dnode_phys_t *dnp = db->db.db_data;
 		ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==,
 		    DMU_OT_DNODE);
 		for (int i = 0; i < db->db.db_size >> DNODE_SHIFT;
 		    i += dnp[i].dn_extra_slots + 1) {
 			for (int j = 0; j < dnp[i].dn_nblkptr; j++) {
 				krwlock_t *lock = (dn->dn_dbuf == NULL ? NULL :
 				    &dn->dn_dbuf->db_rwlock);
 				dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], lock,
 				    tx);
 			}
 		}
 	}
 }
 
 
 /*
  * Populate dr->dr_zio with a zio to commit a dirty buffer to disk.
  * Caller is responsible for issuing the zio_[no]wait(dr->dr_zio).
  */
 static void
 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	dnode_t *dn = dr->dr_dnode;
 	objset_t *os;
 	dmu_buf_impl_t *parent = db->db_parent;
 	uint64_t txg = tx->tx_txg;
 	zbookmark_phys_t zb;
 	zio_prop_t zp;
 	zio_t *pio; /* parent I/O */
 	int wp_flag = 0;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	os = dn->dn_objset;
 
 	if (db->db_state != DB_NOFILL) {
 		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
 			/*
 			 * Private object buffers are released here rather
 			 * than in dbuf_dirty() since they are only modified
 			 * in the syncing context and we don't want the
 			 * overhead of making multiple copies of the data.
 			 */
 			if (BP_IS_HOLE(db->db_blkptr)) {
 				arc_buf_thaw(data);
 			} else {
 				dbuf_release_bp(db);
 			}
 			dbuf_remap(dn, db, tx);
 		}
 	}
 
 	if (parent != dn->dn_dbuf) {
 		/* Our parent is an indirect block. */
 		/* We have a dirty parent that has been scheduled for write. */
 		ASSERT(parent && parent->db_data_pending);
 		/* Our parent's buffer is one level closer to the dnode. */
 		ASSERT(db->db_level == parent->db_level-1);
 		/*
 		 * We're about to modify our parent's db_data by modifying
 		 * our block pointer, so the parent must be released.
 		 */
 		ASSERT(arc_released(parent->db_buf));
 		pio = parent->db_data_pending->dr_zio;
 	} else {
 		/* Our parent is the dnode itself. */
 		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
 		    db->db_blkid != DMU_SPILL_BLKID) ||
 		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
 		if (db->db_blkid != DMU_SPILL_BLKID)
 			ASSERT3P(db->db_blkptr, ==,
 			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
 		pio = dn->dn_zio;
 	}
 
 	ASSERT(db->db_level == 0 || data == db->db_buf);
-	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+	ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg);
 	ASSERT(pio);
 
 	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
 	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
 	if (db->db_blkid == DMU_SPILL_BLKID)
 		wp_flag = WP_SPILL;
 	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
 
 	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
 
 	/*
 	 * We copy the blkptr now (rather than when we instantiate the dirty
 	 * record), because its value can change between open context and
 	 * syncing context. We do not need to hold dn_struct_rwlock to read
 	 * db_blkptr because we are in syncing context.
 	 */
 	dr->dr_bp_copy = *db->db_blkptr;
 
 	if (db->db_level == 0 &&
 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
 		/*
 		 * The BP for this block has been provided by open context
 		 * (by dmu_sync() or dmu_buf_write_embedded()).
 		 */
 		abd_t *contents = (data != NULL) ?
 		    abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
 
 		dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,
 		    contents, db->db.db_size, db->db.db_size, &zp,
 		    dbuf_write_override_ready, NULL,
 		    dbuf_write_override_done,
 		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 		mutex_enter(&db->db_mtx);
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
 		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
 		    dr->dt.dl.dr_brtwrite);
 		mutex_exit(&db->db_mtx);
 	} else if (db->db_state == DB_NOFILL) {
 		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
 		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
 		dr->dr_zio = zio_write(pio, os->os_spa, txg,
 		    &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
 		    dbuf_write_nofill_ready, NULL,
 		    dbuf_write_nofill_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE,
 		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
 	} else {
 		ASSERT(arc_released(data));
 
 		/*
 		 * For indirect blocks, we want to setup the children
 		 * ready callback so that we can properly handle an indirect
 		 * block that only contains holes.
 		 */
 		arc_write_done_func_t *children_ready_cb = NULL;
 		if (db->db_level != 0)
 			children_ready_cb = dbuf_write_children_ready;
 
 		dr->dr_zio = arc_write(pio, os->os_spa, txg,
 		    &dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
 		    dbuf_is_l2cacheable(db), &zp, dbuf_write_ready,
 		    children_ready_cb, dbuf_write_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 	}
 }
 
 EXPORT_SYMBOL(dbuf_find);
 EXPORT_SYMBOL(dbuf_is_metadata);
 EXPORT_SYMBOL(dbuf_destroy);
 EXPORT_SYMBOL(dbuf_loan_arcbuf);
 EXPORT_SYMBOL(dbuf_whichblock);
 EXPORT_SYMBOL(dbuf_read);
 EXPORT_SYMBOL(dbuf_unoverride);
 EXPORT_SYMBOL(dbuf_free_range);
 EXPORT_SYMBOL(dbuf_new_size);
 EXPORT_SYMBOL(dbuf_release_bp);
 EXPORT_SYMBOL(dbuf_dirty);
 EXPORT_SYMBOL(dmu_buf_set_crypt_params);
 EXPORT_SYMBOL(dmu_buf_will_dirty);
 EXPORT_SYMBOL(dmu_buf_is_dirty);
 EXPORT_SYMBOL(dmu_buf_will_clone);
 EXPORT_SYMBOL(dmu_buf_will_not_fill);
 EXPORT_SYMBOL(dmu_buf_will_fill);
 EXPORT_SYMBOL(dmu_buf_fill_done);
 EXPORT_SYMBOL(dmu_buf_rele);
 EXPORT_SYMBOL(dbuf_assign_arcbuf);
 EXPORT_SYMBOL(dbuf_prefetch);
 EXPORT_SYMBOL(dbuf_hold_impl);
 EXPORT_SYMBOL(dbuf_hold);
 EXPORT_SYMBOL(dbuf_hold_level);
 EXPORT_SYMBOL(dbuf_create_bonus);
 EXPORT_SYMBOL(dbuf_spill_set_blksz);
 EXPORT_SYMBOL(dbuf_rm_spill);
 EXPORT_SYMBOL(dbuf_add_ref);
 EXPORT_SYMBOL(dbuf_rele);
 EXPORT_SYMBOL(dbuf_rele_and_unlock);
 EXPORT_SYMBOL(dbuf_refcount);
 EXPORT_SYMBOL(dbuf_sync_list);
 EXPORT_SYMBOL(dmu_buf_set_user);
 EXPORT_SYMBOL(dmu_buf_set_user_ie);
 EXPORT_SYMBOL(dmu_buf_get_user);
 EXPORT_SYMBOL(dmu_buf_get_blkptr);
 
 ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, U64, ZMOD_RW,
 	"Maximum size in bytes of the dbuf cache.");
 
 ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW,
 	"Percentage over dbuf_cache_max_bytes for direct dbuf eviction.");
 
 ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW,
 	"Percentage below dbuf_cache_max_bytes when dbuf eviction stops.");
 
 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, U64, ZMOD_RW,
 	"Maximum size in bytes of dbuf metadata cache.");
 
 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, UINT, ZMOD_RW,
 	"Set size of dbuf cache to log2 fraction of arc size.");
 
 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW,
 	"Set size of dbuf metadata cache to log2 fraction of arc size.");
 
 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
 	"Set size of dbuf cache mutex array as log2 shift.");
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index de8640e58a2c..4c53cb0a2f9b 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -1,1170 +1,1170 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  * Copyright (c) 2022 by Pawel Jakub Dawidek
  * Copyright (c) 2023, Klara Inc.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/ddt.h>
 #include <sys/ddt_impl.h>
 #include <sys/zap.h>
 #include <sys/dmu_tx.h>
 #include <sys/arc.h>
 #include <sys/dsl_pool.h>
 #include <sys/zio_checksum.h>
 #include <sys/dsl_scan.h>
 #include <sys/abd.h>
 
 /*
  * # DDT: Deduplication tables
  *
  * The dedup subsystem provides block-level deduplication. When enabled, blocks
  * to be written will have the dedup (D) bit set, which causes them to be
  * tracked in a "dedup table", or DDT. If a block has been seen before (exists
  * in the DDT), instead of being written, it will instead be made to reference
  * the existing on-disk data, and a refcount bumped in the DDT instead.
  *
  * ## Dedup tables and entries
  *
  * Conceptually, a DDT is a dictionary or map. Each entry has a "key"
  * (ddt_key_t) made up a block's checksum and certian properties, and a "value"
  * (one or more ddt_phys_t) containing valid DVAs for the block's data, birth
  * time and refcount. Together these are enough to track references to a
  * specific block, to build a valid block pointer to reference that block (for
  * freeing, scrubbing, etc), and to fill a new block pointer with the missing
  * pieces to make it seem like it was written.
  *
  * There's a single DDT (ddt_t) for each checksum type, held in spa_ddt[].
  * Within each DDT, there can be multiple storage "types" (ddt_type_t, on-disk
  * object data formats, each with their own implementations) and "classes"
  * (ddt_class_t, instance of a storage type object, for entries with a specific
  * characteristic). An entry (key) will only ever exist on one of these objects
  * at any given time, but may be moved from one to another if their type or
  * class changes.
  *
  * The DDT is driven by the write IO pipeline (zio_ddt_write()). When a block
  * is to be written, before DVAs have been allocated, ddt_lookup() is called to
  * see if the block has been seen before. If its not found, the write proceeds
  * as normal, and after it succeeds, a new entry is created. If it is found, we
  * fill the BP with the DVAs from the entry, increment the refcount and cause
  * the write IO to return immediately.
  *
  * Each ddt_phys_t slot in the entry represents a separate dedup block for the
  * same content/checksum. The slot is selected based on the zp_copies parameter
  * the block is written with, that is, the number of DVAs in the block. The
  * "ditto" slot (DDT_PHYS_DITTO) used to be used for now-removed "dedupditto"
  * feature. These are no longer written, and will be freed if encountered on
  * old pools.
  *
  * ## Lifetime of an entry
  *
  * A DDT can be enormous, and typically is not held in memory all at once.
  * Instead, the changes to an entry are tracked in memory, and written down to
  * disk at the end of each txg.
  *
  * A "live" in-memory entry (ddt_entry_t) is a node on the live tree
  * (ddt_tree).  At the start of a txg, ddt_tree is empty. When an entry is
  * required for IO, ddt_lookup() is called. If an entry already exists on
  * ddt_tree, it is returned. Otherwise, a new one is created, and the
  * type/class objects for the DDT are searched for that key. If its found, its
  * value is copied into the live entry. If not, an empty entry is created.
  *
  * The live entry will be modified during the txg, usually by modifying the
  * refcount, but sometimes by adding or updating DVAs. At the end of the txg
  * (during spa_sync()), type and class are recalculated for entry (see
  * ddt_sync_entry()), and the entry is written to the appropriate storage
  * object and (if necessary), removed from an old one. ddt_tree is cleared and
  * the next txg can start.
  *
  * ## Repair IO
  *
  * If a read on a dedup block fails, but there are other copies of the block in
  * the other ddt_phys_t slots, reads will be issued for those instead
  * (zio_ddt_read_start()). If one of those succeeds, the read is returned to
  * the caller, and a copy is stashed on the entry's dde_repair_abd.
  *
  * During the end-of-txg sync, any entries with a dde_repair_abd get a
  * "rewrite" write issued for the original block pointer, with the data read
  * from the alternate block. If the block is actually damaged, this will invoke
  * the pool's "self-healing" mechanism, and repair the block.
  *
  * ## Scanning (scrub/resilver)
  *
  * If dedup is active, the scrub machinery will walk the dedup table first, and
  * scrub all blocks with refcnt > 1 first. After that it will move on to the
  * regular top-down scrub, and exclude the refcnt > 1 blocks when it sees them.
  * In this way, heavily deduplicated blocks are only scrubbed once. See the
  * commentary on dsl_scan_ddt() for more details.
  *
  * Walking the DDT is done via ddt_walk(). The current position is stored in a
  * ddt_bookmark_t, which represents a stable position in the storage object.
  * This bookmark is stored by the scan machinery, and must reference the same
  * position on the object even if the object changes, the pool is exported, or
  * OpenZFS is upgraded.
  *
  * ## Interaction with block cloning
  *
  * If block cloning and dedup are both enabled on a pool, BRT will look for the
  * dedup bit on an incoming block pointer. If set, it will call into the DDT
  * (ddt_addref()) to add a reference to the block, instead of adding a
  * reference to the BRT. See brt_pending_apply().
  */
 
 /*
  * These are the only checksums valid for dedup. They must match the list
  * from dedup_table in zfs_prop.c
  */
 #define	DDT_CHECKSUM_VALID(c)	\
 	(c == ZIO_CHECKSUM_SHA256 || c == ZIO_CHECKSUM_SHA512 || \
 	c == ZIO_CHECKSUM_SKEIN || c == ZIO_CHECKSUM_EDONR || \
 	c == ZIO_CHECKSUM_BLAKE3)
 
 static kmem_cache_t *ddt_cache;
 static kmem_cache_t *ddt_entry_cache;
 
 /*
  * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
  */
 int zfs_dedup_prefetch = 0;
 
 static const ddt_ops_t *const ddt_ops[DDT_TYPES] = {
 	&ddt_zap_ops,
 };
 
 static const char *const ddt_class_name[DDT_CLASSES] = {
 	"ditto",
 	"duplicate",
 	"unique",
 };
 
 static void
 ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
     dmu_tx_t *tx)
 {
 	spa_t *spa = ddt->ddt_spa;
 	objset_t *os = ddt->ddt_os;
 	uint64_t *objectp = &ddt->ddt_object[type][class];
 	boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
 	    ZCHECKSUM_FLAG_DEDUP;
 	char name[DDT_NAMELEN];
 
 	ddt_object_name(ddt, type, class, name);
 
 	ASSERT3U(*objectp, ==, 0);
 	VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash));
 	ASSERT3U(*objectp, !=, 0);
 
 	VERIFY0(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
 	    sizeof (uint64_t), 1, objectp, tx));
 
 	VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name,
 	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 	    &ddt->ddt_histogram[type][class], tx));
 }
 
 static void
 ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
     dmu_tx_t *tx)
 {
 	spa_t *spa = ddt->ddt_spa;
 	objset_t *os = ddt->ddt_os;
 	uint64_t *objectp = &ddt->ddt_object[type][class];
 	uint64_t count;
 	char name[DDT_NAMELEN];
 
 	ddt_object_name(ddt, type, class, name);
 
 	ASSERT3U(*objectp, !=, 0);
 	ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
 	VERIFY0(ddt_object_count(ddt, type, class, &count));
 	VERIFY0(count);
 	VERIFY0(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx));
 	VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx));
 	VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx));
 	memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t));
 
 	*objectp = 0;
 }
 
 static int
 ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
 {
 	ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
 	dmu_object_info_t doi;
 	uint64_t count;
 	char name[DDT_NAMELEN];
 	int error;
 
 	ddt_object_name(ddt, type, class, name);
 
 	error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
 	    sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
 	if (error != 0)
 		return (error);
 
 	error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 	    &ddt->ddt_histogram[type][class]);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Seed the cached statistics.
 	 */
 	error = ddt_object_info(ddt, type, class, &doi);
 	if (error)
 		return (error);
 
 	error = ddt_object_count(ddt, type, class, &count);
 	if (error)
 		return (error);
 
 	ddo->ddo_count = count;
 	ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
 	ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 
 	return (0);
 }
 
 static void
 ddt_object_sync(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
     dmu_tx_t *tx)
 {
 	ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
 	dmu_object_info_t doi;
 	uint64_t count;
 	char name[DDT_NAMELEN];
 
 	ddt_object_name(ddt, type, class, name);
 
 	VERIFY0(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 	    &ddt->ddt_histogram[type][class], tx));
 
 	/*
 	 * Cache DDT statistics; this is the only time they'll change.
 	 */
 	VERIFY0(ddt_object_info(ddt, type, class, &doi));
 	VERIFY0(ddt_object_count(ddt, type, class, &count));
 
 	ddo->ddo_count = count;
 	ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
 	ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 }
 
 static boolean_t
 ddt_object_exists(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
 {
 	return (!!ddt->ddt_object[type][class]);
 }
 
 static int
 ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
     ddt_entry_t *dde)
 {
 	if (!ddt_object_exists(ddt, type, class))
 		return (SET_ERROR(ENOENT));
 
 	return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
 	    ddt->ddt_object[type][class], &dde->dde_key,
 	    dde->dde_phys, sizeof (dde->dde_phys)));
 }
 
 static int
 ddt_object_contains(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
     const ddt_key_t *ddk)
 {
 	if (!ddt_object_exists(ddt, type, class))
 		return (SET_ERROR(ENOENT));
 
 	return (ddt_ops[type]->ddt_op_contains(ddt->ddt_os,
 	    ddt->ddt_object[type][class], ddk));
 }
 
 static void
 ddt_object_prefetch(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
     const ddt_key_t *ddk)
 {
 	if (!ddt_object_exists(ddt, type, class))
 		return;
 
 	ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
 	    ddt->ddt_object[type][class], ddk);
 }
 
 static int
 ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
     ddt_entry_t *dde, dmu_tx_t *tx)
 {
 	ASSERT(ddt_object_exists(ddt, type, class));
 
 	return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
 	    ddt->ddt_object[type][class], &dde->dde_key, dde->dde_phys,
 	    sizeof (dde->dde_phys), tx));
 }
 
 static int
 ddt_object_remove(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
     const ddt_key_t *ddk, dmu_tx_t *tx)
 {
 	ASSERT(ddt_object_exists(ddt, type, class));
 
 	return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
 	    ddt->ddt_object[type][class], ddk, tx));
 }
 
 int
 ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
     uint64_t *walk, ddt_entry_t *dde)
 {
 	ASSERT(ddt_object_exists(ddt, type, class));
 
 	return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
 	    ddt->ddt_object[type][class], walk, &dde->dde_key,
 	    dde->dde_phys, sizeof (dde->dde_phys)));
 }
 
 int
 ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
     uint64_t *count)
 {
 	ASSERT(ddt_object_exists(ddt, type, class));
 
 	return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
 	    ddt->ddt_object[type][class], count));
 }
 
 int
 ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
     dmu_object_info_t *doi)
 {
 	if (!ddt_object_exists(ddt, type, class))
 		return (SET_ERROR(ENOENT));
 
 	return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
 	    doi));
 }
 
 void
 ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
     char *name)
 {
 	(void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT,
 	    zio_checksum_table[ddt->ddt_checksum].ci_name,
 	    ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
 }
 
 void
 ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
 {
 	ASSERT3U(txg, !=, 0);
 
 	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
 		bp->blk_dva[d] = ddp->ddp_dva[d];
 	BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
 }
 
 /*
  * The bp created via this function may be used for repairs and scrub, but it
  * will be missing the salt / IV required to do a full decrypting read.
  */
 void
 ddt_bp_create(enum zio_checksum checksum,
     const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
 {
 	BP_ZERO(bp);
 
 	if (ddp != NULL)
 		ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
 
 	bp->blk_cksum = ddk->ddk_cksum;
 
 	BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
 	BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
 	BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
 	BP_SET_CRYPT(bp, DDK_GET_CRYPT(ddk));
 	BP_SET_FILL(bp, 1);
 	BP_SET_CHECKSUM(bp, checksum);
 	BP_SET_TYPE(bp, DMU_OT_DEDUP);
 	BP_SET_LEVEL(bp, 0);
 	BP_SET_DEDUP(bp, 1);
 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 }
 
 void
 ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
 {
 	ddk->ddk_cksum = bp->blk_cksum;
 	ddk->ddk_prop = 0;
 
 	ASSERT(BP_IS_ENCRYPTED(bp) || !BP_USES_CRYPT(bp));
 
 	DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
 	DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
 	DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
 	DDK_SET_CRYPT(ddk, BP_USES_CRYPT(bp));
 }
 
 void
 ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
 {
 	ASSERT0(ddp->ddp_phys_birth);
 
 	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
 		ddp->ddp_dva[d] = bp->blk_dva[d];
-	ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
+	ddp->ddp_phys_birth = BP_GET_BIRTH(bp);
 }
 
 void
 ddt_phys_clear(ddt_phys_t *ddp)
 {
 	memset(ddp, 0, sizeof (*ddp));
 }
 
 void
 ddt_phys_addref(ddt_phys_t *ddp)
 {
 	ddp->ddp_refcnt++;
 }
 
 void
 ddt_phys_decref(ddt_phys_t *ddp)
 {
 	if (ddp) {
 		ASSERT3U(ddp->ddp_refcnt, >, 0);
 		ddp->ddp_refcnt--;
 	}
 }
 
 static void
 ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
 {
 	blkptr_t blk;
 
 	ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
 
 	/*
 	 * We clear the dedup bit so that zio_free() will actually free the
 	 * space, rather than just decrementing the refcount in the DDT.
 	 */
 	BP_SET_DEDUP(&blk, 0);
 
 	ddt_phys_clear(ddp);
 	zio_free(ddt->ddt_spa, txg, &blk);
 }
 
 ddt_phys_t *
 ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
 {
 	ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
 
 	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 		if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
-		    BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
+		    BP_GET_BIRTH(bp) == ddp->ddp_phys_birth)
 			return (ddp);
 	}
 	return (NULL);
 }
 
 uint64_t
 ddt_phys_total_refcnt(const ddt_entry_t *dde)
 {
 	uint64_t refcnt = 0;
 
 	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
 		refcnt += dde->dde_phys[p].ddp_refcnt;
 
 	return (refcnt);
 }
 
 ddt_t *
 ddt_select(spa_t *spa, const blkptr_t *bp)
 {
 	ASSERT(DDT_CHECKSUM_VALID(BP_GET_CHECKSUM(bp)));
 	return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
 }
 
 void
 ddt_enter(ddt_t *ddt)
 {
 	mutex_enter(&ddt->ddt_lock);
 }
 
 void
 ddt_exit(ddt_t *ddt)
 {
 	mutex_exit(&ddt->ddt_lock);
 }
 
 void
 ddt_init(void)
 {
 	ddt_cache = kmem_cache_create("ddt_cache",
 	    sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 	ddt_entry_cache = kmem_cache_create("ddt_entry_cache",
 	    sizeof (ddt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 }
 
 void
 ddt_fini(void)
 {
 	kmem_cache_destroy(ddt_entry_cache);
 	kmem_cache_destroy(ddt_cache);
 }
 
 static ddt_entry_t *
 ddt_alloc(const ddt_key_t *ddk)
 {
 	ddt_entry_t *dde;
 
 	dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP);
 	memset(dde, 0, sizeof (ddt_entry_t));
 	cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
 
 	dde->dde_key = *ddk;
 
 	return (dde);
 }
 
 static void
 ddt_free(ddt_entry_t *dde)
 {
 	ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
 
 	for (int p = 0; p < DDT_PHYS_TYPES; p++)
 		ASSERT3P(dde->dde_lead_zio[p], ==, NULL);
 
 	if (dde->dde_repair_abd != NULL)
 		abd_free(dde->dde_repair_abd);
 
 	cv_destroy(&dde->dde_cv);
 	kmem_cache_free(ddt_entry_cache, dde);
 }
 
 void
 ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
 {
 	ASSERT(MUTEX_HELD(&ddt->ddt_lock));
 
 	avl_remove(&ddt->ddt_tree, dde);
 	ddt_free(dde);
 }
 
 ddt_entry_t *
 ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
 {
 	ddt_key_t search;
 	ddt_entry_t *dde;
 	ddt_type_t type;
 	ddt_class_t class;
 	avl_index_t where;
 	int error;
 
 	ASSERT(MUTEX_HELD(&ddt->ddt_lock));
 
 	ddt_key_fill(&search, bp);
 
 	/* Find an existing live entry */
 	dde = avl_find(&ddt->ddt_tree, &search, &where);
 	if (dde != NULL) {
 		/* Found it. If it's already loaded, we can just return it. */
 		if (dde->dde_flags & DDE_FLAG_LOADED)
 			return (dde);
 
 		/* Someone else is loading it, wait for it. */
 		while (!(dde->dde_flags & DDE_FLAG_LOADED))
 			cv_wait(&dde->dde_cv, &ddt->ddt_lock);
 
 		return (dde);
 	}
 
 	/* Not found. */
 	if (!add)
 		return (NULL);
 
 	/* Time to make a new entry. */
 	dde = ddt_alloc(&search);
 	avl_insert(&ddt->ddt_tree, dde, where);
 
 	/*
 	 * ddt_tree is now stable, so unlock and let everyone else keep moving.
 	 * Anyone landing on this entry will find it without DDE_FLAG_LOADED,
 	 * and go to sleep waiting for it above.
 	 */
 	ddt_exit(ddt);
 
 	/* Search all store objects for the entry. */
 	error = ENOENT;
 	for (type = 0; type < DDT_TYPES; type++) {
 		for (class = 0; class < DDT_CLASSES; class++) {
 			error = ddt_object_lookup(ddt, type, class, dde);
 			if (error != ENOENT) {
 				ASSERT0(error);
 				break;
 			}
 		}
 		if (error != ENOENT)
 			break;
 	}
 
 	ddt_enter(ddt);
 
 	ASSERT(!(dde->dde_flags & DDE_FLAG_LOADED));
 
 	dde->dde_type = type;	/* will be DDT_TYPES if no entry found */
 	dde->dde_class = class;	/* will be DDT_CLASSES if no entry found */
 
 	if (error == 0)
 		ddt_stat_update(ddt, dde, -1ULL);
 
 	/* Entry loaded, everyone can proceed now */
 	dde->dde_flags |= DDE_FLAG_LOADED;
 	cv_broadcast(&dde->dde_cv);
 
 	return (dde);
 }
 
 void
 ddt_prefetch(spa_t *spa, const blkptr_t *bp)
 {
 	ddt_t *ddt;
 	ddt_key_t ddk;
 
 	if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
 		return;
 
 	/*
 	 * We only remove the DDT once all tables are empty and only
 	 * prefetch dedup blocks when there are entries in the DDT.
 	 * Thus no locking is required as the DDT can't disappear on us.
 	 */
 	ddt = ddt_select(spa, bp);
 	ddt_key_fill(&ddk, bp);
 
 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
 			ddt_object_prefetch(ddt, type, class, &ddk);
 		}
 	}
 }
 
 /*
  * Key comparison. Any struct wanting to make use of this function must have
  * the key as the first element.
  */
 #define	DDT_KEY_CMP_LEN	(sizeof (ddt_key_t) / sizeof (uint16_t))
 
 typedef struct ddt_key_cmp {
 	uint16_t	u16[DDT_KEY_CMP_LEN];
 } ddt_key_cmp_t;
 
 int
 ddt_key_compare(const void *x1, const void *x2)
 {
 	const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)x1;
 	const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)x2;
 	int32_t cmp = 0;
 
 	for (int i = 0; i < DDT_KEY_CMP_LEN; i++) {
 		cmp = (int32_t)k1->u16[i] - (int32_t)k2->u16[i];
 		if (likely(cmp))
 			break;
 	}
 
 	return (TREE_ISIGN(cmp));
 }
 
 static ddt_t *
 ddt_table_alloc(spa_t *spa, enum zio_checksum c)
 {
 	ddt_t *ddt;
 
 	ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP);
 	memset(ddt, 0, sizeof (ddt_t));
 
 	mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&ddt->ddt_tree, ddt_key_compare,
 	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 	avl_create(&ddt->ddt_repair_tree, ddt_key_compare,
 	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 	ddt->ddt_checksum = c;
 	ddt->ddt_spa = spa;
 	ddt->ddt_os = spa->spa_meta_objset;
 
 	return (ddt);
 }
 
 static void
 ddt_table_free(ddt_t *ddt)
 {
 	ASSERT0(avl_numnodes(&ddt->ddt_tree));
 	ASSERT0(avl_numnodes(&ddt->ddt_repair_tree));
 	avl_destroy(&ddt->ddt_tree);
 	avl_destroy(&ddt->ddt_repair_tree);
 	mutex_destroy(&ddt->ddt_lock);
 	kmem_cache_free(ddt_cache, ddt);
 }
 
 void
 ddt_create(spa_t *spa)
 {
 	spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
 
 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 		if (DDT_CHECKSUM_VALID(c))
 			spa->spa_ddt[c] = ddt_table_alloc(spa, c);
 	}
 }
 
 int
 ddt_load(spa_t *spa)
 {
 	int error;
 
 	ddt_create(spa);
 
 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
 	    &spa->spa_ddt_stat_object);
 
 	if (error)
 		return (error == ENOENT ? 0 : error);
 
 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 		if (!DDT_CHECKSUM_VALID(c))
 			continue;
 
 		ddt_t *ddt = spa->spa_ddt[c];
 		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
 			for (ddt_class_t class = 0; class < DDT_CLASSES;
 			    class++) {
 				error = ddt_object_load(ddt, type, class);
 				if (error != 0 && error != ENOENT)
 					return (error);
 			}
 		}
 
 		/*
 		 * Seed the cached histograms.
 		 */
 		memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
 		    sizeof (ddt->ddt_histogram));
 		spa->spa_dedup_dspace = ~0ULL;
 	}
 
 	return (0);
 }
 
 void
 ddt_unload(spa_t *spa)
 {
 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 		if (spa->spa_ddt[c]) {
 			ddt_table_free(spa->spa_ddt[c]);
 			spa->spa_ddt[c] = NULL;
 		}
 	}
 }
 
 boolean_t
 ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp)
 {
 	ddt_t *ddt;
 	ddt_key_t ddk;
 
 	if (!BP_GET_DEDUP(bp))
 		return (B_FALSE);
 
 	if (max_class == DDT_CLASS_UNIQUE)
 		return (B_TRUE);
 
 	ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
 
 	ddt_key_fill(&ddk, bp);
 
 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
 		for (ddt_class_t class = 0; class <= max_class; class++) {
 			if (ddt_object_contains(ddt, type, class, &ddk) == 0)
 				return (B_TRUE);
 		}
 	}
 
 	return (B_FALSE);
 }
 
 ddt_entry_t *
 ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
 {
 	ddt_key_t ddk;
 	ddt_entry_t *dde;
 
 	ddt_key_fill(&ddk, bp);
 
 	dde = ddt_alloc(&ddk);
 
 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
 			/*
 			 * We can only do repair if there are multiple copies
 			 * of the block.  For anything in the UNIQUE class,
 			 * there's definitely only one copy, so don't even try.
 			 */
 			if (class != DDT_CLASS_UNIQUE &&
 			    ddt_object_lookup(ddt, type, class, dde) == 0)
 				return (dde);
 		}
 	}
 
 	memset(dde->dde_phys, 0, sizeof (dde->dde_phys));
 
 	return (dde);
 }
 
 void
 ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
 {
 	avl_index_t where;
 
 	ddt_enter(ddt);
 
 	if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
 	    avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
 		avl_insert(&ddt->ddt_repair_tree, dde, where);
 	else
 		ddt_free(dde);
 
 	ddt_exit(ddt);
 }
 
 static void
 ddt_repair_entry_done(zio_t *zio)
 {
 	ddt_entry_t *rdde = zio->io_private;
 
 	ddt_free(rdde);
 }
 
 static void
 ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
 {
 	ddt_phys_t *ddp = dde->dde_phys;
 	ddt_phys_t *rddp = rdde->dde_phys;
 	ddt_key_t *ddk = &dde->dde_key;
 	ddt_key_t *rddk = &rdde->dde_key;
 	zio_t *zio;
 	blkptr_t blk;
 
 	zio = zio_null(rio, rio->io_spa, NULL,
 	    ddt_repair_entry_done, rdde, rio->io_flags);
 
 	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
 		if (ddp->ddp_phys_birth == 0 ||
 		    ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
 		    memcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
 			continue;
 		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
 		zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
 		    rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
 		    ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
 	}
 
 	zio_nowait(zio);
 }
 
 static void
 ddt_repair_table(ddt_t *ddt, zio_t *rio)
 {
 	spa_t *spa = ddt->ddt_spa;
 	ddt_entry_t *dde, *rdde_next, *rdde;
 	avl_tree_t *t = &ddt->ddt_repair_tree;
 	blkptr_t blk;
 
 	if (spa_sync_pass(spa) > 1)
 		return;
 
 	ddt_enter(ddt);
 	for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
 		rdde_next = AVL_NEXT(t, rdde);
 		avl_remove(&ddt->ddt_repair_tree, rdde);
 		ddt_exit(ddt);
 		ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
 		dde = ddt_repair_start(ddt, &blk);
 		ddt_repair_entry(ddt, dde, rdde, rio);
 		ddt_repair_done(ddt, dde);
 		ddt_enter(ddt);
 	}
 	ddt_exit(ddt);
 }
 
 static void
 ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
 {
 	dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
 	ddt_phys_t *ddp = dde->dde_phys;
 	ddt_key_t *ddk = &dde->dde_key;
 	ddt_type_t otype = dde->dde_type;
 	ddt_type_t ntype = DDT_TYPE_DEFAULT;
 	ddt_class_t oclass = dde->dde_class;
 	ddt_class_t nclass;
 	uint64_t total_refcnt = 0;
 
 	ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
 
 	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 		ASSERT3P(dde->dde_lead_zio[p], ==, NULL);
 		if (ddp->ddp_phys_birth == 0) {
 			ASSERT0(ddp->ddp_refcnt);
 			continue;
 		}
 		if (p == DDT_PHYS_DITTO) {
 			/*
 			 * Note, we no longer create DDT-DITTO blocks, but we
 			 * don't want to leak any written by older software.
 			 */
 			ddt_phys_free(ddt, ddk, ddp, txg);
 			continue;
 		}
 		if (ddp->ddp_refcnt == 0)
 			ddt_phys_free(ddt, ddk, ddp, txg);
 		total_refcnt += ddp->ddp_refcnt;
 	}
 
 	/* We do not create new DDT-DITTO blocks. */
 	ASSERT0(dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth);
 	if (total_refcnt > 1)
 		nclass = DDT_CLASS_DUPLICATE;
 	else
 		nclass = DDT_CLASS_UNIQUE;
 
 	if (otype != DDT_TYPES &&
 	    (otype != ntype || oclass != nclass || total_refcnt == 0)) {
 		VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx));
 		ASSERT3U(
 		    ddt_object_contains(ddt, otype, oclass, ddk), ==, ENOENT);
 	}
 
 	if (total_refcnt != 0) {
 		dde->dde_type = ntype;
 		dde->dde_class = nclass;
 		ddt_stat_update(ddt, dde, 0);
 		if (!ddt_object_exists(ddt, ntype, nclass))
 			ddt_object_create(ddt, ntype, nclass, tx);
 		VERIFY0(ddt_object_update(ddt, ntype, nclass, dde, tx));
 
 		/*
 		 * If the class changes, the order that we scan this bp
 		 * changes.  If it decreases, we could miss it, so
 		 * scan it right now.  (This covers both class changing
 		 * while we are doing ddt_walk(), and when we are
 		 * traversing.)
 		 */
 		if (nclass < oclass) {
 			dsl_scan_ddt_entry(dp->dp_scan,
 			    ddt->ddt_checksum, dde, tx);
 		}
 	}
 }
 
 static void
 ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
 {
 	spa_t *spa = ddt->ddt_spa;
 	ddt_entry_t *dde;
 	void *cookie = NULL;
 
 	if (avl_numnodes(&ddt->ddt_tree) == 0)
 		return;
 
 	ASSERT3U(spa->spa_uberblock.ub_version, >=, SPA_VERSION_DEDUP);
 
 	if (spa->spa_ddt_stat_object == 0) {
 		spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
 		    DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_DDT_STATS, tx);
 	}
 
 	while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
 		ddt_sync_entry(ddt, dde, tx, txg);
 		ddt_free(dde);
 	}
 
 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
 		uint64_t add, count = 0;
 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
 			if (ddt_object_exists(ddt, type, class)) {
 				ddt_object_sync(ddt, type, class, tx);
 				VERIFY0(ddt_object_count(ddt, type, class,
 				    &add));
 				count += add;
 			}
 		}
 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
 			if (count == 0 && ddt_object_exists(ddt, type, class))
 				ddt_object_destroy(ddt, type, class, tx);
 		}
 	}
 
 	memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
 	    sizeof (ddt->ddt_histogram));
 	spa->spa_dedup_dspace = ~0ULL;
 }
 
 void
 ddt_sync(spa_t *spa, uint64_t txg)
 {
 	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 	dmu_tx_t *tx;
 	zio_t *rio;
 
 	ASSERT3U(spa_syncing_txg(spa), ==, txg);
 
 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 
 	rio = zio_root(spa, NULL, NULL,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
 
 	/*
 	 * This function may cause an immediate scan of ddt blocks (see
 	 * the comment above dsl_scan_ddt() for details). We set the
 	 * scan's root zio here so that we can wait for any scan IOs in
 	 * addition to the regular ddt IOs.
 	 */
 	ASSERT3P(scn->scn_zio_root, ==, NULL);
 	scn->scn_zio_root = rio;
 
 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 		ddt_t *ddt = spa->spa_ddt[c];
 		if (ddt == NULL)
 			continue;
 		ddt_sync_table(ddt, tx, txg);
 		ddt_repair_table(ddt, rio);
 	}
 
 	(void) zio_wait(rio);
 	scn->scn_zio_root = NULL;
 
 	dmu_tx_commit(tx);
 }
 
 int
 ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
 {
 	do {
 		do {
 			do {
 				ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
 				if (ddt == NULL)
 					continue;
 				int error = ENOENT;
 				if (ddt_object_exists(ddt, ddb->ddb_type,
 				    ddb->ddb_class)) {
 					error = ddt_object_walk(ddt,
 					    ddb->ddb_type, ddb->ddb_class,
 					    &ddb->ddb_cursor, dde);
 				}
 				dde->dde_type = ddb->ddb_type;
 				dde->dde_class = ddb->ddb_class;
 				if (error == 0)
 					return (0);
 				if (error != ENOENT)
 					return (error);
 				ddb->ddb_cursor = 0;
 			} while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
 			ddb->ddb_checksum = 0;
 		} while (++ddb->ddb_type < DDT_TYPES);
 		ddb->ddb_type = 0;
 	} while (++ddb->ddb_class < DDT_CLASSES);
 
 	return (SET_ERROR(ENOENT));
 }
 
 /*
  * This function is used by Block Cloning (brt.c) to increase reference
  * counter for the DDT entry if the block is already in DDT.
  *
  * Return false if the block, despite having the D bit set, is not present
  * in the DDT. Currently this is not possible but might be in the future.
  * See the comment below.
  */
 boolean_t
 ddt_addref(spa_t *spa, const blkptr_t *bp)
 {
 	ddt_t *ddt;
 	ddt_entry_t *dde;
 	boolean_t result;
 
 	spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
 	ddt = ddt_select(spa, bp);
 	ddt_enter(ddt);
 
 	dde = ddt_lookup(ddt, bp, B_TRUE);
 	ASSERT3P(dde, !=, NULL);
 
 	if (dde->dde_type < DDT_TYPES) {
 		ddt_phys_t *ddp;
 
 		ASSERT3S(dde->dde_class, <, DDT_CLASSES);
 
 		ddp = &dde->dde_phys[BP_GET_NDVAS(bp)];
 
 		/*
 		 * This entry already existed (dde_type is real), so it must
 		 * have refcnt >0 at the start of this txg. We are called from
 		 * brt_pending_apply(), before frees are issued, so the refcnt
 		 * can't be lowered yet. Therefore, it must be >0. We assert
 		 * this because if the order of BRT and DDT interactions were
 		 * ever to change and the refcnt was ever zero here, then
 		 * likely further action is required to fill out the DDT entry,
 		 * and this is a place that is likely to be missed in testing.
 		 */
 		ASSERT3U(ddp->ddp_refcnt, >, 0);
 
 		ddt_phys_addref(ddp);
 		result = B_TRUE;
 	} else {
 		/*
 		 * At the time of implementating this if the block has the
 		 * DEDUP flag set it must exist in the DEDUP table, but
 		 * there are many advocates that want ability to remove
 		 * entries from DDT with refcnt=1. If this will happen,
 		 * we may have a block with the DEDUP set, but which doesn't
 		 * have a corresponding entry in the DDT. Be ready.
 		 */
 		ASSERT3S(dde->dde_class, ==, DDT_CLASSES);
 		ddt_remove(ddt, dde);
 		result = B_FALSE;
 	}
 
 	ddt_exit(ddt);
 	spa_config_exit(spa, SCL_ZIO, FTAG);
 
 	return (result);
 }
 
 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
 	"Enable prefetching dedup-ed blks");
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 8986f55e792a..b88cf447d296 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1,2615 +1,2616 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  * Copyright (c) 2019 Datto Inc.
  * Copyright (c) 2019, Klara Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
  * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
  */
 
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_prop.h>
 #include <sys/dmu_zfetch.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/sa.h>
 #include <sys/zfeature.h>
 #include <sys/abd.h>
 #include <sys/brt.h>
 #include <sys/trace_zfs.h>
 #include <sys/zfs_racct.h>
 #include <sys/zfs_rlock.h>
 #ifdef _KERNEL
 #include <sys/vmsystm.h>
 #include <sys/zfs_znode.h>
 #endif
 
 /*
  * Enable/disable nopwrite feature.
  */
 static int zfs_nopwrite_enabled = 1;
 
 /*
  * Tunable to control percentage of dirtied L1 blocks from frees allowed into
  * one TXG. After this threshold is crossed, additional dirty blocks from frees
  * will wait until the next TXG.
  * A value of zero will disable this throttle.
  */
 static uint_t zfs_per_txg_dirty_frees_percent = 30;
 
 /*
  * Enable/disable forcing txg sync when dirty checking for holes with lseek().
  * By default this is enabled to ensure accurate hole reporting, it can result
  * in a significant performance penalty for lseek(SEEK_HOLE) heavy workloads.
  * Disabling this option will result in holes never being reported in dirty
  * files which is always safe.
  */
 static int zfs_dmu_offset_next_sync = 1;
 
 /*
  * Limit the amount we can prefetch with one call to this amount.  This
  * helps to limit the amount of memory that can be used by prefetching.
  * Larger objects should be prefetched a bit at a time.
  */
 #ifdef _ILP32
 uint_t dmu_prefetch_max = 8 * 1024 * 1024;
 #else
 uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
 #endif
 
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "unallocated"		},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "object directory"	},
 	{DMU_BSWAP_UINT64, TRUE,  TRUE,  FALSE, "object array"		},
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "packed nvlist"		},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "packed nvlist size"	},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "bpobj"			},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "bpobj header"		},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "SPA space map header"	},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "SPA space map"		},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, TRUE,  "ZIL intent log"	},
 	{DMU_BSWAP_DNODE,  TRUE,  FALSE, TRUE,  "DMU dnode"		},
 	{DMU_BSWAP_OBJSET, TRUE,  TRUE,  FALSE, "DMU objset"		},
 	{DMU_BSWAP_UINT64, TRUE,  TRUE,  FALSE, "DSL directory"		},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL directory child map"},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL dataset snap map"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL props"		},
 	{DMU_BSWAP_UINT64, TRUE,  TRUE,  FALSE, "DSL dataset"		},
 	{DMU_BSWAP_ZNODE,  TRUE,  FALSE, FALSE, "ZFS znode"		},
 	{DMU_BSWAP_OLDACL, TRUE,  FALSE, TRUE,  "ZFS V0 ACL"		},
 	{DMU_BSWAP_UINT8,  FALSE, FALSE, TRUE,  "ZFS plain file"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,  "ZFS directory"		},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "ZFS master node"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,  "ZFS delete queue"	},
 	{DMU_BSWAP_UINT8,  FALSE, FALSE, TRUE,  "zvol object"		},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "zvol prop"		},
 	{DMU_BSWAP_UINT8,  FALSE, FALSE, TRUE,  "other uint8[]"		},
 	{DMU_BSWAP_UINT64, FALSE, FALSE, TRUE,  "other uint64[]"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "other ZAP"		},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "persistent error log"	},
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "SPA history"		},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "SPA history offsets"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "Pool properties"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL permissions"	},
 	{DMU_BSWAP_ACL,    TRUE,  FALSE, TRUE,  "ZFS ACL"		},
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, TRUE,  "ZFS SYSACL"		},
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, TRUE,  "FUID table"		},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "FUID table size"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL dataset next clones"},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "scan work queue"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,  "ZFS user/group/project used" },
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,  "ZFS user/group/project quota"},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "snapshot refcount tags"},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "DDT ZAP algorithm"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "DDT statistics"	},
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, TRUE,	"System attributes"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,	"SA master node"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,	"SA attr registration"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,	"SA attr layouts"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "scan translations"	},
 	{DMU_BSWAP_UINT8,  FALSE, FALSE, TRUE,  "deduplicated block"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL deadlist map"	},
 	{DMU_BSWAP_UINT64, TRUE,  TRUE,  FALSE, "DSL deadlist map hdr"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL dir clones"	},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "bpobj subobj"		}
 };
 
 dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
 	{	byteswap_uint8_array,	"uint8"		},
 	{	byteswap_uint16_array,	"uint16"	},
 	{	byteswap_uint32_array,	"uint32"	},
 	{	byteswap_uint64_array,	"uint64"	},
 	{	zap_byteswap,		"zap"		},
 	{	dnode_buf_byteswap,	"dnode"		},
 	{	dmu_objset_byteswap,	"objset"	},
 	{	zfs_znode_byteswap,	"znode"		},
 	{	zfs_oldacl_byteswap,	"oldacl"	},
 	{	zfs_acl_byteswap,	"acl"		}
 };
 
 int
 dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
     const void *tag, dmu_buf_t **dbp)
 {
 	uint64_t blkid;
 	dmu_buf_impl_t *db;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	blkid = dbuf_whichblock(dn, 0, offset);
 	db = dbuf_hold(dn, blkid, tag);
 	rw_exit(&dn->dn_struct_rwlock);
 
 	if (db == NULL) {
 		*dbp = NULL;
 		return (SET_ERROR(EIO));
 	}
 
 	*dbp = &db->db;
 	return (0);
 }
 
 int
 dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
     const void *tag, dmu_buf_t **dbp)
 {
 	dnode_t *dn;
 	uint64_t blkid;
 	dmu_buf_impl_t *db;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	blkid = dbuf_whichblock(dn, 0, offset);
 	db = dbuf_hold(dn, blkid, tag);
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 
 	if (db == NULL) {
 		*dbp = NULL;
 		return (SET_ERROR(EIO));
 	}
 
 	*dbp = &db->db;
 	return (err);
 }
 
 int
 dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
     const void *tag, dmu_buf_t **dbp, int flags)
 {
 	int err;
 	int db_flags = DB_RF_CANFAIL;
 
 	if (flags & DMU_READ_NO_PREFETCH)
 		db_flags |= DB_RF_NOPREFETCH;
 	if (flags & DMU_READ_NO_DECRYPT)
 		db_flags |= DB_RF_NO_DECRYPT;
 
 	err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
 	if (err == 0) {
 		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
 		err = dbuf_read(db, NULL, db_flags);
 		if (err != 0) {
 			dbuf_rele(db, tag);
 			*dbp = NULL;
 		}
 	}
 
 	return (err);
 }
 
 int
 dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
     const void *tag, dmu_buf_t **dbp, int flags)
 {
 	int err;
 	int db_flags = DB_RF_CANFAIL;
 
 	if (flags & DMU_READ_NO_PREFETCH)
 		db_flags |= DB_RF_NOPREFETCH;
 	if (flags & DMU_READ_NO_DECRYPT)
 		db_flags |= DB_RF_NO_DECRYPT;
 
 	err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
 	if (err == 0) {
 		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
 		err = dbuf_read(db, NULL, db_flags);
 		if (err != 0) {
 			dbuf_rele(db, tag);
 			*dbp = NULL;
 		}
 	}
 
 	return (err);
 }
 
 int
 dmu_bonus_max(void)
 {
 	return (DN_OLD_MAX_BONUSLEN);
 }
 
 int
 dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 	int error;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	if (dn->dn_bonus != db) {
 		error = SET_ERROR(EINVAL);
 	} else if (newsize < 0 || newsize > db_fake->db_size) {
 		error = SET_ERROR(EINVAL);
 	} else {
 		dnode_setbonuslen(dn, newsize, tx);
 		error = 0;
 	}
 
 	DB_DNODE_EXIT(db);
 	return (error);
 }
 
 int
 dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 	int error;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	if (!DMU_OT_IS_VALID(type)) {
 		error = SET_ERROR(EINVAL);
 	} else if (dn->dn_bonus != db) {
 		error = SET_ERROR(EINVAL);
 	} else {
 		dnode_setbonus_type(dn, type, tx);
 		error = 0;
 	}
 
 	DB_DNODE_EXIT(db);
 	return (error);
 }
 
 dmu_object_type_t
 dmu_get_bonustype(dmu_buf_t *db_fake)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 	dmu_object_type_t type;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	type = dn->dn_bonustype;
 	DB_DNODE_EXIT(db);
 
 	return (type);
 }
 
 int
 dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int error;
 
 	error = dnode_hold(os, object, FTAG, &dn);
 	dbuf_rm_spill(dn, tx);
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	dnode_rm_spill(dn, tx);
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 	return (error);
 }
 
 /*
  * Lookup and hold the bonus buffer for the provided dnode.  If the dnode
  * has not yet been allocated a new bonus dbuf a will be allocated.
  * Returns ENOENT, EIO, or 0.
  */
 int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp,
     uint32_t flags)
 {
 	dmu_buf_impl_t *db;
 	int error;
 	uint32_t db_flags = DB_RF_MUST_SUCCEED;
 
 	if (flags & DMU_READ_NO_PREFETCH)
 		db_flags |= DB_RF_NOPREFETCH;
 	if (flags & DMU_READ_NO_DECRYPT)
 		db_flags |= DB_RF_NO_DECRYPT;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_bonus == NULL) {
 		if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
 			rw_exit(&dn->dn_struct_rwlock);
 			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 		}
 		if (dn->dn_bonus == NULL)
 			dbuf_create_bonus(dn);
 	}
 	db = dn->dn_bonus;
 
 	/* as long as the bonus buf is held, the dnode will be held */
 	if (zfs_refcount_add(&db->db_holds, tag) == 1) {
 		VERIFY(dnode_add_ref(dn, db));
 		atomic_inc_32(&dn->dn_dbufs_count);
 	}
 
 	/*
 	 * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
 	 * hold and incrementing the dbuf count to ensure that dnode_move() sees
 	 * a dnode hold for every dbuf.
 	 */
 	rw_exit(&dn->dn_struct_rwlock);
 
 	error = dbuf_read(db, NULL, db_flags);
 	if (error) {
 		dnode_evict_bonus(dn);
 		dbuf_rele(db, tag);
 		*dbp = NULL;
 		return (error);
 	}
 
 	*dbp = &db->db;
 	return (0);
 }
 
 int
 dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag, dmu_buf_t **dbp)
 {
 	dnode_t *dn;
 	int error;
 
 	error = dnode_hold(os, object, FTAG, &dn);
 	if (error)
 		return (error);
 
 	error = dmu_bonus_hold_by_dnode(dn, tag, dbp, DMU_READ_NO_PREFETCH);
 	dnode_rele(dn, FTAG);
 
 	return (error);
 }
 
 /*
  * returns ENOENT, EIO, or 0.
  *
  * This interface will allocate a blank spill dbuf when a spill blk
  * doesn't already exist on the dnode.
  *
  * if you only want to find an already existing spill db, then
  * dmu_spill_hold_existing() should be used.
  */
 int
 dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, const void *tag,
     dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *db = NULL;
 	int err;
 
 	if ((flags & DB_RF_HAVESTRUCT) == 0)
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 	db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
 
 	if ((flags & DB_RF_HAVESTRUCT) == 0)
 		rw_exit(&dn->dn_struct_rwlock);
 
 	if (db == NULL) {
 		*dbp = NULL;
 		return (SET_ERROR(EIO));
 	}
 	err = dbuf_read(db, NULL, flags);
 	if (err == 0)
 		*dbp = &db->db;
 	else {
 		dbuf_rele(db, tag);
 		*dbp = NULL;
 	}
 	return (err);
 }
 
 int
 dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
 	dnode_t *dn;
 	int err;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
 		err = SET_ERROR(EINVAL);
 	} else {
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 		if (!dn->dn_have_spill) {
 			err = SET_ERROR(ENOENT);
 		} else {
 			err = dmu_spill_hold_by_dnode(dn,
 			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
 		}
 
 		rw_exit(&dn->dn_struct_rwlock);
 	}
 
 	DB_DNODE_EXIT(db);
 	return (err);
 }
 
 int
 dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag,
     dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
 	dnode_t *dn;
 	int err;
 	uint32_t db_flags = DB_RF_CANFAIL;
 
 	if (flags & DMU_READ_NO_DECRYPT)
 		db_flags |= DB_RF_NO_DECRYPT;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	err = dmu_spill_hold_by_dnode(dn, db_flags, tag, dbp);
 	DB_DNODE_EXIT(db);
 
 	return (err);
 }
 
 /*
  * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
  * to take a held dnode rather than <os, object> -- the lookup is wasteful,
  * and can induce severe lock contention when writing to several files
  * whose dnodes are in the same block.
  */
 int
 dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
     boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp,
     uint32_t flags)
 {
 	dmu_buf_t **dbp;
 	zstream_t *zs = NULL;
 	uint64_t blkid, nblks, i;
 	uint32_t dbuf_flags;
 	int err;
 	zio_t *zio = NULL;
 	boolean_t missed = B_FALSE;
 
 	ASSERT(!read || length <= DMU_MAX_ACCESS);
 
 	/*
 	 * Note: We directly notify the prefetch code of this read, so that
 	 * we can tell it about the multi-block read.  dbuf_read() only knows
 	 * about the one block it is accessing.
 	 */
 	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
 	    DB_RF_NOPREFETCH;
 
 	if ((flags & DMU_READ_NO_DECRYPT) != 0)
 		dbuf_flags |= DB_RF_NO_DECRYPT;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_datablkshift) {
 		int blkshift = dn->dn_datablkshift;
 		nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
 		    P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
 	} else {
 		if (offset + length > dn->dn_datablksz) {
 			zfs_panic_recover("zfs: accessing past end of object "
 			    "%llx/%llx (size=%u access=%llu+%llu)",
 			    (longlong_t)dn->dn_objset->
 			    os_dsl_dataset->ds_object,
 			    (longlong_t)dn->dn_object, dn->dn_datablksz,
 			    (longlong_t)offset, (longlong_t)length);
 			rw_exit(&dn->dn_struct_rwlock);
 			return (SET_ERROR(EIO));
 		}
 		nblks = 1;
 	}
 	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 
 	if (read)
 		zio = zio_root(dn->dn_objset->os_spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);
 	blkid = dbuf_whichblock(dn, 0, offset);
 	if ((flags & DMU_READ_NO_PREFETCH) == 0) {
 		/*
 		 * Prepare the zfetch before initiating the demand reads, so
 		 * that if multiple threads block on same indirect block, we
 		 * base predictions on the original less racy request order.
 		 */
 		zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks, read,
 		    B_TRUE);
 	}
 	for (i = 0; i < nblks; i++) {
 		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
 		if (db == NULL) {
 			if (zs)
 				dmu_zfetch_run(zs, missed, B_TRUE);
 			rw_exit(&dn->dn_struct_rwlock);
 			dmu_buf_rele_array(dbp, nblks, tag);
 			if (read)
 				zio_nowait(zio);
 			return (SET_ERROR(EIO));
 		}
 
 		/*
 		 * Initiate async demand data read.
 		 * We check the db_state after calling dbuf_read() because
 		 * (1) dbuf_read() may change the state to CACHED due to a
 		 * hit in the ARC, and (2) on a cache miss, a child will
 		 * have been added to "zio" but not yet completed, so the
 		 * state will not yet be CACHED.
 		 */
 		if (read) {
 			if (i == nblks - 1 && blkid + i < dn->dn_maxblkid &&
 			    offset + length < db->db.db_offset +
 			    db->db.db_size) {
 				if (offset <= db->db.db_offset)
 					dbuf_flags |= DB_RF_PARTIAL_FIRST;
 				else
 					dbuf_flags |= DB_RF_PARTIAL_MORE;
 			}
 			(void) dbuf_read(db, zio, dbuf_flags);
 			if (db->db_state != DB_CACHED)
 				missed = B_TRUE;
 		}
 		dbp[i] = &db->db;
 	}
 
 	if (!read)
 		zfs_racct_write(length, nblks);
 
 	if (zs)
 		dmu_zfetch_run(zs, missed, B_TRUE);
 	rw_exit(&dn->dn_struct_rwlock);
 
 	if (read) {
 		/* wait for async read i/o */
 		err = zio_wait(zio);
 		if (err) {
 			dmu_buf_rele_array(dbp, nblks, tag);
 			return (err);
 		}
 
 		/* wait for other io to complete */
 		for (i = 0; i < nblks; i++) {
 			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
 			mutex_enter(&db->db_mtx);
 			while (db->db_state == DB_READ ||
 			    db->db_state == DB_FILL)
 				cv_wait(&db->db_changed, &db->db_mtx);
 			if (db->db_state == DB_UNCACHED)
 				err = SET_ERROR(EIO);
 			mutex_exit(&db->db_mtx);
 			if (err) {
 				dmu_buf_rele_array(dbp, nblks, tag);
 				return (err);
 			}
 		}
 	}
 
 	*numbufsp = nblks;
 	*dbpp = dbp;
 	return (0);
 }
 
 int
 dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t length, int read, const void *tag, int *numbufsp,
     dmu_buf_t ***dbpp)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 
 	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
 	    numbufsp, dbpp, DMU_READ_PREFETCH);
 
 	dnode_rele(dn, FTAG);
 
 	return (err);
 }
 
 int
 dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
     uint64_t length, boolean_t read, const void *tag, int *numbufsp,
     dmu_buf_t ***dbpp)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 	int err;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
 	    numbufsp, dbpp, DMU_READ_PREFETCH);
 	DB_DNODE_EXIT(db);
 
 	return (err);
 }
 
 void
 dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag)
 {
 	int i;
 	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
 
 	if (numbufs == 0)
 		return;
 
 	for (i = 0; i < numbufs; i++) {
 		if (dbp[i])
 			dbuf_rele(dbp[i], tag);
 	}
 
 	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
 }
 
 /*
  * Issue prefetch I/Os for the given blocks.  If level is greater than 0, the
  * indirect blocks prefetched will be those that point to the blocks containing
  * the data starting at offset, and continuing to offset + len.  If the range
  * it too long, prefetch the first dmu_prefetch_max bytes as requested, while
  * for the rest only a higher level, also fitting within dmu_prefetch_max.  It
  * should primarily help random reads, since for long sequential reads there is
  * a speculative prefetcher.
  *
  * Note that if the indirect blocks above the blocks being prefetched are not
  * in cache, they will be asynchronously read in.  Dnode read by dnode_hold()
  * is currently synchronous.
  */
 void
 dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
     uint64_t len, zio_priority_t pri)
 {
 	dnode_t *dn;
 
 	if (dmu_prefetch_max == 0 || len == 0) {
 		dmu_prefetch_dnode(os, object, pri);
 		return;
 	}
 
 	if (dnode_hold(os, object, FTAG, &dn) != 0)
 		return;
 
 	dmu_prefetch_by_dnode(dn, level, offset, len, pri);
 
 	dnode_rele(dn, FTAG);
 }
 
 void
 dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset,
     uint64_t len, zio_priority_t pri)
 {
 	int64_t level2 = level;
 	uint64_t start, end, start2, end2;
 
 	/*
 	 * Depending on len we may do two prefetches: blocks [start, end) at
 	 * level, and following blocks [start2, end2) at higher level2.
 	 */
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_datablkshift != 0) {
 		/*
 		 * The object has multiple blocks.  Calculate the full range
 		 * of blocks [start, end2) and then split it into two parts,
 		 * so that the first [start, end) fits into dmu_prefetch_max.
 		 */
 		start = dbuf_whichblock(dn, level, offset);
 		end2 = dbuf_whichblock(dn, level, offset + len - 1) + 1;
 		uint8_t ibs = dn->dn_indblkshift;
 		uint8_t bs = (level == 0) ? dn->dn_datablkshift : ibs;
 		uint_t limit = P2ROUNDUP(dmu_prefetch_max, 1 << bs) >> bs;
 		start2 = end = MIN(end2, start + limit);
 
 		/*
 		 * Find level2 where [start2, end2) fits into dmu_prefetch_max.
 		 */
 		uint8_t ibps = ibs - SPA_BLKPTRSHIFT;
 		limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs;
 		do {
 			level2++;
 			start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps;
 			end2 = P2ROUNDUP(end2, 1 << ibps) >> ibps;
 		} while (end2 - start2 > limit);
 	} else {
 		/* There is only one block.  Prefetch it or nothing. */
 		start = start2 = end2 = 0;
 		end = start + (level == 0 && offset < dn->dn_datablksz);
 	}
 
 	for (uint64_t i = start; i < end; i++)
 		dbuf_prefetch(dn, level, i, pri, 0);
 	for (uint64_t i = start2; i < end2; i++)
 		dbuf_prefetch(dn, level2, i, pri, 0);
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /*
  * Issue prefetch I/Os for the given object's dnode.
  */
 void
 dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri)
 {
 	if (object == 0 || object >= DN_MAX_OBJECT)
 		return;
 
 	dnode_t *dn = DMU_META_DNODE(os);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	uint64_t blkid = dbuf_whichblock(dn, 0, object * sizeof (dnode_phys_t));
 	dbuf_prefetch(dn, 0, blkid, pri, 0);
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /*
  * Get the next "chunk" of file data to free.  We traverse the file from
  * the end so that the file gets shorter over time (if we crashes in the
  * middle, this will leave us in a better state).  We find allocated file
  * data by simply searching the allocated level 1 indirects.
  *
  * On input, *start should be the first offset that does not need to be
  * freed (e.g. "offset + length").  On return, *start will be the first
  * offset that should be freed and l1blks is set to the number of level 1
  * indirect blocks found within the chunk.
  */
 static int
 get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks)
 {
 	uint64_t blks;
 	uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
 	/* bytes of data covered by a level-1 indirect block */
 	uint64_t iblkrange = (uint64_t)dn->dn_datablksz *
 	    EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
 
 	ASSERT3U(minimum, <=, *start);
 
 	/*
 	 * Check if we can free the entire range assuming that all of the
 	 * L1 blocks in this range have data. If we can, we use this
 	 * worst case value as an estimate so we can avoid having to look
 	 * at the object's actual data.
 	 */
 	uint64_t total_l1blks =
 	    (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) /
 	    iblkrange;
 	if (total_l1blks <= maxblks) {
 		*l1blks = total_l1blks;
 		*start = minimum;
 		return (0);
 	}
 	ASSERT(ISP2(iblkrange));
 
 	for (blks = 0; *start > minimum && blks < maxblks; blks++) {
 		int err;
 
 		/*
 		 * dnode_next_offset(BACKWARDS) will find an allocated L1
 		 * indirect block at or before the input offset.  We must
 		 * decrement *start so that it is at the end of the region
 		 * to search.
 		 */
 		(*start)--;
 
 		err = dnode_next_offset(dn,
 		    DNODE_FIND_BACKWARDS, start, 2, 1, 0);
 
 		/* if there are no indirect blocks before start, we are done */
 		if (err == ESRCH) {
 			*start = minimum;
 			break;
 		} else if (err != 0) {
 			*l1blks = blks;
 			return (err);
 		}
 
 		/* set start to the beginning of this L1 indirect */
 		*start = P2ALIGN(*start, iblkrange);
 	}
 	if (*start < minimum)
 		*start = minimum;
 	*l1blks = blks;
 
 	return (0);
 }
 
 /*
  * If this objset is of type OST_ZFS return true if vfs's unmounted flag is set,
  * otherwise return false.
  * Used below in dmu_free_long_range_impl() to enable abort when unmounting
  */
 static boolean_t
 dmu_objset_zfs_unmounting(objset_t *os)
 {
 #ifdef _KERNEL
 	if (dmu_objset_type(os) == DMU_OST_ZFS)
 		return (zfs_get_vfs_flag_unmounted(os));
 #else
 	(void) os;
 #endif
 	return (B_FALSE);
 }
 
 static int
 dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
     uint64_t length)
 {
 	uint64_t object_size;
 	int err;
 	uint64_t dirty_frees_threshold;
 	dsl_pool_t *dp = dmu_objset_pool(os);
 
 	if (dn == NULL)
 		return (SET_ERROR(EINVAL));
 
 	object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
 	if (offset >= object_size)
 		return (0);
 
 	if (zfs_per_txg_dirty_frees_percent <= 100)
 		dirty_frees_threshold =
 		    zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
 	else
 		dirty_frees_threshold = zfs_dirty_data_max / 20;
 
 	if (length == DMU_OBJECT_END || offset + length > object_size)
 		length = object_size - offset;
 
 	while (length != 0) {
 		uint64_t chunk_end, chunk_begin, chunk_len;
 		uint64_t l1blks;
 		dmu_tx_t *tx;
 
 		if (dmu_objset_zfs_unmounting(dn->dn_objset))
 			return (SET_ERROR(EINTR));
 
 		chunk_end = chunk_begin = offset + length;
 
 		/* move chunk_begin backwards to the beginning of this chunk */
 		err = get_next_chunk(dn, &chunk_begin, offset, &l1blks);
 		if (err)
 			return (err);
 		ASSERT3U(chunk_begin, >=, offset);
 		ASSERT3U(chunk_begin, <=, chunk_end);
 
 		chunk_len = chunk_end - chunk_begin;
 
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
 
 		/*
 		 * Mark this transaction as typically resulting in a net
 		 * reduction in space used.
 		 */
 		dmu_tx_mark_netfree(tx);
 		err = dmu_tx_assign(tx, TXG_WAIT);
 		if (err) {
 			dmu_tx_abort(tx);
 			return (err);
 		}
 
 		uint64_t txg = dmu_tx_get_txg(tx);
 
 		mutex_enter(&dp->dp_lock);
 		uint64_t long_free_dirty =
 		    dp->dp_long_free_dirty_pertxg[txg & TXG_MASK];
 		mutex_exit(&dp->dp_lock);
 
 		/*
 		 * To avoid filling up a TXG with just frees, wait for
 		 * the next TXG to open before freeing more chunks if
 		 * we have reached the threshold of frees.
 		 */
 		if (dirty_frees_threshold != 0 &&
 		    long_free_dirty >= dirty_frees_threshold) {
 			DMU_TX_STAT_BUMP(dmu_tx_dirty_frees_delay);
 			dmu_tx_commit(tx);
 			txg_wait_open(dp, 0, B_TRUE);
 			continue;
 		}
 
 		/*
 		 * In order to prevent unnecessary write throttling, for each
 		 * TXG, we track the cumulative size of L1 blocks being dirtied
 		 * in dnode_free_range() below. We compare this number to a
 		 * tunable threshold, past which we prevent new L1 dirty freeing
 		 * blocks from being added into the open TXG. See
 		 * dmu_free_long_range_impl() for details. The threshold
 		 * prevents write throttle activation due to dirty freeing L1
 		 * blocks taking up a large percentage of zfs_dirty_data_max.
 		 */
 		mutex_enter(&dp->dp_lock);
 		dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] +=
 		    l1blks << dn->dn_indblkshift;
 		mutex_exit(&dp->dp_lock);
 		DTRACE_PROBE3(free__long__range,
 		    uint64_t, long_free_dirty, uint64_t, chunk_len,
 		    uint64_t, txg);
 		dnode_free_range(dn, chunk_begin, chunk_len, tx);
 
 		dmu_tx_commit(tx);
 
 		length -= chunk_len;
 	}
 	return (0);
 }
 
 int
 dmu_free_long_range(objset_t *os, uint64_t object,
     uint64_t offset, uint64_t length)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err != 0)
 		return (err);
 	err = dmu_free_long_range_impl(os, dn, offset, length);
 
 	/*
 	 * It is important to zero out the maxblkid when freeing the entire
 	 * file, so that (a) subsequent calls to dmu_free_long_range_impl()
 	 * will take the fast path, and (b) dnode_reallocate() can verify
 	 * that the entire file has been freed.
 	 */
 	if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
 		dn->dn_maxblkid = 0;
 
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 int
 dmu_free_long_object(objset_t *os, uint64_t object)
 {
 	dmu_tx_t *tx;
 	int err;
 
 	err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
 	if (err != 0)
 		return (err);
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_bonus(tx, object);
 	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
 	dmu_tx_mark_netfree(tx);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err == 0) {
 		err = dmu_object_free(os, object, tx);
 		dmu_tx_commit(tx);
 	} else {
 		dmu_tx_abort(tx);
 	}
 
 	return (err);
 }
 
 int
 dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 	ASSERT(offset < UINT64_MAX);
 	ASSERT(size == DMU_OBJECT_END || size <= UINT64_MAX - offset);
 	dnode_free_range(dn, offset, size, tx);
 	dnode_rele(dn, FTAG);
 	return (0);
 }
 
 static int
 dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
     void *buf, uint32_t flags)
 {
 	dmu_buf_t **dbp;
 	int numbufs, err = 0;
 
 	/*
 	 * Deal with odd block sizes, where there can't be data past the first
 	 * block.  If we ever do the tail block optimization, we will need to
 	 * handle that here as well.
 	 */
 	if (dn->dn_maxblkid == 0) {
 		uint64_t newsz = offset > dn->dn_datablksz ? 0 :
 		    MIN(size, dn->dn_datablksz - offset);
 		memset((char *)buf + newsz, 0, size - newsz);
 		size = newsz;
 	}
 
 	while (size > 0) {
 		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
 		int i;
 
 		/*
 		 * NB: we could do this block-at-a-time, but it's nice
 		 * to be reading in parallel.
 		 */
 		err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
 		    TRUE, FTAG, &numbufs, &dbp, flags);
 		if (err)
 			break;
 
 		for (i = 0; i < numbufs; i++) {
 			uint64_t tocpy;
 			int64_t bufoff;
 			dmu_buf_t *db = dbp[i];
 
 			ASSERT(size > 0);
 
 			bufoff = offset - db->db_offset;
 			tocpy = MIN(db->db_size - bufoff, size);
 
 			(void) memcpy(buf, (char *)db->db_data + bufoff, tocpy);
 
 			offset += tocpy;
 			size -= tocpy;
 			buf = (char *)buf + tocpy;
 		}
 		dmu_buf_rele_array(dbp, numbufs, FTAG);
 	}
 	return (err);
 }
 
 int
 dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     void *buf, uint32_t flags)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err != 0)
 		return (err);
 
 	err = dmu_read_impl(dn, offset, size, buf, flags);
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 int
 dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
     uint32_t flags)
 {
 	return (dmu_read_impl(dn, offset, size, buf, flags));
 }
 
 static void
 dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx)
 {
 	int i;
 
 	for (i = 0; i < numbufs; i++) {
 		uint64_t tocpy;
 		int64_t bufoff;
 		dmu_buf_t *db = dbp[i];
 
 		ASSERT(size > 0);
 
 		bufoff = offset - db->db_offset;
 		tocpy = MIN(db->db_size - bufoff, size);
 
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
 		if (tocpy == db->db_size)
 			dmu_buf_will_fill(db, tx, B_FALSE);
 		else
 			dmu_buf_will_dirty(db, tx);
 
 		(void) memcpy((char *)db->db_data + bufoff, buf, tocpy);
 
 		if (tocpy == db->db_size)
 			dmu_buf_fill_done(db, tx, B_FALSE);
 
 		offset += tocpy;
 		size -= tocpy;
 		buf = (char *)buf + tocpy;
 	}
 }
 
 void
 dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 
 	if (size == 0)
 		return;
 
 	VERIFY0(dmu_buf_hold_array(os, object, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp));
 	dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
 /*
  * Note: Lustre is an external consumer of this interface.
  */
 void
 dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 
 	if (size == 0)
 		return;
 
 	VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
 	dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
 void
 dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs, i;
 
 	if (size == 0)
 		return;
 
 	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp));
 
 	for (i = 0; i < numbufs; i++) {
 		dmu_buf_t *db = dbp[i];
 
 		dmu_buf_will_not_fill(db, tx);
 	}
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
 void
 dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
     void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
     int compressed_size, int byteorder, dmu_tx_t *tx)
 {
 	dmu_buf_t *db;
 
 	ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
 	ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
 	VERIFY0(dmu_buf_hold_noread(os, object, offset,
 	    FTAG, &db));
 
 	dmu_buf_write_embedded(db,
 	    data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
 	    uncompressed_size, compressed_size, byteorder, tx);
 
 	dmu_buf_rele(db, FTAG);
 }
 
 void
 dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     dmu_tx_t *tx)
 {
 	int numbufs, i;
 	dmu_buf_t **dbp;
 
 	VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG,
 	    &numbufs, &dbp));
 	for (i = 0; i < numbufs; i++)
 		dmu_buf_redact(dbp[i], tx);
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
 #ifdef _KERNEL
 int
 dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
 {
 	dmu_buf_t **dbp;
 	int numbufs, i, err;
 
 	/*
 	 * NB: we could do this block-at-a-time, but it's nice
 	 * to be reading in parallel.
 	 */
 	err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size,
 	    TRUE, FTAG, &numbufs, &dbp, 0);
 	if (err)
 		return (err);
 
 	for (i = 0; i < numbufs; i++) {
 		uint64_t tocpy;
 		int64_t bufoff;
 		dmu_buf_t *db = dbp[i];
 
 		ASSERT(size > 0);
 
 		bufoff = zfs_uio_offset(uio) - db->db_offset;
 		tocpy = MIN(db->db_size - bufoff, size);
 
 		err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy,
 		    UIO_READ, uio);
 
 		if (err)
 			break;
 
 		size -= tocpy;
 	}
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	return (err);
 }
 
 /*
  * Read 'size' bytes into the uio buffer.
  * From object zdb->db_object.
  * Starting at zfs_uio_offset(uio).
  *
  * If the caller already has a dbuf in the target object
  * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
  * because we don't have to find the dnode_t for the object.
  */
 int
 dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
 	dnode_t *dn;
 	int err;
 
 	if (size == 0)
 		return (0);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	err = dmu_read_uio_dnode(dn, uio, size);
 	DB_DNODE_EXIT(db);
 
 	return (err);
 }
 
 /*
  * Read 'size' bytes into the uio buffer.
  * From the specified object
  * Starting at offset zfs_uio_offset(uio).
  */
 int
 dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size)
 {
 	dnode_t *dn;
 	int err;
 
 	if (size == 0)
 		return (0);
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 
 	err = dmu_read_uio_dnode(dn, uio, size);
 
 	dnode_rele(dn, FTAG);
 
 	return (err);
 }
 
 int
 dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 	int err = 0;
 	int i;
 
 	err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size,
 	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
 	if (err)
 		return (err);
 
 	for (i = 0; i < numbufs; i++) {
 		uint64_t tocpy;
 		int64_t bufoff;
 		dmu_buf_t *db = dbp[i];
 
 		ASSERT(size > 0);
 
 		offset_t off = zfs_uio_offset(uio);
 		bufoff = off - db->db_offset;
 		tocpy = MIN(db->db_size - bufoff, size);
 
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
 		if (tocpy == db->db_size)
 			dmu_buf_will_fill(db, tx, B_TRUE);
 		else
 			dmu_buf_will_dirty(db, tx);
 
 		err = zfs_uio_fault_move((char *)db->db_data + bufoff,
 		    tocpy, UIO_WRITE, uio);
 
 		if (tocpy == db->db_size && dmu_buf_fill_done(db, tx, err)) {
 			/* The fill was reverted.  Undo any uio progress. */
 			zfs_uio_advance(uio, off - zfs_uio_offset(uio));
 		}
 
 		if (err)
 			break;
 
 		size -= tocpy;
 	}
 
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 	return (err);
 }
 
 /*
  * Write 'size' bytes from the uio buffer.
  * To object zdb->db_object.
  * Starting at offset zfs_uio_offset(uio).
  *
  * If the caller already has a dbuf in the target object
  * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
  * because we don't have to find the dnode_t for the object.
  */
 int
 dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
 	dnode_t *dn;
 	int err;
 
 	if (size == 0)
 		return (0);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	err = dmu_write_uio_dnode(dn, uio, size, tx);
 	DB_DNODE_EXIT(db);
 
 	return (err);
 }
 
 /*
  * Write 'size' bytes from the uio buffer.
  * To the specified object.
  * Starting at offset zfs_uio_offset(uio).
  */
 int
 dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int err;
 
 	if (size == 0)
 		return (0);
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 
 	err = dmu_write_uio_dnode(dn, uio, size, tx);
 
 	dnode_rele(dn, FTAG);
 
 	return (err);
 }
 #endif /* _KERNEL */
 
 /*
  * Allocate a loaned anonymous arc buffer.
  */
 arc_buf_t *
 dmu_request_arcbuf(dmu_buf_t *handle, int size)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
 
 	return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size));
 }
 
 /*
  * Free a loaned arc buffer.
  */
 void
 dmu_return_arcbuf(arc_buf_t *buf)
 {
 	arc_return_buf(buf, FTAG);
 	arc_buf_destroy(buf, FTAG);
 }
 
 /*
  * A "lightweight" write is faster than a regular write (e.g.
  * dmu_write_by_dnode() or dmu_assign_arcbuf_by_dnode()), because it avoids the
  * CPU cost of creating a dmu_buf_impl_t and arc_buf_[hdr_]_t.  However, the
  * data can not be read or overwritten until the transaction's txg has been
  * synced.  This makes it appropriate for workloads that are known to be
  * (temporarily) write-only, like "zfs receive".
  *
  * A single block is written, starting at the specified offset in bytes.  If
  * the call is successful, it returns 0 and the provided abd has been
  * consumed (the caller should not free it).
  */
 int
 dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd,
     const zio_prop_t *zp, zio_flag_t flags, dmu_tx_t *tx)
 {
 	dbuf_dirty_record_t *dr =
 	    dbuf_dirty_lightweight(dn, dbuf_whichblock(dn, 0, offset), tx);
 	if (dr == NULL)
 		return (SET_ERROR(EIO));
 	dr->dt.dll.dr_abd = abd;
 	dr->dt.dll.dr_props = *zp;
 	dr->dt.dll.dr_flags = flags;
 	return (0);
 }
 
 /*
  * When possible directly assign passed loaned arc buffer to a dbuf.
  * If this is not possible copy the contents of passed arc buf via
  * dmu_write().
  */
 int
 dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db;
 	objset_t *os = dn->dn_objset;
 	uint64_t object = dn->dn_object;
 	uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
 	uint64_t blkid;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	blkid = dbuf_whichblock(dn, 0, offset);
 	db = dbuf_hold(dn, blkid, FTAG);
 	rw_exit(&dn->dn_struct_rwlock);
 	if (db == NULL)
 		return (SET_ERROR(EIO));
 
 	/*
 	 * We can only assign if the offset is aligned and the arc buf is the
 	 * same size as the dbuf.
 	 */
 	if (offset == db->db.db_offset && blksz == db->db.db_size) {
 		zfs_racct_write(blksz, 1);
 		dbuf_assign_arcbuf(db, buf, tx);
 		dbuf_rele(db, FTAG);
 	} else {
 		/* compressed bufs must always be assignable to their dbuf */
 		ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
 		ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
 
 		dbuf_rele(db, FTAG);
 		dmu_write(os, object, offset, blksz, buf->b_data, tx);
 		dmu_return_arcbuf(buf);
 	}
 
 	return (0);
 }
 
 int
 dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
     dmu_tx_t *tx)
 {
 	int err;
 	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
 
 	DB_DNODE_ENTER(dbuf);
 	err = dmu_assign_arcbuf_by_dnode(DB_DNODE(dbuf), offset, buf, tx);
 	DB_DNODE_EXIT(dbuf);
 
 	return (err);
 }
 
 typedef struct {
 	dbuf_dirty_record_t	*dsa_dr;
 	dmu_sync_cb_t		*dsa_done;
 	zgd_t			*dsa_zgd;
 	dmu_tx_t		*dsa_tx;
 } dmu_sync_arg_t;
 
 static void
 dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
 {
 	(void) buf;
 	dmu_sync_arg_t *dsa = varg;
 	dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio->io_error == 0) {
 		if (BP_IS_HOLE(bp)) {
 			/*
 			 * A block of zeros may compress to a hole, but the
 			 * block size still needs to be known for replay.
 			 */
 			BP_SET_LSIZE(bp, db->db_size);
 		} else if (!BP_IS_EMBEDDED(bp)) {
 			ASSERT(BP_GET_LEVEL(bp) == 0);
 			BP_SET_FILL(bp, 1);
 		}
 	}
 }
 
 static void
 dmu_sync_late_arrival_ready(zio_t *zio)
 {
 	dmu_sync_ready(zio, NULL, zio->io_private);
 }
 
 static void
 dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
 {
 	(void) buf;
 	dmu_sync_arg_t *dsa = varg;
 	dbuf_dirty_record_t *dr = dsa->dsa_dr;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	zgd_t *zgd = dsa->dsa_zgd;
 
 	/*
 	 * Record the vdev(s) backing this blkptr so they can be flushed after
 	 * the writes for the lwb have completed.
 	 */
 	if (zio->io_error == 0) {
 		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
 	}
 
 	mutex_enter(&db->db_mtx);
 	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
 	if (zio->io_error == 0) {
 		dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
 		if (dr->dt.dl.dr_nopwrite) {
 			blkptr_t *bp = zio->io_bp;
 			blkptr_t *bp_orig = &zio->io_bp_orig;
 			uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
 
 			ASSERT(BP_EQUAL(bp, bp_orig));
 			VERIFY(BP_EQUAL(bp, db->db_blkptr));
 			ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
 			VERIFY(zio_checksum_table[chksum].ci_flags &
 			    ZCHECKSUM_FLAG_NOPWRITE);
 		}
 		dr->dt.dl.dr_overridden_by = *zio->io_bp;
 		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
 		dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
 
 		/*
 		 * Old style holes are filled with all zeros, whereas
 		 * new-style holes maintain their lsize, type, level,
 		 * and birth time (see zio_write_compress). While we
 		 * need to reset the BP_SET_LSIZE() call that happened
 		 * in dmu_sync_ready for old style holes, we do *not*
 		 * want to wipe out the information contained in new
 		 * style holes. Thus, only zero out the block pointer if
 		 * it's an old style hole.
 		 */
 		if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
-		    dr->dt.dl.dr_overridden_by.blk_birth == 0)
+		    BP_GET_LOGICAL_BIRTH(&dr->dt.dl.dr_overridden_by) == 0)
 			BP_ZERO(&dr->dt.dl.dr_overridden_by);
 	} else {
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 	}
 	cv_broadcast(&db->db_changed);
 	mutex_exit(&db->db_mtx);
 
 	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
 
 	kmem_free(dsa, sizeof (*dsa));
 }
 
 static void
 dmu_sync_late_arrival_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	dmu_sync_arg_t *dsa = zio->io_private;
 	zgd_t *zgd = dsa->dsa_zgd;
 
 	if (zio->io_error == 0) {
 		/*
 		 * Record the vdev(s) backing this blkptr so they can be
 		 * flushed after the writes for the lwb have completed.
 		 */
 		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
 
 		if (!BP_IS_HOLE(bp)) {
 			blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig;
 			ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
 			ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
-			ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+			ASSERT(BP_GET_LOGICAL_BIRTH(zio->io_bp) == zio->io_txg);
 			ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
 			zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
 		}
 	}
 
 	dmu_tx_commit(dsa->dsa_tx);
 
 	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
 
 	abd_free(zio->io_abd);
 	kmem_free(dsa, sizeof (*dsa));
 }
 
 static int
 dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
     zio_prop_t *zp, zbookmark_phys_t *zb)
 {
 	dmu_sync_arg_t *dsa;
 	dmu_tx_t *tx;
 	int error;
 
 	error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL,
 	    DB_RF_CANFAIL | DB_RF_NOPREFETCH);
 	if (error != 0)
 		return (error);
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
 	/*
 	 * This transaction does not produce any dirty data or log blocks, so
 	 * it should not be throttled.  All other cases wait for TXG sync, by
 	 * which time the log block we are writing will be obsolete, so we can
 	 * skip waiting and just return error here instead.
 	 */
 	if (dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE) != 0) {
 		dmu_tx_abort(tx);
 		/* Make zl_get_data do txg_waited_synced() */
 		return (SET_ERROR(EIO));
 	}
 
 	/*
 	 * In order to prevent the zgd's lwb from being free'd prior to
 	 * dmu_sync_late_arrival_done() being called, we have to ensure
 	 * the lwb's "max txg" takes this tx's txg into account.
 	 */
 	zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx));
 
 	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
 	dsa->dsa_dr = NULL;
 	dsa->dsa_done = done;
 	dsa->dsa_zgd = zgd;
 	dsa->dsa_tx = tx;
 
 	/*
 	 * Since we are currently syncing this txg, it's nontrivial to
 	 * determine what BP to nopwrite against, so we disable nopwrite.
 	 *
 	 * When syncing, the db_blkptr is initially the BP of the previous
 	 * txg.  We can not nopwrite against it because it will be changed
 	 * (this is similar to the non-late-arrival case where the dbuf is
 	 * dirty in a future txg).
 	 *
 	 * Then dbuf_write_ready() sets bp_blkptr to the location we will write.
 	 * We can not nopwrite against it because although the BP will not
 	 * (typically) be changed, the data has not yet been persisted to this
 	 * location.
 	 *
 	 * Finally, when dbuf_write_done() is called, it is theoretically
 	 * possible to always nopwrite, because the data that was written in
 	 * this txg is the same data that we are trying to write.  However we
 	 * would need to check that this dbuf is not dirty in any future
 	 * txg's (as we do in the normal dmu_sync() path). For simplicity, we
 	 * don't nopwrite in this case.
 	 */
 	zp->zp_nopwrite = B_FALSE;
 
 	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
 	    abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
 	    zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
 	    dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done,
 	    dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
 
 	return (0);
 }
 
 /*
  * Intent log support: sync the block associated with db to disk.
  * N.B. and XXX: the caller is responsible for making sure that the
  * data isn't changing while dmu_sync() is writing it.
  *
  * Return values:
  *
  *	EEXIST: this txg has already been synced, so there's nothing to do.
  *		The caller should not log the write.
  *
  *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
  *		The caller should not log the write.
  *
  *	EALREADY: this block is already in the process of being synced.
  *		The caller should track its progress (somehow).
  *
  *	EIO: could not do the I/O.
  *		The caller should do a txg_wait_synced().
  *
  *	0: the I/O has been initiated.
  *		The caller should log this blkptr in the done callback.
  *		It is possible that the I/O will fail, in which case
  *		the error will be reported to the done callback and
  *		propagated to pio from zio_done().
  */
 int
 dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
 	objset_t *os = db->db_objset;
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	dbuf_dirty_record_t *dr, *dr_next;
 	dmu_sync_arg_t *dsa;
 	zbookmark_phys_t zb;
 	zio_prop_t zp;
 	dnode_t *dn;
 
 	ASSERT(pio != NULL);
 	ASSERT(txg != 0);
 
 	SET_BOOKMARK(&zb, ds->ds_object,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
 	DB_DNODE_EXIT(db);
 
 	/*
 	 * If we're frozen (running ziltest), we always need to generate a bp.
 	 */
 	if (txg > spa_freeze_txg(os->os_spa))
 		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
 
 	/*
 	 * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
 	 * and us.  If we determine that this txg is not yet syncing,
 	 * but it begins to sync a moment later, that's OK because the
 	 * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
 	 */
 	mutex_enter(&db->db_mtx);
 
 	if (txg <= spa_last_synced_txg(os->os_spa)) {
 		/*
 		 * This txg has already synced.  There's nothing to do.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(EEXIST));
 	}
 
 	if (txg <= spa_syncing_txg(os->os_spa)) {
 		/*
 		 * This txg is currently syncing, so we can't mess with
 		 * the dirty record anymore; just write a new log block.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
 	}
 
 	dr = dbuf_find_dirty_eq(db, txg);
 
 	if (dr == NULL) {
 		/*
 		 * There's no dr for this dbuf, so it must have been freed.
 		 * There's no need to log writes to freed blocks, so we're done.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(ENOENT));
 	}
 
 	dr_next = list_next(&db->db_dirty_records, dr);
 	ASSERT(dr_next == NULL || dr_next->dr_txg < txg);
 
 	if (db->db_blkptr != NULL) {
 		/*
 		 * We need to fill in zgd_bp with the current blkptr so that
 		 * the nopwrite code can check if we're writing the same
 		 * data that's already on disk.  We can only nopwrite if we
 		 * are sure that after making the copy, db_blkptr will not
 		 * change until our i/o completes.  We ensure this by
 		 * holding the db_mtx, and only allowing nopwrite if the
 		 * block is not already dirty (see below).  This is verified
 		 * by dmu_sync_done(), which VERIFYs that the db_blkptr has
 		 * not changed.
 		 */
 		*zgd->zgd_bp = *db->db_blkptr;
 	}
 
 	/*
 	 * Assume the on-disk data is X, the current syncing data (in
 	 * txg - 1) is Y, and the current in-memory data is Z (currently
 	 * in dmu_sync).
 	 *
 	 * We usually want to perform a nopwrite if X and Z are the
 	 * same.  However, if Y is different (i.e. the BP is going to
 	 * change before this write takes effect), then a nopwrite will
 	 * be incorrect - we would override with X, which could have
 	 * been freed when Y was written.
 	 *
 	 * (Note that this is not a concern when we are nop-writing from
 	 * syncing context, because X and Y must be identical, because
 	 * all previous txgs have been synced.)
 	 *
 	 * Therefore, we disable nopwrite if the current BP could change
 	 * before this TXG.  There are two ways it could change: by
 	 * being dirty (dr_next is non-NULL), or by being freed
 	 * (dnode_block_freed()).  This behavior is verified by
 	 * zio_done(), which VERIFYs that the override BP is identical
 	 * to the on-disk BP.
 	 */
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	if (dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
 		zp.zp_nopwrite = B_FALSE;
 	DB_DNODE_EXIT(db);
 
 	ASSERT(dr->dr_txg == txg);
 	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
 		/*
 		 * We have already issued a sync write for this buffer,
 		 * or this buffer has already been synced.  It could not
 		 * have been dirtied since, or we would have cleared the state.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(EALREADY));
 	}
 
 	ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
 	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
 	mutex_exit(&db->db_mtx);
 
 	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
 	dsa->dsa_dr = dr;
 	dsa->dsa_done = done;
 	dsa->dsa_zgd = zgd;
 	dsa->dsa_tx = NULL;
 
 	zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp,
 	    dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db),
 	    &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa,
 	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
 
 	return (0);
 }
 
 int
 dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 	err = dnode_set_nlevels(dn, nlevels, tx);
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 int
 dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 	err = dnode_set_blksz(dn, size, ibs, tx);
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 int
 dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	dnode_new_blkid(dn, maxblkid, tx, B_FALSE, B_TRUE);
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 	return (0);
 }
 
 void
 dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
 	/*
 	 * Send streams include each object's checksum function.  This
 	 * check ensures that the receiving system can understand the
 	 * checksum function transmitted.
 	 */
 	ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
 
 	VERIFY0(dnode_hold(os, object, FTAG, &dn));
 	ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
 	dn->dn_checksum = checksum;
 	dnode_setdirty(dn, tx);
 	dnode_rele(dn, FTAG);
 }
 
 void
 dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
 	/*
 	 * Send streams include each object's compression function.  This
 	 * check ensures that the receiving system can understand the
 	 * compression function transmitted.
 	 */
 	ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
 
 	VERIFY0(dnode_hold(os, object, FTAG, &dn));
 	dn->dn_compress = compress;
 	dnode_setdirty(dn, tx);
 	dnode_rele(dn, FTAG);
 }
 
 /*
  * When the "redundant_metadata" property is set to "most", only indirect
  * blocks of this level and higher will have an additional ditto block.
  */
 static const int zfs_redundant_metadata_most_ditto_level = 2;
 
 void
 dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 {
 	dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
 	boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
 	    (wp & WP_SPILL));
 	enum zio_checksum checksum = os->os_checksum;
 	enum zio_compress compress = os->os_compress;
 	uint8_t complevel = os->os_complevel;
 	enum zio_checksum dedup_checksum = os->os_dedup_checksum;
 	boolean_t dedup = B_FALSE;
 	boolean_t nopwrite = B_FALSE;
 	boolean_t dedup_verify = os->os_dedup_verify;
 	boolean_t encrypt = B_FALSE;
 	int copies = os->os_copies;
 
 	/*
 	 * We maintain different write policies for each of the following
 	 * types of data:
 	 *	 1. metadata
 	 *	 2. preallocated blocks (i.e. level-0 blocks of a dump device)
 	 *	 3. all other level 0 blocks
 	 */
 	if (ismd) {
 		/*
 		 * XXX -- we should design a compression algorithm
 		 * that specializes in arrays of bps.
 		 */
 		compress = zio_compress_select(os->os_spa,
 		    ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
 
 		/*
 		 * Metadata always gets checksummed.  If the data
 		 * checksum is multi-bit correctable, and it's not a
 		 * ZBT-style checksum, then it's suitable for metadata
 		 * as well.  Otherwise, the metadata checksum defaults
 		 * to fletcher4.
 		 */
 		if (!(zio_checksum_table[checksum].ci_flags &
 		    ZCHECKSUM_FLAG_METADATA) ||
 		    (zio_checksum_table[checksum].ci_flags &
 		    ZCHECKSUM_FLAG_EMBEDDED))
 			checksum = ZIO_CHECKSUM_FLETCHER_4;
 
 		switch (os->os_redundant_metadata) {
 		case ZFS_REDUNDANT_METADATA_ALL:
 			copies++;
 			break;
 		case ZFS_REDUNDANT_METADATA_MOST:
 			if (level >= zfs_redundant_metadata_most_ditto_level ||
 			    DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
 				copies++;
 			break;
 		case ZFS_REDUNDANT_METADATA_SOME:
 			if (DMU_OT_IS_CRITICAL(type))
 				copies++;
 			break;
 		case ZFS_REDUNDANT_METADATA_NONE:
 			break;
 		}
 	} else if (wp & WP_NOFILL) {
 		ASSERT(level == 0);
 
 		/*
 		 * If we're writing preallocated blocks, we aren't actually
 		 * writing them so don't set any policy properties.  These
 		 * blocks are currently only used by an external subsystem
 		 * outside of zfs (i.e. dump) and not written by the zio
 		 * pipeline.
 		 */
 		compress = ZIO_COMPRESS_OFF;
 		checksum = ZIO_CHECKSUM_OFF;
 	} else {
 		compress = zio_compress_select(os->os_spa, dn->dn_compress,
 		    compress);
 		complevel = zio_complevel_select(os->os_spa, compress,
 		    complevel, complevel);
 
 		checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
 		    zio_checksum_select(dn->dn_checksum, checksum) :
 		    dedup_checksum;
 
 		/*
 		 * Determine dedup setting.  If we are in dmu_sync(),
 		 * we won't actually dedup now because that's all
 		 * done in syncing context; but we do want to use the
 		 * dedup checksum.  If the checksum is not strong
 		 * enough to ensure unique signatures, force
 		 * dedup_verify.
 		 */
 		if (dedup_checksum != ZIO_CHECKSUM_OFF) {
 			dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
 			if (!(zio_checksum_table[checksum].ci_flags &
 			    ZCHECKSUM_FLAG_DEDUP))
 				dedup_verify = B_TRUE;
 		}
 
 		/*
 		 * Enable nopwrite if we have secure enough checksum
 		 * algorithm (see comment in zio_nop_write) and
 		 * compression is enabled.  We don't enable nopwrite if
 		 * dedup is enabled as the two features are mutually
 		 * exclusive.
 		 */
 		nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
 		    ZCHECKSUM_FLAG_NOPWRITE) &&
 		    compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
 	}
 
 	/*
 	 * All objects in an encrypted objset are protected from modification
 	 * via a MAC. Encrypted objects store their IV and salt in the last DVA
 	 * in the bp, so we cannot use all copies. Encrypted objects are also
 	 * not subject to nopwrite since writing the same data will still
 	 * result in a new ciphertext. Only encrypted blocks can be dedup'd
 	 * to avoid ambiguity in the dedup code since the DDT does not store
 	 * object types.
 	 */
 	if (os->os_encrypted && (wp & WP_NOFILL) == 0) {
 		encrypt = B_TRUE;
 
 		if (DMU_OT_IS_ENCRYPTED(type)) {
 			copies = MIN(copies, SPA_DVAS_PER_BP - 1);
 			nopwrite = B_FALSE;
 		} else {
 			dedup = B_FALSE;
 		}
 
 		if (level <= 0 &&
 		    (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)) {
 			compress = ZIO_COMPRESS_EMPTY;
 		}
 	}
 
 	zp->zp_compress = compress;
 	zp->zp_complevel = complevel;
 	zp->zp_checksum = checksum;
 	zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
 	zp->zp_level = level;
 	zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
 	zp->zp_dedup = dedup;
 	zp->zp_dedup_verify = dedup && dedup_verify;
 	zp->zp_nopwrite = nopwrite;
 	zp->zp_encrypt = encrypt;
 	zp->zp_byteorder = ZFS_HOST_BYTEORDER;
 	memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN);
 	memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN);
 	memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN);
 	zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ?
 	    os->os_zpl_special_smallblock : 0;
 
 	ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
 }
 
 /*
  * Reports the location of data and holes in an object.  In order to
  * accurately report holes all dirty data must be synced to disk.  This
  * causes extremely poor performance when seeking for holes in a dirty file.
  * As a compromise, only provide hole data when the dnode is clean.  When
  * a dnode is dirty report the dnode as having no holes by returning EBUSY
  * which is always safe to do.
  */
 int
 dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
 {
 	dnode_t *dn;
 	int restarted = 0, err;
 
 restart:
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 	if (dnode_is_dirty(dn)) {
 		/*
 		 * If the zfs_dmu_offset_next_sync module option is enabled
 		 * then hole reporting has been requested.  Dirty dnodes
 		 * must be synced to disk to accurately report holes.
 		 *
 		 * Provided a RL_READER rangelock spanning 0-UINT64_MAX is
 		 * held by the caller only a single restart will be required.
 		 * We tolerate callers which do not hold the rangelock by
 		 * returning EBUSY and not reporting holes after one restart.
 		 */
 		if (zfs_dmu_offset_next_sync) {
 			rw_exit(&dn->dn_struct_rwlock);
 			dnode_rele(dn, FTAG);
 
 			if (restarted)
 				return (SET_ERROR(EBUSY));
 
 			txg_wait_synced(dmu_objset_pool(os), 0);
 			restarted = 1;
 			goto restart;
 		}
 
 		err = SET_ERROR(EBUSY);
 	} else {
 		err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK |
 		    (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
 	}
 
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 
 	return (err);
 }
 
 int
 dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
     blkptr_t *bps, size_t *nbpsp)
 {
 	dmu_buf_t **dbp, *dbuf;
 	dmu_buf_impl_t *db;
 	blkptr_t *bp;
 	int error, numbufs;
 
 	error = dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG,
 	    &numbufs, &dbp);
 	if (error != 0) {
 		if (error == ESRCH) {
 			error = SET_ERROR(ENXIO);
 		}
 		return (error);
 	}
 
 	ASSERT3U(numbufs, <=, *nbpsp);
 
 	for (int i = 0; i < numbufs; i++) {
 		dbuf = dbp[i];
 		db = (dmu_buf_impl_t *)dbuf;
 
 		mutex_enter(&db->db_mtx);
 
 		if (!list_is_empty(&db->db_dirty_records)) {
 			dbuf_dirty_record_t *dr;
 
 			dr = list_head(&db->db_dirty_records);
 			if (dr->dt.dl.dr_brtwrite) {
 				/*
 				 * This is very special case where we clone a
 				 * block and in the same transaction group we
 				 * read its BP (most likely to clone the clone).
 				 */
 				bp = &dr->dt.dl.dr_overridden_by;
 			} else {
 				/*
 				 * The block was modified in the same
 				 * transaction group.
 				 */
 				mutex_exit(&db->db_mtx);
 				error = SET_ERROR(EAGAIN);
 				goto out;
 			}
 		} else {
 			bp = db->db_blkptr;
 		}
 
 		mutex_exit(&db->db_mtx);
 
 		if (bp == NULL) {
 			/*
 			 * The block was created in this transaction group,
 			 * so it has no BP yet.
 			 */
 			error = SET_ERROR(EAGAIN);
 			goto out;
 		}
 		/*
 		 * Make sure we clone only data blocks.
 		 */
 		if (BP_IS_METADATA(bp) && !BP_IS_HOLE(bp)) {
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
 
 		/*
 		 * If the block was allocated in transaction group that is not
 		 * yet synced, we could clone it, but we couldn't write this
 		 * operation into ZIL, or it may be impossible to replay, since
 		 * the block may appear not yet allocated at that point.
 		 */
-		if (BP_PHYSICAL_BIRTH(bp) > spa_freeze_txg(os->os_spa)) {
+		if (BP_GET_BIRTH(bp) > spa_freeze_txg(os->os_spa)) {
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
-		if (BP_PHYSICAL_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) {
+		if (BP_GET_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) {
 			error = SET_ERROR(EAGAIN);
 			goto out;
 		}
 
 		bps[i] = *bp;
 	}
 
 	*nbpsp = numbufs;
 out:
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	return (error);
 }
 
 int
 dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
     dmu_tx_t *tx, const blkptr_t *bps, size_t nbps)
 {
 	spa_t *spa;
 	dmu_buf_t **dbp, *dbuf;
 	dmu_buf_impl_t *db;
 	struct dirty_leaf *dl;
 	dbuf_dirty_record_t *dr;
 	const blkptr_t *bp;
 	int error = 0, i, numbufs;
 
 	spa = os->os_spa;
 
 	VERIFY0(dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG,
 	    &numbufs, &dbp));
 	ASSERT3U(nbps, ==, numbufs);
 
 	/*
 	 * Before we start cloning make sure that the dbufs sizes match new BPs
 	 * sizes. If they don't, that's a no-go, as we are not able to shrink
 	 * dbufs.
 	 */
 	for (i = 0; i < numbufs; i++) {
 		dbuf = dbp[i];
 		db = (dmu_buf_impl_t *)dbuf;
 		bp = &bps[i];
 
 		ASSERT0(db->db_level);
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT(db->db_blkid != DMU_SPILL_BLKID);
 
 		if (!BP_IS_HOLE(bp) && BP_GET_LSIZE(bp) != dbuf->db_size) {
 			error = SET_ERROR(EXDEV);
 			goto out;
 		}
 	}
 
 	for (i = 0; i < numbufs; i++) {
 		dbuf = dbp[i];
 		db = (dmu_buf_impl_t *)dbuf;
 		bp = &bps[i];
 
 		ASSERT0(db->db_level);
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT(db->db_blkid != DMU_SPILL_BLKID);
 		ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp));
 
 		dmu_buf_will_clone(dbuf, tx);
 
 		mutex_enter(&db->db_mtx);
 
 		dr = list_head(&db->db_dirty_records);
 		VERIFY(dr != NULL);
 		ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
 		dl = &dr->dt.dl;
 		dl->dr_overridden_by = *bp;
 		dl->dr_brtwrite = B_TRUE;
 		dl->dr_override_state = DR_OVERRIDDEN;
 		if (BP_IS_HOLE(bp)) {
-			dl->dr_overridden_by.blk_birth = 0;
-			dl->dr_overridden_by.blk_phys_birth = 0;
+			BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, 0);
+			BP_SET_PHYSICAL_BIRTH(&dl->dr_overridden_by, 0);
 		} else {
-			dl->dr_overridden_by.blk_birth = dr->dr_txg;
+			BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by,
+			    dr->dr_txg);
 			if (!BP_IS_EMBEDDED(bp)) {
-				dl->dr_overridden_by.blk_phys_birth =
-				    BP_PHYSICAL_BIRTH(bp);
+				BP_SET_PHYSICAL_BIRTH(&dl->dr_overridden_by,
+				    BP_GET_BIRTH(bp));
 			}
 		}
 
 		mutex_exit(&db->db_mtx);
 
 		/*
 		 * When data in embedded into BP there is no need to create
 		 * BRT entry as there is no data block. Just copy the BP as
 		 * it contains the data.
 		 */
 		if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
 			brt_pending_add(spa, bp, tx);
 		}
 	}
 out:
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	return (error);
 }
 
 void
 __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
 {
 	dnode_phys_t *dnp = dn->dn_phys;
 
 	doi->doi_data_block_size = dn->dn_datablksz;
 	doi->doi_metadata_block_size = dn->dn_indblkshift ?
 	    1ULL << dn->dn_indblkshift : 0;
 	doi->doi_type = dn->dn_type;
 	doi->doi_bonus_type = dn->dn_bonustype;
 	doi->doi_bonus_size = dn->dn_bonuslen;
 	doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT;
 	doi->doi_indirection = dn->dn_nlevels;
 	doi->doi_checksum = dn->dn_checksum;
 	doi->doi_compress = dn->dn_compress;
 	doi->doi_nblkptr = dn->dn_nblkptr;
 	doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
 	doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
 	doi->doi_fill_count = 0;
 	for (int i = 0; i < dnp->dn_nblkptr; i++)
 		doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
 }
 
 void
 dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
 {
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	mutex_enter(&dn->dn_mtx);
 
 	__dmu_object_info_from_dnode(dn, doi);
 
 	mutex_exit(&dn->dn_mtx);
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /*
  * Get information on a DMU object.
  * If doi is NULL, just indicates whether the object exists.
  */
 int
 dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
 {
 	dnode_t *dn;
 	int err = dnode_hold(os, object, FTAG, &dn);
 
 	if (err)
 		return (err);
 
 	if (doi != NULL)
 		dmu_object_info_from_dnode(dn, doi);
 
 	dnode_rele(dn, FTAG);
 	return (0);
 }
 
 /*
  * As above, but faster; can be used when you have a held dbuf in hand.
  */
 void
 dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	DB_DNODE_ENTER(db);
 	dmu_object_info_from_dnode(DB_DNODE(db), doi);
 	DB_DNODE_EXIT(db);
 }
 
 /*
  * Faster still when you only care about the size.
  */
 void
 dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
     u_longlong_t *nblk512)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	*blksize = dn->dn_datablksz;
 	/* add in number of slots used for the dnode itself */
 	*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
 	    SPA_MINBLOCKSHIFT) + dn->dn_num_slots;
 	DB_DNODE_EXIT(db);
 }
 
 void
 dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	*dnsize = dn->dn_num_slots << DNODE_SHIFT;
 	DB_DNODE_EXIT(db);
 }
 
 void
 byteswap_uint64_array(void *vbuf, size_t size)
 {
 	uint64_t *buf = vbuf;
 	size_t count = size >> 3;
 	int i;
 
 	ASSERT((size & 7) == 0);
 
 	for (i = 0; i < count; i++)
 		buf[i] = BSWAP_64(buf[i]);
 }
 
 void
 byteswap_uint32_array(void *vbuf, size_t size)
 {
 	uint32_t *buf = vbuf;
 	size_t count = size >> 2;
 	int i;
 
 	ASSERT((size & 3) == 0);
 
 	for (i = 0; i < count; i++)
 		buf[i] = BSWAP_32(buf[i]);
 }
 
 void
 byteswap_uint16_array(void *vbuf, size_t size)
 {
 	uint16_t *buf = vbuf;
 	size_t count = size >> 1;
 	int i;
 
 	ASSERT((size & 1) == 0);
 
 	for (i = 0; i < count; i++)
 		buf[i] = BSWAP_16(buf[i]);
 }
 
 void
 byteswap_uint8_array(void *vbuf, size_t size)
 {
 	(void) vbuf, (void) size;
 }
 
 void
 dmu_init(void)
 {
 	abd_init();
 	zfs_dbgmsg_init();
 	sa_cache_init();
 	dmu_objset_init();
 	dnode_init();
 	zfetch_init();
 	dmu_tx_init();
 	l2arc_init();
 	arc_init();
 	dbuf_init();
 }
 
 void
 dmu_fini(void)
 {
 	arc_fini(); /* arc depends on l2arc, so arc must go first */
 	l2arc_fini();
 	dmu_tx_fini();
 	zfetch_fini();
 	dbuf_fini();
 	dnode_fini();
 	dmu_objset_fini();
 	sa_cache_fini();
 	zfs_dbgmsg_fini();
 	abd_fini();
 }
 
 EXPORT_SYMBOL(dmu_bonus_hold);
 EXPORT_SYMBOL(dmu_bonus_hold_by_dnode);
 EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus);
 EXPORT_SYMBOL(dmu_buf_rele_array);
 EXPORT_SYMBOL(dmu_prefetch);
 EXPORT_SYMBOL(dmu_prefetch_by_dnode);
 EXPORT_SYMBOL(dmu_prefetch_dnode);
 EXPORT_SYMBOL(dmu_free_range);
 EXPORT_SYMBOL(dmu_free_long_range);
 EXPORT_SYMBOL(dmu_free_long_object);
 EXPORT_SYMBOL(dmu_read);
 EXPORT_SYMBOL(dmu_read_by_dnode);
 EXPORT_SYMBOL(dmu_write);
 EXPORT_SYMBOL(dmu_write_by_dnode);
 EXPORT_SYMBOL(dmu_prealloc);
 EXPORT_SYMBOL(dmu_object_info);
 EXPORT_SYMBOL(dmu_object_info_from_dnode);
 EXPORT_SYMBOL(dmu_object_info_from_db);
 EXPORT_SYMBOL(dmu_object_size_from_db);
 EXPORT_SYMBOL(dmu_object_dnsize_from_db);
 EXPORT_SYMBOL(dmu_object_set_nlevels);
 EXPORT_SYMBOL(dmu_object_set_blocksize);
 EXPORT_SYMBOL(dmu_object_set_maxblkid);
 EXPORT_SYMBOL(dmu_object_set_checksum);
 EXPORT_SYMBOL(dmu_object_set_compress);
 EXPORT_SYMBOL(dmu_offset_next);
 EXPORT_SYMBOL(dmu_write_policy);
 EXPORT_SYMBOL(dmu_sync);
 EXPORT_SYMBOL(dmu_request_arcbuf);
 EXPORT_SYMBOL(dmu_return_arcbuf);
 EXPORT_SYMBOL(dmu_assign_arcbuf_by_dnode);
 EXPORT_SYMBOL(dmu_assign_arcbuf_by_dbuf);
 EXPORT_SYMBOL(dmu_buf_hold);
 EXPORT_SYMBOL(dmu_ot);
 
 ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW,
 	"Enable NOP writes");
 
 ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, UINT, ZMOD_RW,
 	"Percentage of dirtied blocks from frees in one TXG");
 
 ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW,
 	"Enable forcing txg sync to find holes");
 
 /* CSTYLED */
 ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW,
 	"Limit one prefetch call to this size");
diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index 9f1c25f866f7..680aed4513bc 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -1,3806 +1,3809 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  * Copyright 2014 HybridCluster. All rights reserved.
  * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
  * Copyright (c) 2019, Klara Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2019 Datto Inc.
  * Copyright (c) 2022 Axcient.
  */
 
 #include <sys/arc.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_send.h>
 #include <sys/dmu_recv.h>
 #include <sys/dmu_tx.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zvol.h>
 #include <sys/zio_checksum.h>
 #include <sys/zfs_znode.h>
 #include <zfs_fletcher.h>
 #include <sys/avl.h>
 #include <sys/ddt.h>
 #include <sys/zfs_onexit.h>
 #include <sys/dsl_destroy.h>
 #include <sys/blkptr.h>
 #include <sys/dsl_bookmark.h>
 #include <sys/zfeature.h>
 #include <sys/bqueue.h>
 #include <sys/objlist.h>
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
 #include <sys/zfs_file.h>
 
 static uint_t zfs_recv_queue_length = SPA_MAXBLOCKSIZE;
 static uint_t zfs_recv_queue_ff = 20;
 static uint_t zfs_recv_write_batch_size = 1024 * 1024;
 static int zfs_recv_best_effort_corrective = 0;
 
 static const void *const dmu_recv_tag = "dmu_recv_tag";
 const char *const recv_clone_name = "%recv";
 
 typedef enum {
 	ORNS_NO,
 	ORNS_YES,
 	ORNS_MAYBE
 } or_need_sync_t;
 
 static int receive_read_payload_and_next_header(dmu_recv_cookie_t *ra, int len,
     void *buf);
 
 struct receive_record_arg {
 	dmu_replay_record_t header;
 	void *payload; /* Pointer to a buffer containing the payload */
 	/*
 	 * If the record is a WRITE or SPILL, pointer to the abd containing the
 	 * payload.
 	 */
 	abd_t *abd;
 	int payload_size;
 	uint64_t bytes_read; /* bytes read from stream when record created */
 	boolean_t eos_marker; /* Marks the end of the stream */
 	bqueue_node_t node;
 };
 
 struct receive_writer_arg {
 	objset_t *os;
 	boolean_t byteswap;
 	bqueue_t q;
 
 	/*
 	 * These three members are used to signal to the main thread when
 	 * we're done.
 	 */
 	kmutex_t mutex;
 	kcondvar_t cv;
 	boolean_t done;
 
 	int err;
 	const char *tofs;
 	boolean_t heal;
 	boolean_t resumable;
 	boolean_t raw;   /* DMU_BACKUP_FEATURE_RAW set */
 	boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */
 	boolean_t full;  /* this is a full send stream */
 	uint64_t last_object;
 	uint64_t last_offset;
 	uint64_t max_object; /* highest object ID referenced in stream */
 	uint64_t bytes_read; /* bytes read when current record created */
 
 	list_t write_batch;
 
 	/* Encryption parameters for the last received DRR_OBJECT_RANGE */
 	boolean_t or_crypt_params_present;
 	uint64_t or_firstobj;
 	uint64_t or_numslots;
 	uint8_t or_salt[ZIO_DATA_SALT_LEN];
 	uint8_t or_iv[ZIO_DATA_IV_LEN];
 	uint8_t or_mac[ZIO_DATA_MAC_LEN];
 	boolean_t or_byteorder;
 	zio_t *heal_pio;
 
 	/* Keep track of DRR_FREEOBJECTS right after DRR_OBJECT_RANGE */
 	or_need_sync_t or_need_sync;
 };
 
 typedef struct dmu_recv_begin_arg {
 	const char *drba_origin;
 	dmu_recv_cookie_t *drba_cookie;
 	cred_t *drba_cred;
 	proc_t *drba_proc;
 	dsl_crypto_params_t *drba_dcp;
 } dmu_recv_begin_arg_t;
 
 static void
 byteswap_record(dmu_replay_record_t *drr)
 {
 #define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
 #define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
 	drr->drr_type = BSWAP_32(drr->drr_type);
 	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
 
 	switch (drr->drr_type) {
 	case DRR_BEGIN:
 		DO64(drr_begin.drr_magic);
 		DO64(drr_begin.drr_versioninfo);
 		DO64(drr_begin.drr_creation_time);
 		DO32(drr_begin.drr_type);
 		DO32(drr_begin.drr_flags);
 		DO64(drr_begin.drr_toguid);
 		DO64(drr_begin.drr_fromguid);
 		break;
 	case DRR_OBJECT:
 		DO64(drr_object.drr_object);
 		DO32(drr_object.drr_type);
 		DO32(drr_object.drr_bonustype);
 		DO32(drr_object.drr_blksz);
 		DO32(drr_object.drr_bonuslen);
 		DO32(drr_object.drr_raw_bonuslen);
 		DO64(drr_object.drr_toguid);
 		DO64(drr_object.drr_maxblkid);
 		break;
 	case DRR_FREEOBJECTS:
 		DO64(drr_freeobjects.drr_firstobj);
 		DO64(drr_freeobjects.drr_numobjs);
 		DO64(drr_freeobjects.drr_toguid);
 		break;
 	case DRR_WRITE:
 		DO64(drr_write.drr_object);
 		DO32(drr_write.drr_type);
 		DO64(drr_write.drr_offset);
 		DO64(drr_write.drr_logical_size);
 		DO64(drr_write.drr_toguid);
 		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum);
 		DO64(drr_write.drr_key.ddk_prop);
 		DO64(drr_write.drr_compressed_size);
 		break;
 	case DRR_WRITE_EMBEDDED:
 		DO64(drr_write_embedded.drr_object);
 		DO64(drr_write_embedded.drr_offset);
 		DO64(drr_write_embedded.drr_length);
 		DO64(drr_write_embedded.drr_toguid);
 		DO32(drr_write_embedded.drr_lsize);
 		DO32(drr_write_embedded.drr_psize);
 		break;
 	case DRR_FREE:
 		DO64(drr_free.drr_object);
 		DO64(drr_free.drr_offset);
 		DO64(drr_free.drr_length);
 		DO64(drr_free.drr_toguid);
 		break;
 	case DRR_SPILL:
 		DO64(drr_spill.drr_object);
 		DO64(drr_spill.drr_length);
 		DO64(drr_spill.drr_toguid);
 		DO64(drr_spill.drr_compressed_size);
 		DO32(drr_spill.drr_type);
 		break;
 	case DRR_OBJECT_RANGE:
 		DO64(drr_object_range.drr_firstobj);
 		DO64(drr_object_range.drr_numslots);
 		DO64(drr_object_range.drr_toguid);
 		break;
 	case DRR_REDACT:
 		DO64(drr_redact.drr_object);
 		DO64(drr_redact.drr_offset);
 		DO64(drr_redact.drr_length);
 		DO64(drr_redact.drr_toguid);
 		break;
 	case DRR_END:
 		DO64(drr_end.drr_toguid);
 		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum);
 		break;
 	default:
 		break;
 	}
 
 	if (drr->drr_type != DRR_BEGIN) {
 		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum);
 	}
 
 #undef DO64
 #undef DO32
 }
 
 static boolean_t
 redact_snaps_contains(uint64_t *snaps, uint64_t num_snaps, uint64_t guid)
 {
 	for (int i = 0; i < num_snaps; i++) {
 		if (snaps[i] == guid)
 			return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 /*
  * Check that the new stream we're trying to receive is redacted with respect to
  * a subset of the snapshots that the origin was redacted with respect to.  For
  * the reasons behind this, see the man page on redacted zfs sends and receives.
  */
 static boolean_t
 compatible_redact_snaps(uint64_t *origin_snaps, uint64_t origin_num_snaps,
     uint64_t *redact_snaps, uint64_t num_redact_snaps)
 {
 	/*
 	 * Short circuit the comparison; if we are redacted with respect to
 	 * more snapshots than the origin, we can't be redacted with respect
 	 * to a subset.
 	 */
 	if (num_redact_snaps > origin_num_snaps) {
 		return (B_FALSE);
 	}
 
 	for (int i = 0; i < num_redact_snaps; i++) {
 		if (!redact_snaps_contains(origin_snaps, origin_num_snaps,
 		    redact_snaps[i])) {
 			return (B_FALSE);
 		}
 	}
 	return (B_TRUE);
 }
 
 static boolean_t
 redact_check(dmu_recv_begin_arg_t *drba, dsl_dataset_t *origin)
 {
 	uint64_t *origin_snaps;
 	uint64_t origin_num_snaps;
 	dmu_recv_cookie_t *drc = drba->drba_cookie;
 	struct drr_begin *drrb = drc->drc_drrb;
 	int featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
 	int err = 0;
 	boolean_t ret = B_TRUE;
 	uint64_t *redact_snaps;
 	uint_t numredactsnaps;
 
 	/*
 	 * If this is a full send stream, we're safe no matter what.
 	 */
 	if (drrb->drr_fromguid == 0)
 		return (ret);
 
 	VERIFY(dsl_dataset_get_uint64_array_feature(origin,
 	    SPA_FEATURE_REDACTED_DATASETS, &origin_num_snaps, &origin_snaps));
 
 	if (nvlist_lookup_uint64_array(drc->drc_begin_nvl,
 	    BEGINNV_REDACT_FROM_SNAPS, &redact_snaps, &numredactsnaps) ==
 	    0) {
 		/*
 		 * If the send stream was sent from the redaction bookmark or
 		 * the redacted version of the dataset, then we're safe.  Verify
 		 * that this is from the a compatible redaction bookmark or
 		 * redacted dataset.
 		 */
 		if (!compatible_redact_snaps(origin_snaps, origin_num_snaps,
 		    redact_snaps, numredactsnaps)) {
 			err = EINVAL;
 		}
 	} else if (featureflags & DMU_BACKUP_FEATURE_REDACTED) {
 		/*
 		 * If the stream is redacted, it must be redacted with respect
 		 * to a subset of what the origin is redacted with respect to.
 		 * See case number 2 in the zfs man page section on redacted zfs
 		 * send.
 		 */
 		err = nvlist_lookup_uint64_array(drc->drc_begin_nvl,
 		    BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps);
 
 		if (err != 0 || !compatible_redact_snaps(origin_snaps,
 		    origin_num_snaps, redact_snaps, numredactsnaps)) {
 			err = EINVAL;
 		}
 	} else if (!redact_snaps_contains(origin_snaps, origin_num_snaps,
 	    drrb->drr_toguid)) {
 		/*
 		 * If the stream isn't redacted but the origin is, this must be
 		 * one of the snapshots the origin is redacted with respect to.
 		 * See case number 1 in the zfs man page section on redacted zfs
 		 * send.
 		 */
 		err = EINVAL;
 	}
 
 	if (err != 0)
 		ret = B_FALSE;
 	return (ret);
 }
 
 /*
  * If we previously received a stream with --large-block, we don't support
  * receiving an incremental on top of it without --large-block.  This avoids
  * forcing a read-modify-write or trying to re-aggregate a string of WRITE
  * records.
  */
 static int
 recv_check_large_blocks(dsl_dataset_t *ds, uint64_t featureflags)
 {
 	if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_LARGE_BLOCKS) &&
 	    !(featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS))
 		return (SET_ERROR(ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH));
 	return (0);
 }
 
 static int
 recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
     uint64_t fromguid, uint64_t featureflags)
 {
 	uint64_t obj;
 	uint64_t children;
 	int error;
 	dsl_dataset_t *snap;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	boolean_t encrypted = ds->ds_dir->dd_crypto_obj != 0;
 	boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0;
 	boolean_t embed = (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) != 0;
 
 	/* Temporary clone name must not exist. */
 	error = zap_lookup(dp->dp_meta_objset,
 	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
 	    8, 1, &obj);
 	if (error != ENOENT)
 		return (error == 0 ? SET_ERROR(EBUSY) : error);
 
 	/* Resume state must not be set. */
 	if (dsl_dataset_has_resume_receive_state(ds))
 		return (SET_ERROR(EBUSY));
 
 	/* New snapshot name must not exist if we're not healing it. */
 	error = zap_lookup(dp->dp_meta_objset,
 	    dsl_dataset_phys(ds)->ds_snapnames_zapobj,
 	    drba->drba_cookie->drc_tosnap, 8, 1, &obj);
 	if (drba->drba_cookie->drc_heal) {
 		if (error != 0)
 			return (error);
 	} else if (error != ENOENT) {
 		return (error == 0 ? SET_ERROR(EEXIST) : error);
 	}
 
 	/* Must not have children if receiving a ZVOL. */
 	error = zap_count(dp->dp_meta_objset,
 	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &children);
 	if (error != 0)
 		return (error);
 	if (drba->drba_cookie->drc_drrb->drr_type != DMU_OST_ZFS &&
 	    children > 0)
 		return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
 
 	/*
 	 * Check snapshot limit before receiving. We'll recheck again at the
 	 * end, but might as well abort before receiving if we're already over
 	 * the limit.
 	 *
 	 * Note that we do not check the file system limit with
 	 * dsl_dir_fscount_check because the temporary %clones don't count
 	 * against that limit.
 	 */
 	error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
 	    NULL, drba->drba_cred, drba->drba_proc);
 	if (error != 0)
 		return (error);
 
 	if (drba->drba_cookie->drc_heal) {
 		/* Encryption is incompatible with embedded data. */
 		if (encrypted && embed)
 			return (SET_ERROR(EINVAL));
 
 		/* Healing is not supported when in 'force' mode. */
 		if (drba->drba_cookie->drc_force)
 			return (SET_ERROR(EINVAL));
 
 		/* Must have keys loaded if doing encrypted non-raw recv. */
 		if (encrypted && !raw) {
 			if (spa_keystore_lookup_key(dp->dp_spa, ds->ds_object,
 			    NULL, NULL) != 0)
 				return (SET_ERROR(EACCES));
 		}
 
 		error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap);
 		if (error != 0)
 			return (error);
 
 		/*
 		 * When not doing best effort corrective recv healing can only
 		 * be done if the send stream is for the same snapshot as the
 		 * one we are trying to heal.
 		 */
 		if (zfs_recv_best_effort_corrective == 0 &&
 		    drba->drba_cookie->drc_drrb->drr_toguid !=
 		    dsl_dataset_phys(snap)->ds_guid) {
 			dsl_dataset_rele(snap, FTAG);
 			return (SET_ERROR(ENOTSUP));
 		}
 		dsl_dataset_rele(snap, FTAG);
 	} else if (fromguid != 0) {
 		/* Sanity check the incremental recv */
 		uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 
 		/* Can't perform a raw receive on top of a non-raw receive */
 		if (!encrypted && raw)
 			return (SET_ERROR(EINVAL));
 
 		/* Encryption is incompatible with embedded data */
 		if (encrypted && embed)
 			return (SET_ERROR(EINVAL));
 
 		/* Find snapshot in this dir that matches fromguid. */
 		while (obj != 0) {
 			error = dsl_dataset_hold_obj(dp, obj, FTAG,
 			    &snap);
 			if (error != 0)
 				return (SET_ERROR(ENODEV));
 			if (snap->ds_dir != ds->ds_dir) {
 				dsl_dataset_rele(snap, FTAG);
 				return (SET_ERROR(ENODEV));
 			}
 			if (dsl_dataset_phys(snap)->ds_guid == fromguid)
 				break;
 			obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
 			dsl_dataset_rele(snap, FTAG);
 		}
 		if (obj == 0)
 			return (SET_ERROR(ENODEV));
 
 		if (drba->drba_cookie->drc_force) {
 			drba->drba_cookie->drc_fromsnapobj = obj;
 		} else {
 			/*
 			 * If we are not forcing, there must be no
 			 * changes since fromsnap. Raw sends have an
 			 * additional constraint that requires that
 			 * no "noop" snapshots exist between fromsnap
 			 * and tosnap for the IVset checking code to
 			 * work properly.
 			 */
 			if (dsl_dataset_modified_since_snap(ds, snap) ||
 			    (raw &&
 			    dsl_dataset_phys(ds)->ds_prev_snap_obj !=
 			    snap->ds_object)) {
 				dsl_dataset_rele(snap, FTAG);
 				return (SET_ERROR(ETXTBSY));
 			}
 			drba->drba_cookie->drc_fromsnapobj =
 			    ds->ds_prev->ds_object;
 		}
 
 		if (dsl_dataset_feature_is_active(snap,
 		    SPA_FEATURE_REDACTED_DATASETS) && !redact_check(drba,
 		    snap)) {
 			dsl_dataset_rele(snap, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		error = recv_check_large_blocks(snap, featureflags);
 		if (error != 0) {
 			dsl_dataset_rele(snap, FTAG);
 			return (error);
 		}
 
 		dsl_dataset_rele(snap, FTAG);
 	} else {
 		/* If full and not healing then must be forced. */
 		if (!drba->drba_cookie->drc_force)
 			return (SET_ERROR(EEXIST));
 
 		/*
 		 * We don't support using zfs recv -F to blow away
 		 * encrypted filesystems. This would require the
 		 * dsl dir to point to the old encryption key and
 		 * the new one at the same time during the receive.
 		 */
 		if ((!encrypted && raw) || encrypted)
 			return (SET_ERROR(EINVAL));
 
 		/*
 		 * Perform the same encryption checks we would if
 		 * we were creating a new dataset from scratch.
 		 */
 		if (!raw) {
 			boolean_t will_encrypt;
 
 			error = dmu_objset_create_crypt_check(
 			    ds->ds_dir->dd_parent, drba->drba_dcp,
 			    &will_encrypt);
 			if (error != 0)
 				return (error);
 
 			if (will_encrypt && embed)
 				return (SET_ERROR(EINVAL));
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Check that any feature flags used in the data stream we're receiving are
  * supported by the pool we are receiving into.
  *
  * Note that some of the features we explicitly check here have additional
  * (implicit) features they depend on, but those dependencies are enforced
  * through the zfeature_register() calls declaring the features that we
  * explicitly check.
  */
 static int
 recv_begin_check_feature_flags_impl(uint64_t featureflags, spa_t *spa)
 {
 	/*
 	 * Check if there are any unsupported feature flags.
 	 */
 	if (!DMU_STREAM_SUPPORTED(featureflags)) {
 		return (SET_ERROR(ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE));
 	}
 
 	/* Verify pool version supports SA if SA_SPILL feature set */
 	if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
 	    spa_version(spa) < SPA_VERSION_SA)
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * LZ4 compressed, ZSTD compressed, embedded, mooched, large blocks,
 	 * and large_dnodes in the stream can only be used if those pool
 	 * features are enabled because we don't attempt to decompress /
 	 * un-embed / un-mooch / split up the blocks / dnodes during the
 	 * receive process.
 	 */
 	if ((featureflags & DMU_BACKUP_FEATURE_LZ4) &&
 	    !spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS))
 		return (SET_ERROR(ENOTSUP));
 	if ((featureflags & DMU_BACKUP_FEATURE_ZSTD) &&
 	    !spa_feature_is_enabled(spa, SPA_FEATURE_ZSTD_COMPRESS))
 		return (SET_ERROR(ENOTSUP));
 	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
 	    !spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA))
 		return (SET_ERROR(ENOTSUP));
 	if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
 	    !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
 		return (SET_ERROR(ENOTSUP));
 	if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) &&
 	    !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE))
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * Receiving redacted streams requires that redacted datasets are
 	 * enabled.
 	 */
 	if ((featureflags & DMU_BACKUP_FEATURE_REDACTED) &&
 	    !spa_feature_is_enabled(spa, SPA_FEATURE_REDACTED_DATASETS))
 		return (SET_ERROR(ENOTSUP));
 
 	return (0);
 }
 
 static int
 dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_begin_arg_t *drba = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
 	uint64_t fromguid = drrb->drr_fromguid;
 	int flags = drrb->drr_flags;
 	ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
 	int error;
 	uint64_t featureflags = drba->drba_cookie->drc_featureflags;
 	dsl_dataset_t *ds;
 	const char *tofs = drba->drba_cookie->drc_tofs;
 
 	/* already checked */
 	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
 	ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING));
 
 	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
 	    DMU_COMPOUNDSTREAM ||
 	    drrb->drr_type >= DMU_OST_NUMTYPES ||
 	    ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
 		return (SET_ERROR(EINVAL));
 
 	error = recv_begin_check_feature_flags_impl(featureflags, dp->dp_spa);
 	if (error != 0)
 		return (error);
 
 	/* Resumable receives require extensible datasets */
 	if (drba->drba_cookie->drc_resumable &&
 	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET))
 		return (SET_ERROR(ENOTSUP));
 
 	if (featureflags & DMU_BACKUP_FEATURE_RAW) {
 		/* raw receives require the encryption feature */
 		if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION))
 			return (SET_ERROR(ENOTSUP));
 
 		/* embedded data is incompatible with encryption and raw recv */
 		if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
 			return (SET_ERROR(EINVAL));
 
 		/* raw receives require spill block allocation flag */
 		if (!(flags & DRR_FLAG_SPILL_BLOCK))
 			return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING));
 	} else {
 		/*
 		 * We support unencrypted datasets below encrypted ones now,
 		 * so add the DS_HOLD_FLAG_DECRYPT flag only if we are dealing
 		 * with a dataset we may encrypt.
 		 */
 		if (drba->drba_dcp == NULL ||
 		    drba->drba_dcp->cp_crypt != ZIO_CRYPT_OFF) {
 			dsflags |= DS_HOLD_FLAG_DECRYPT;
 		}
 	}
 
 	error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
 	if (error == 0) {
 		/* target fs already exists; recv into temp clone */
 
 		/* Can't recv a clone into an existing fs */
 		if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
 			dsl_dataset_rele_flags(ds, dsflags, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		error = recv_begin_check_existing_impl(drba, ds, fromguid,
 		    featureflags);
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 	} else if (error == ENOENT) {
 		/* target fs does not exist; must be a full backup or clone */
 		char buf[ZFS_MAX_DATASET_NAME_LEN];
 		objset_t *os;
 
 		/* healing recv must be done "into" an existing snapshot */
 		if (drba->drba_cookie->drc_heal == B_TRUE)
 			return (SET_ERROR(ENOTSUP));
 
 		/*
 		 * If it's a non-clone incremental, we are missing the
 		 * target fs, so fail the recv.
 		 */
 		if (fromguid != 0 && !((flags & DRR_FLAG_CLONE) ||
 		    drba->drba_origin))
 			return (SET_ERROR(ENOENT));
 
 		/*
 		 * If we're receiving a full send as a clone, and it doesn't
 		 * contain all the necessary free records and freeobject
 		 * records, reject it.
 		 */
 		if (fromguid == 0 && drba->drba_origin != NULL &&
 		    !(flags & DRR_FLAG_FREERECORDS))
 			return (SET_ERROR(EINVAL));
 
 		/* Open the parent of tofs */
 		ASSERT3U(strlen(tofs), <, sizeof (buf));
 		(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
 		error = dsl_dataset_hold(dp, buf, FTAG, &ds);
 		if (error != 0)
 			return (error);
 
 		if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0 &&
 		    drba->drba_origin == NULL) {
 			boolean_t will_encrypt;
 
 			/*
 			 * Check that we aren't breaking any encryption rules
 			 * and that we have all the parameters we need to
 			 * create an encrypted dataset if necessary. If we are
 			 * making an encrypted dataset the stream can't have
 			 * embedded data.
 			 */
 			error = dmu_objset_create_crypt_check(ds->ds_dir,
 			    drba->drba_dcp, &will_encrypt);
 			if (error != 0) {
 				dsl_dataset_rele(ds, FTAG);
 				return (error);
 			}
 
 			if (will_encrypt &&
 			    (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) {
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
 		}
 
 		/*
 		 * Check filesystem and snapshot limits before receiving. We'll
 		 * recheck snapshot limits again at the end (we create the
 		 * filesystems and increment those counts during begin_sync).
 		 */
 		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
 		    ZFS_PROP_FILESYSTEM_LIMIT, NULL,
 		    drba->drba_cred, drba->drba_proc);
 		if (error != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			return (error);
 		}
 
 		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
 		    ZFS_PROP_SNAPSHOT_LIMIT, NULL,
 		    drba->drba_cred, drba->drba_proc);
 		if (error != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			return (error);
 		}
 
 		/* can't recv below anything but filesystems (eg. no ZVOLs) */
 		error = dmu_objset_from_ds(ds, &os);
 		if (error != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			return (error);
 		}
 		if (dmu_objset_type(os) != DMU_OST_ZFS) {
 			dsl_dataset_rele(ds, FTAG);
 			return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
 		}
 
 		if (drba->drba_origin != NULL) {
 			dsl_dataset_t *origin;
 			error = dsl_dataset_hold_flags(dp, drba->drba_origin,
 			    dsflags, FTAG, &origin);
 			if (error != 0) {
 				dsl_dataset_rele(ds, FTAG);
 				return (error);
 			}
 			if (!origin->ds_is_snapshot) {
 				dsl_dataset_rele_flags(origin, dsflags, FTAG);
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
 			if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
 			    fromguid != 0) {
 				dsl_dataset_rele_flags(origin, dsflags, FTAG);
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(ENODEV));
 			}
 
 			if (origin->ds_dir->dd_crypto_obj != 0 &&
 			    (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) {
 				dsl_dataset_rele_flags(origin, dsflags, FTAG);
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
 
 			/*
 			 * If the origin is redacted we need to verify that this
 			 * send stream can safely be received on top of the
 			 * origin.
 			 */
 			if (dsl_dataset_feature_is_active(origin,
 			    SPA_FEATURE_REDACTED_DATASETS)) {
 				if (!redact_check(drba, origin)) {
 					dsl_dataset_rele_flags(origin, dsflags,
 					    FTAG);
 					dsl_dataset_rele_flags(ds, dsflags,
 					    FTAG);
 					return (SET_ERROR(EINVAL));
 				}
 			}
 
 			error = recv_check_large_blocks(ds, featureflags);
 			if (error != 0) {
 				dsl_dataset_rele_flags(origin, dsflags, FTAG);
 				dsl_dataset_rele_flags(ds, dsflags, FTAG);
 				return (error);
 			}
 
 			dsl_dataset_rele_flags(origin, dsflags, FTAG);
 		}
 
 		dsl_dataset_rele(ds, FTAG);
 		error = 0;
 	}
 	return (error);
 }
 
 static void
 dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_begin_arg_t *drba = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	objset_t *mos = dp->dp_meta_objset;
 	dmu_recv_cookie_t *drc = drba->drba_cookie;
 	struct drr_begin *drrb = drc->drc_drrb;
 	const char *tofs = drc->drc_tofs;
 	uint64_t featureflags = drc->drc_featureflags;
 	dsl_dataset_t *ds, *newds;
 	objset_t *os;
 	uint64_t dsobj;
 	ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
 	int error;
 	uint64_t crflags = 0;
 	dsl_crypto_params_t dummy_dcp = { 0 };
 	dsl_crypto_params_t *dcp = drba->drba_dcp;
 
 	if (drrb->drr_flags & DRR_FLAG_CI_DATA)
 		crflags |= DS_FLAG_CI_DATASET;
 
 	if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0)
 		dsflags |= DS_HOLD_FLAG_DECRYPT;
 
 	/*
 	 * Raw, non-incremental recvs always use a dummy dcp with
 	 * the raw cmd set. Raw incremental recvs do not use a dcp
 	 * since the encryption parameters are already set in stone.
 	 */
 	if (dcp == NULL && drrb->drr_fromguid == 0 &&
 	    drba->drba_origin == NULL) {
 		ASSERT3P(dcp, ==, NULL);
 		dcp = &dummy_dcp;
 
 		if (featureflags & DMU_BACKUP_FEATURE_RAW)
 			dcp->cp_cmd = DCP_CMD_RAW_RECV;
 	}
 
 	error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
 	if (error == 0) {
 		/* Create temporary clone unless we're doing corrective recv */
 		dsl_dataset_t *snap = NULL;
 
 		if (drba->drba_cookie->drc_fromsnapobj != 0) {
 			VERIFY0(dsl_dataset_hold_obj(dp,
 			    drba->drba_cookie->drc_fromsnapobj, FTAG, &snap));
 			ASSERT3P(dcp, ==, NULL);
 		}
 		if (drc->drc_heal) {
 			/* When healing we want to use the provided snapshot */
 			VERIFY0(dsl_dataset_snap_lookup(ds, drc->drc_tosnap,
 			    &dsobj));
 		} else {
 			dsobj = dsl_dataset_create_sync(ds->ds_dir,
 			    recv_clone_name, snap, crflags, drba->drba_cred,
 			    dcp, tx);
 		}
 		if (drba->drba_cookie->drc_fromsnapobj != 0)
 			dsl_dataset_rele(snap, FTAG);
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 	} else {
 		dsl_dir_t *dd;
 		const char *tail;
 		dsl_dataset_t *origin = NULL;
 
 		VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));
 
 		if (drba->drba_origin != NULL) {
 			VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
 			    FTAG, &origin));
 			ASSERT3P(dcp, ==, NULL);
 		}
 
 		/* Create new dataset. */
 		dsobj = dsl_dataset_create_sync(dd, strrchr(tofs, '/') + 1,
 		    origin, crflags, drba->drba_cred, dcp, tx);
 		if (origin != NULL)
 			dsl_dataset_rele(origin, FTAG);
 		dsl_dir_rele(dd, FTAG);
 		drc->drc_newfs = B_TRUE;
 	}
 	VERIFY0(dsl_dataset_own_obj_force(dp, dsobj, dsflags, dmu_recv_tag,
 	    &newds));
 	if (dsl_dataset_feature_is_active(newds,
 	    SPA_FEATURE_REDACTED_DATASETS)) {
 		/*
 		 * If the origin dataset is redacted, the child will be redacted
 		 * when we create it.  We clear the new dataset's
 		 * redaction info; if it should be redacted, we'll fill
 		 * in its information later.
 		 */
 		dsl_dataset_deactivate_feature(newds,
 		    SPA_FEATURE_REDACTED_DATASETS, tx);
 	}
 	VERIFY0(dmu_objset_from_ds(newds, &os));
 
 	if (drc->drc_resumable) {
 		dsl_dataset_zapify(newds, tx);
 		if (drrb->drr_fromguid != 0) {
 			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID,
 			    8, 1, &drrb->drr_fromguid, tx));
 		}
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID,
 		    8, 1, &drrb->drr_toguid, tx));
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME,
 		    1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx));
 		uint64_t one = 1;
 		uint64_t zero = 0;
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT,
 		    8, 1, &one, tx));
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET,
 		    8, 1, &zero, tx));
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES,
 		    8, 1, &zero, tx));
 		if (featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) {
 			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK,
 			    8, 1, &one, tx));
 		}
 		if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) {
 			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK,
 			    8, 1, &one, tx));
 		}
 		if (featureflags & DMU_BACKUP_FEATURE_COMPRESSED) {
 			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK,
 			    8, 1, &one, tx));
 		}
 		if (featureflags & DMU_BACKUP_FEATURE_RAW) {
 			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_RAWOK,
 			    8, 1, &one, tx));
 		}
 
 		uint64_t *redact_snaps;
 		uint_t numredactsnaps;
 		if (nvlist_lookup_uint64_array(drc->drc_begin_nvl,
 		    BEGINNV_REDACT_FROM_SNAPS, &redact_snaps,
 		    &numredactsnaps) == 0) {
 			VERIFY0(zap_add(mos, dsobj,
 			    DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS,
 			    sizeof (*redact_snaps), numredactsnaps,
 			    redact_snaps, tx));
 		}
 	}
 
 	/*
 	 * Usually the os->os_encrypted value is tied to the presence of a
 	 * DSL Crypto Key object in the dd. However, that will not be received
 	 * until dmu_recv_stream(), so we set the value manually for now.
 	 */
 	if (featureflags & DMU_BACKUP_FEATURE_RAW) {
 		os->os_encrypted = B_TRUE;
 		drba->drba_cookie->drc_raw = B_TRUE;
 	}
 
 	if (featureflags & DMU_BACKUP_FEATURE_REDACTED) {
 		uint64_t *redact_snaps;
 		uint_t numredactsnaps;
 		VERIFY0(nvlist_lookup_uint64_array(drc->drc_begin_nvl,
 		    BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps));
 		dsl_dataset_activate_redaction(newds, redact_snaps,
 		    numredactsnaps, tx);
 	}
 
 	dmu_buf_will_dirty(newds->ds_dbuf, tx);
 	dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
 
 	/*
 	 * If we actually created a non-clone, we need to create the objset
 	 * in our new dataset. If this is a raw send we postpone this until
 	 * dmu_recv_stream() so that we can allocate the metadnode with the
 	 * properties from the DRR_BEGIN payload.
 	 */
 	rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG);
 	if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds)) &&
 	    (featureflags & DMU_BACKUP_FEATURE_RAW) == 0 &&
 	    !drc->drc_heal) {
 		(void) dmu_objset_create_impl(dp->dp_spa,
 		    newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
 	}
 	rrw_exit(&newds->ds_bp_rwlock, FTAG);
 
 	drba->drba_cookie->drc_ds = newds;
 	drba->drba_cookie->drc_os = os;
 
 	spa_history_log_internal_ds(newds, "receive", tx, " ");
 }
 
 static int
 dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_begin_arg_t *drba = arg;
 	dmu_recv_cookie_t *drc = drba->drba_cookie;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	struct drr_begin *drrb = drc->drc_drrb;
 	int error;
 	ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
 	dsl_dataset_t *ds;
 	const char *tofs = drc->drc_tofs;
 
 	/* already checked */
 	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
 	ASSERT(drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING);
 
 	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
 	    DMU_COMPOUNDSTREAM ||
 	    drrb->drr_type >= DMU_OST_NUMTYPES)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * This is mostly a sanity check since we should have already done these
 	 * checks during a previous attempt to receive the data.
 	 */
 	error = recv_begin_check_feature_flags_impl(drc->drc_featureflags,
 	    dp->dp_spa);
 	if (error != 0)
 		return (error);
 
 	/* 6 extra bytes for /%recv */
 	char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
 
 	(void) snprintf(recvname, sizeof (recvname), "%s/%s",
 	    tofs, recv_clone_name);
 
 	if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) {
 		/* raw receives require spill block allocation flag */
 		if (!(drrb->drr_flags & DRR_FLAG_SPILL_BLOCK))
 			return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING));
 	} else {
 		dsflags |= DS_HOLD_FLAG_DECRYPT;
 	}
 
 	boolean_t recvexist = B_TRUE;
 	if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) {
 		/* %recv does not exist; continue in tofs */
 		recvexist = B_FALSE;
 		error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * Resume of full/newfs recv on existing dataset should be done with
 	 * force flag
 	 */
 	if (recvexist && drrb->drr_fromguid == 0 && !drc->drc_force) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (SET_ERROR(ZFS_ERR_RESUME_EXISTS));
 	}
 
 	/* check that ds is marked inconsistent */
 	if (!DS_IS_INCONSISTENT(ds)) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* check that there is resuming data, and that the toguid matches */
 	if (!dsl_dataset_is_zapified(ds)) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 	uint64_t val;
 	error = zap_lookup(dp->dp_meta_objset, ds->ds_object,
 	    DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val);
 	if (error != 0 || drrb->drr_toguid != val) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Check if the receive is still running.  If so, it will be owned.
 	 * Note that nothing else can own the dataset (e.g. after the receive
 	 * fails) because it will be marked inconsistent.
 	 */
 	if (dsl_dataset_has_owner(ds)) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (SET_ERROR(EBUSY));
 	}
 
 	/* There should not be any snapshots of this fs yet. */
 	if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Note: resume point will be checked when we process the first WRITE
 	 * record.
 	 */
 
 	/* check that the origin matches */
 	val = 0;
 	(void) zap_lookup(dp->dp_meta_objset, ds->ds_object,
 	    DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val);
 	if (drrb->drr_fromguid != val) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (ds->ds_prev != NULL && drrb->drr_fromguid != 0)
 		drc->drc_fromsnapobj = ds->ds_prev->ds_object;
 
 	/*
 	 * If we're resuming, and the send is redacted, then the original send
 	 * must have been redacted, and must have been redacted with respect to
 	 * the same snapshots.
 	 */
 	if (drc->drc_featureflags & DMU_BACKUP_FEATURE_REDACTED) {
 		uint64_t num_ds_redact_snaps;
 		uint64_t *ds_redact_snaps;
 
 		uint_t num_stream_redact_snaps;
 		uint64_t *stream_redact_snaps;
 
 		if (nvlist_lookup_uint64_array(drc->drc_begin_nvl,
 		    BEGINNV_REDACT_SNAPS, &stream_redact_snaps,
 		    &num_stream_redact_snaps) != 0) {
 			dsl_dataset_rele_flags(ds, dsflags, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		if (!dsl_dataset_get_uint64_array_feature(ds,
 		    SPA_FEATURE_REDACTED_DATASETS, &num_ds_redact_snaps,
 		    &ds_redact_snaps)) {
 			dsl_dataset_rele_flags(ds, dsflags, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		for (int i = 0; i < num_ds_redact_snaps; i++) {
 			if (!redact_snaps_contains(ds_redact_snaps,
 			    num_ds_redact_snaps, stream_redact_snaps[i])) {
 				dsl_dataset_rele_flags(ds, dsflags, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
 		}
 	}
 
 	error = recv_check_large_blocks(ds, drc->drc_featureflags);
 	if (error != 0) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (error);
 	}
 
 	dsl_dataset_rele_flags(ds, dsflags, FTAG);
 	return (0);
 }
 
 static void
 dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_begin_arg_t *drba = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	const char *tofs = drba->drba_cookie->drc_tofs;
 	uint64_t featureflags = drba->drba_cookie->drc_featureflags;
 	dsl_dataset_t *ds;
 	ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
 	/* 6 extra bytes for /%recv */
 	char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
 
 	(void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs,
 	    recv_clone_name);
 
 	if (featureflags & DMU_BACKUP_FEATURE_RAW) {
 		drba->drba_cookie->drc_raw = B_TRUE;
 	} else {
 		dsflags |= DS_HOLD_FLAG_DECRYPT;
 	}
 
 	if (dsl_dataset_own_force(dp, recvname, dsflags, dmu_recv_tag, &ds)
 	    != 0) {
 		/* %recv does not exist; continue in tofs */
 		VERIFY0(dsl_dataset_own_force(dp, tofs, dsflags, dmu_recv_tag,
 		    &ds));
 		drba->drba_cookie->drc_newfs = B_TRUE;
 	}
 
 	ASSERT(DS_IS_INCONSISTENT(ds));
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 	ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) ||
 	    drba->drba_cookie->drc_raw);
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 
 	drba->drba_cookie->drc_ds = ds;
 	VERIFY0(dmu_objset_from_ds(ds, &drba->drba_cookie->drc_os));
 	drba->drba_cookie->drc_should_save = B_TRUE;
 
 	spa_history_log_internal_ds(ds, "resume receive", tx, " ");
 }
 
 /*
  * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
  * succeeds; otherwise we will leak the holds on the datasets.
  */
 int
 dmu_recv_begin(const char *tofs, const char *tosnap,
     dmu_replay_record_t *drr_begin, boolean_t force, boolean_t heal,
     boolean_t resumable, nvlist_t *localprops, nvlist_t *hidden_args,
     const char *origin, dmu_recv_cookie_t *drc, zfs_file_t *fp,
     offset_t *voffp)
 {
 	dmu_recv_begin_arg_t drba = { 0 };
 	int err = 0;
 
 	memset(drc, 0, sizeof (dmu_recv_cookie_t));
 	drc->drc_drr_begin = drr_begin;
 	drc->drc_drrb = &drr_begin->drr_u.drr_begin;
 	drc->drc_tosnap = tosnap;
 	drc->drc_tofs = tofs;
 	drc->drc_force = force;
 	drc->drc_heal = heal;
 	drc->drc_resumable = resumable;
 	drc->drc_cred = CRED();
 	drc->drc_proc = curproc;
 	drc->drc_clone = (origin != NULL);
 
 	if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
 		drc->drc_byteswap = B_TRUE;
 		(void) fletcher_4_incremental_byteswap(drr_begin,
 		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
 		byteswap_record(drr_begin);
 	} else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
 		(void) fletcher_4_incremental_native(drr_begin,
 		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
 	} else {
 		return (SET_ERROR(EINVAL));
 	}
 
 	drc->drc_fp = fp;
 	drc->drc_voff = *voffp;
 	drc->drc_featureflags =
 	    DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
 
 	uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen;
 
 	/*
 	 * Since OpenZFS 2.0.0, we have enforced a 64MB limit in userspace
 	 * configurable via ZFS_SENDRECV_MAX_NVLIST. We enforce 256MB as a hard
 	 * upper limit. Systems with less than 1GB of RAM will see a lower
 	 * limit from `arc_all_memory() / 4`.
 	 */
 	if (payloadlen > (MIN((1U << 28), arc_all_memory() / 4)))
 		return (E2BIG);
 
 
 	if (payloadlen != 0) {
 		void *payload = vmem_alloc(payloadlen, KM_SLEEP);
 		/*
 		 * For compatibility with recursive send streams, we don't do
 		 * this here if the stream could be part of a package. Instead,
 		 * we'll do it in dmu_recv_stream. If we pull the next header
 		 * too early, and it's the END record, we break the `recv_skip`
 		 * logic.
 		 */
 
 		err = receive_read_payload_and_next_header(drc, payloadlen,
 		    payload);
 		if (err != 0) {
 			vmem_free(payload, payloadlen);
 			return (err);
 		}
 		err = nvlist_unpack(payload, payloadlen, &drc->drc_begin_nvl,
 		    KM_SLEEP);
 		vmem_free(payload, payloadlen);
 		if (err != 0) {
 			kmem_free(drc->drc_next_rrd,
 			    sizeof (*drc->drc_next_rrd));
 			return (err);
 		}
 	}
 
 	if (drc->drc_drrb->drr_flags & DRR_FLAG_SPILL_BLOCK)
 		drc->drc_spill = B_TRUE;
 
 	drba.drba_origin = origin;
 	drba.drba_cookie = drc;
 	drba.drba_cred = CRED();
 	drba.drba_proc = curproc;
 
 	if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) {
 		err = dsl_sync_task(tofs,
 		    dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync,
 		    &drba, 5, ZFS_SPACE_CHECK_NORMAL);
 	} else {
 		/*
 		 * For non-raw, non-incremental, non-resuming receives the
 		 * user can specify encryption parameters on the command line
 		 * with "zfs recv -o". For these receives we create a dcp and
 		 * pass it to the sync task. Creating the dcp will implicitly
 		 * remove the encryption params from the localprops nvlist,
 		 * which avoids errors when trying to set these normally
 		 * read-only properties. Any other kind of receive that
 		 * attempts to set these properties will fail as a result.
 		 */
 		if ((DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) &
 		    DMU_BACKUP_FEATURE_RAW) == 0 &&
 		    origin == NULL && drc->drc_drrb->drr_fromguid == 0) {
 			err = dsl_crypto_params_create_nvlist(DCP_CMD_NONE,
 			    localprops, hidden_args, &drba.drba_dcp);
 		}
 
 		if (err == 0) {
 			err = dsl_sync_task(tofs,
 			    dmu_recv_begin_check, dmu_recv_begin_sync,
 			    &drba, 5, ZFS_SPACE_CHECK_NORMAL);
 			dsl_crypto_params_free(drba.drba_dcp, !!err);
 		}
 	}
 
 	if (err != 0) {
 		kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
 		nvlist_free(drc->drc_begin_nvl);
 	}
 	return (err);
 }
 
 /*
  * Holds data need for corrective recv callback
  */
 typedef struct cr_cb_data {
 	uint64_t size;
 	zbookmark_phys_t zb;
 	spa_t *spa;
 } cr_cb_data_t;
 
 static void
 corrective_read_done(zio_t *zio)
 {
 	cr_cb_data_t *data = zio->io_private;
 	/* Corruption corrected; update error log if needed */
-	if (zio->io_error == 0)
-		spa_remove_error(data->spa, &data->zb, &zio->io_bp->blk_birth);
+	if (zio->io_error == 0) {
+		spa_remove_error(data->spa, &data->zb,
+		    BP_GET_LOGICAL_BIRTH(zio->io_bp));
+	}
 	kmem_free(data, sizeof (cr_cb_data_t));
 	abd_free(zio->io_abd);
 }
 
 /*
  * zio_rewrite the data pointed to by bp with the data from the rrd's abd.
  */
 static int
 do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
     struct receive_record_arg *rrd, blkptr_t *bp)
 {
 	int err;
 	zio_t *io;
 	zbookmark_phys_t zb;
 	dnode_t *dn;
 	abd_t *abd = rrd->abd;
 	zio_cksum_t bp_cksum = bp->blk_cksum;
 	zio_flag_t flags = ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_RETRY |
 	    ZIO_FLAG_CANFAIL;
 
 	if (rwa->raw)
 		flags |= ZIO_FLAG_RAW;
 
 	err = dnode_hold(rwa->os, drrw->drr_object, FTAG, &dn);
 	if (err != 0)
 		return (err);
 	SET_BOOKMARK(&zb, dmu_objset_id(rwa->os), drrw->drr_object, 0,
 	    dbuf_whichblock(dn, 0, drrw->drr_offset));
 	dnode_rele(dn, FTAG);
 
 	if (!rwa->raw && DRR_WRITE_COMPRESSED(drrw)) {
 		/* Decompress the stream data */
 		abd_t *dabd = abd_alloc_linear(
 		    drrw->drr_logical_size, B_FALSE);
 		err = zio_decompress_data(drrw->drr_compressiontype,
 		    abd, abd_to_buf(dabd), abd_get_size(abd),
 		    abd_get_size(dabd), NULL);
 
 		if (err != 0) {
 			abd_free(dabd);
 			return (err);
 		}
 		/* Swap in the newly decompressed data into the abd */
 		abd_free(abd);
 		abd = dabd;
 	}
 
 	if (!rwa->raw && BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
 		/* Recompress the data */
 		abd_t *cabd = abd_alloc_linear(BP_GET_PSIZE(bp),
 		    B_FALSE);
 		void *buf = abd_to_buf(cabd);
 		uint64_t csize = zio_compress_data(BP_GET_COMPRESS(bp),
 		    abd, &buf, abd_get_size(abd),
 		    rwa->os->os_complevel);
 		abd_zero_off(cabd, csize, BP_GET_PSIZE(bp) - csize);
 		/* Swap in newly compressed data into the abd */
 		abd_free(abd);
 		abd = cabd;
 		flags |= ZIO_FLAG_RAW_COMPRESS;
 	}
 
 	/*
 	 * The stream is not encrypted but the data on-disk is.
 	 * We need to re-encrypt the buf using the same
 	 * encryption type, salt, iv, and mac that was used to encrypt
 	 * the block previosly.
 	 */
 	if (!rwa->raw && BP_USES_CRYPT(bp)) {
 		dsl_dataset_t *ds;
 		dsl_crypto_key_t *dck = NULL;
 		uint8_t salt[ZIO_DATA_SALT_LEN];
 		uint8_t iv[ZIO_DATA_IV_LEN];
 		uint8_t mac[ZIO_DATA_MAC_LEN];
 		boolean_t no_crypt = B_FALSE;
 		dsl_pool_t *dp = dmu_objset_pool(rwa->os);
 		abd_t *eabd = abd_alloc_linear(BP_GET_PSIZE(bp), B_FALSE);
 
 		zio_crypt_decode_params_bp(bp, salt, iv);
 		zio_crypt_decode_mac_bp(bp, mac);
 
 		dsl_pool_config_enter(dp, FTAG);
 		err = dsl_dataset_hold_flags(dp, rwa->tofs,
 		    DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
 		if (err != 0) {
 			dsl_pool_config_exit(dp, FTAG);
 			abd_free(eabd);
 			return (SET_ERROR(EACCES));
 		}
 
 		/* Look up the key from the spa's keystore */
 		err = spa_keystore_lookup_key(rwa->os->os_spa,
 		    zb.zb_objset, FTAG, &dck);
 		if (err != 0) {
 			dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT,
 			    FTAG);
 			dsl_pool_config_exit(dp, FTAG);
 			abd_free(eabd);
 			return (SET_ERROR(EACCES));
 		}
 
 		err = zio_do_crypt_abd(B_TRUE, &dck->dck_key,
 		    BP_GET_TYPE(bp), BP_SHOULD_BYTESWAP(bp), salt, iv,
 		    mac, abd_get_size(abd), abd, eabd, &no_crypt);
 
 		spa_keystore_dsl_key_rele(rwa->os->os_spa, dck, FTAG);
 		dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 		dsl_pool_config_exit(dp, FTAG);
 
 		ASSERT0(no_crypt);
 		if (err != 0) {
 			abd_free(eabd);
 			return (err);
 		}
 		/* Swap in the newly encrypted data into the abd */
 		abd_free(abd);
 		abd = eabd;
 
 		/*
 		 * We want to prevent zio_rewrite() from trying to
 		 * encrypt the data again
 		 */
 		flags |= ZIO_FLAG_RAW_ENCRYPT;
 	}
 	rrd->abd = abd;
 
-	io = zio_rewrite(NULL, rwa->os->os_spa, bp->blk_birth, bp, abd,
-	    BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags, &zb);
+	io = zio_rewrite(NULL, rwa->os->os_spa, BP_GET_LOGICAL_BIRTH(bp), bp,
+	    abd, BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags,
+	    &zb);
 
 	ASSERT(abd_get_size(abd) == BP_GET_LSIZE(bp) ||
 	    abd_get_size(abd) == BP_GET_PSIZE(bp));
 
 	/* compute new bp checksum value and make sure it matches the old one */
 	zio_checksum_compute(io, BP_GET_CHECKSUM(bp), abd, abd_get_size(abd));
 	if (!ZIO_CHECKSUM_EQUAL(bp_cksum, io->io_bp->blk_cksum)) {
 		zio_destroy(io);
 		if (zfs_recv_best_effort_corrective != 0)
 			return (0);
 		return (SET_ERROR(ECKSUM));
 	}
 
 	/* Correct the corruption in place */
 	err = zio_wait(io);
 	if (err == 0) {
 		cr_cb_data_t *cb_data =
 		    kmem_alloc(sizeof (cr_cb_data_t), KM_SLEEP);
 		cb_data->spa = rwa->os->os_spa;
 		cb_data->size = drrw->drr_logical_size;
 		cb_data->zb = zb;
 		/* Test if healing worked by re-reading the bp */
 		err = zio_wait(zio_read(rwa->heal_pio, rwa->os->os_spa, bp,
 		    abd_alloc_for_io(drrw->drr_logical_size, B_FALSE),
 		    drrw->drr_logical_size, corrective_read_done,
 		    cb_data, ZIO_PRIORITY_ASYNC_READ, flags, NULL));
 	}
 	if (err != 0 && zfs_recv_best_effort_corrective != 0)
 		err = 0;
 
 	return (err);
 }
 
 static int
 receive_read(dmu_recv_cookie_t *drc, int len, void *buf)
 {
 	int done = 0;
 
 	/*
 	 * The code doesn't rely on this (lengths being multiples of 8).  See
 	 * comment in dump_bytes.
 	 */
 	ASSERT(len % 8 == 0 ||
 	    (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) != 0);
 
 	while (done < len) {
 		ssize_t resid = len - done;
 		zfs_file_t *fp = drc->drc_fp;
 		int err = zfs_file_read(fp, (char *)buf + done,
 		    len - done, &resid);
 		if (err == 0 && resid == len - done) {
 			/*
 			 * Note: ECKSUM or ZFS_ERR_STREAM_TRUNCATED indicates
 			 * that the receive was interrupted and can
 			 * potentially be resumed.
 			 */
 			err = SET_ERROR(ZFS_ERR_STREAM_TRUNCATED);
 		}
 		drc->drc_voff += len - done - resid;
 		done = len - resid;
 		if (err != 0)
 			return (err);
 	}
 
 	drc->drc_bytes_read += len;
 
 	ASSERT3U(done, ==, len);
 	return (0);
 }
 
 static inline uint8_t
 deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
 {
 	if (bonus_type == DMU_OT_SA) {
 		return (1);
 	} else {
 		return (1 +
 		    ((DN_OLD_MAX_BONUSLEN -
 		    MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT));
 	}
 }
 
 static void
 save_resume_state(struct receive_writer_arg *rwa,
     uint64_t object, uint64_t offset, dmu_tx_t *tx)
 {
 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 
 	if (!rwa->resumable)
 		return;
 
 	/*
 	 * We use ds_resume_bytes[] != 0 to indicate that we need to
 	 * update this on disk, so it must not be 0.
 	 */
 	ASSERT(rwa->bytes_read != 0);
 
 	/*
 	 * We only resume from write records, which have a valid
 	 * (non-meta-dnode) object number.
 	 */
 	ASSERT(object != 0);
 
 	/*
 	 * For resuming to work correctly, we must receive records in order,
 	 * sorted by object,offset.  This is checked by the callers, but
 	 * assert it here for good measure.
 	 */
 	ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]);
 	ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] ||
 	    offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]);
 	ASSERT3U(rwa->bytes_read, >=,
 	    rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]);
 
 	rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object;
 	rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset;
 	rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read;
 }
 
 static int
 receive_object_is_same_generation(objset_t *os, uint64_t object,
     dmu_object_type_t old_bonus_type, dmu_object_type_t new_bonus_type,
     const void *new_bonus, boolean_t *samegenp)
 {
 	zfs_file_info_t zoi;
 	int err;
 
 	dmu_buf_t *old_bonus_dbuf;
 	err = dmu_bonus_hold(os, object, FTAG, &old_bonus_dbuf);
 	if (err != 0)
 		return (err);
 	err = dmu_get_file_info(os, old_bonus_type, old_bonus_dbuf->db_data,
 	    &zoi);
 	dmu_buf_rele(old_bonus_dbuf, FTAG);
 	if (err != 0)
 		return (err);
 	uint64_t old_gen = zoi.zfi_generation;
 
 	err = dmu_get_file_info(os, new_bonus_type, new_bonus, &zoi);
 	if (err != 0)
 		return (err);
 	uint64_t new_gen = zoi.zfi_generation;
 
 	*samegenp = (old_gen == new_gen);
 	return (0);
 }
 
 static int
 receive_handle_existing_object(const struct receive_writer_arg *rwa,
     const struct drr_object *drro, const dmu_object_info_t *doi,
     const void *bonus_data,
     uint64_t *object_to_hold, uint32_t *new_blksz)
 {
 	uint32_t indblksz = drro->drr_indblkshift ?
 	    1ULL << drro->drr_indblkshift : 0;
 	int nblkptr = deduce_nblkptr(drro->drr_bonustype,
 	    drro->drr_bonuslen);
 	uint8_t dn_slots = drro->drr_dn_slots != 0 ?
 	    drro->drr_dn_slots : DNODE_MIN_SLOTS;
 	boolean_t do_free_range = B_FALSE;
 	int err;
 
 	*object_to_hold = drro->drr_object;
 
 	/* nblkptr should be bounded by the bonus size and type */
 	if (rwa->raw && nblkptr != drro->drr_nblkptr)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * After the previous send stream, the sending system may
 	 * have freed this object, and then happened to re-allocate
 	 * this object number in a later txg. In this case, we are
 	 * receiving a different logical file, and the block size may
 	 * appear to be different.  i.e. we may have a different
 	 * block size for this object than what the send stream says.
 	 * In this case we need to remove the object's contents,
 	 * so that its structure can be changed and then its contents
 	 * entirely replaced by subsequent WRITE records.
 	 *
 	 * If this is a -L (--large-block) incremental stream, and
 	 * the previous stream was not -L, the block size may appear
 	 * to increase.  i.e. we may have a smaller block size for
 	 * this object than what the send stream says.  In this case
 	 * we need to keep the object's contents and block size
 	 * intact, so that we don't lose parts of the object's
 	 * contents that are not changed by this incremental send
 	 * stream.
 	 *
 	 * We can distinguish between the two above cases by using
 	 * the ZPL's generation number (see
 	 * receive_object_is_same_generation()).  However, we only
 	 * want to rely on the generation number when absolutely
 	 * necessary, because with raw receives, the generation is
 	 * encrypted.  We also want to minimize dependence on the
 	 * ZPL, so that other types of datasets can also be received
 	 * (e.g. ZVOLs, although note that ZVOLS currently do not
 	 * reallocate their objects or change their structure).
 	 * Therefore, we check a number of different cases where we
 	 * know it is safe to discard the object's contents, before
 	 * using the ZPL's generation number to make the above
 	 * distinction.
 	 */
 	if (drro->drr_blksz != doi->doi_data_block_size) {
 		if (rwa->raw) {
 			/*
 			 * RAW streams always have large blocks, so
 			 * we are sure that the data is not needed
 			 * due to changing --large-block to be on.
 			 * Which is fortunate since the bonus buffer
 			 * (which contains the ZPL generation) is
 			 * encrypted, and the key might not be
 			 * loaded.
 			 */
 			do_free_range = B_TRUE;
 		} else if (rwa->full) {
 			/*
 			 * This is a full send stream, so it always
 			 * replaces what we have.  Even if the
 			 * generation numbers happen to match, this
 			 * can not actually be the same logical file.
 			 * This is relevant when receiving a full
 			 * send as a clone.
 			 */
 			do_free_range = B_TRUE;
 		} else if (drro->drr_type !=
 		    DMU_OT_PLAIN_FILE_CONTENTS ||
 		    doi->doi_type != DMU_OT_PLAIN_FILE_CONTENTS) {
 			/*
 			 * PLAIN_FILE_CONTENTS are the only type of
 			 * objects that have ever been stored with
 			 * large blocks, so we don't need the special
 			 * logic below.  ZAP blocks can shrink (when
 			 * there's only one block), so we don't want
 			 * to hit the error below about block size
 			 * only increasing.
 			 */
 			do_free_range = B_TRUE;
 		} else if (doi->doi_max_offset <=
 		    doi->doi_data_block_size) {
 			/*
 			 * There is only one block.  We can free it,
 			 * because its contents will be replaced by a
 			 * WRITE record.  This can not be the no-L ->
 			 * -L case, because the no-L case would have
 			 * resulted in multiple blocks.  If we
 			 * supported -L -> no-L, it would not be safe
 			 * to free the file's contents.  Fortunately,
 			 * that is not allowed (see
 			 * recv_check_large_blocks()).
 			 */
 			do_free_range = B_TRUE;
 		} else {
 			boolean_t is_same_gen;
 			err = receive_object_is_same_generation(rwa->os,
 			    drro->drr_object, doi->doi_bonus_type,
 			    drro->drr_bonustype, bonus_data, &is_same_gen);
 			if (err != 0)
 				return (SET_ERROR(EINVAL));
 
 			if (is_same_gen) {
 				/*
 				 * This is the same logical file, and
 				 * the block size must be increasing.
 				 * It could only decrease if
 				 * --large-block was changed to be
 				 * off, which is checked in
 				 * recv_check_large_blocks().
 				 */
 				if (drro->drr_blksz <=
 				    doi->doi_data_block_size)
 					return (SET_ERROR(EINVAL));
 				/*
 				 * We keep the existing blocksize and
 				 * contents.
 				 */
 				*new_blksz =
 				    doi->doi_data_block_size;
 			} else {
 				do_free_range = B_TRUE;
 			}
 		}
 	}
 
 	/* nblkptr can only decrease if the object was reallocated */
 	if (nblkptr < doi->doi_nblkptr)
 		do_free_range = B_TRUE;
 
 	/* number of slots can only change on reallocation */
 	if (dn_slots != doi->doi_dnodesize >> DNODE_SHIFT)
 		do_free_range = B_TRUE;
 
 	/*
 	 * For raw sends we also check a few other fields to
 	 * ensure we are preserving the objset structure exactly
 	 * as it was on the receive side:
 	 *     - A changed indirect block size
 	 *     - A smaller nlevels
 	 */
 	if (rwa->raw) {
 		if (indblksz != doi->doi_metadata_block_size)
 			do_free_range = B_TRUE;
 		if (drro->drr_nlevels < doi->doi_indirection)
 			do_free_range = B_TRUE;
 	}
 
 	if (do_free_range) {
 		err = dmu_free_long_range(rwa->os, drro->drr_object,
 		    0, DMU_OBJECT_END);
 		if (err != 0)
 			return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * The dmu does not currently support decreasing nlevels or changing
 	 * indirect block size if there is already one, same as changing the
 	 * number of of dnode slots on an object.  For non-raw sends this
 	 * does not matter and the new object can just use the previous one's
 	 * parameters.  For raw sends, however, the structure of the received
 	 * dnode (including indirects and dnode slots) must match that of the
 	 * send side.  Therefore, instead of using dmu_object_reclaim(), we
 	 * must free the object completely and call dmu_object_claim_dnsize()
 	 * instead.
 	 */
 	if ((rwa->raw && ((doi->doi_indirection > 1 &&
 	    indblksz != doi->doi_metadata_block_size) ||
 	    drro->drr_nlevels < doi->doi_indirection)) ||
 	    dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) {
 		err = dmu_free_long_object(rwa->os, drro->drr_object);
 		if (err != 0)
 			return (SET_ERROR(EINVAL));
 
 		txg_wait_synced(dmu_objset_pool(rwa->os), 0);
 		*object_to_hold = DMU_NEW_OBJECT;
 	}
 
 	/*
 	 * For raw receives, free everything beyond the new incoming
 	 * maxblkid. Normally this would be done with a DRR_FREE
 	 * record that would come after this DRR_OBJECT record is
 	 * processed. However, for raw receives we manually set the
 	 * maxblkid from the drr_maxblkid and so we must first free
 	 * everything above that blkid to ensure the DMU is always
 	 * consistent with itself. We will never free the first block
 	 * of the object here because a maxblkid of 0 could indicate
 	 * an object with a single block or one with no blocks. This
 	 * free may be skipped when dmu_free_long_range() was called
 	 * above since it covers the entire object's contents.
 	 */
 	if (rwa->raw && *object_to_hold != DMU_NEW_OBJECT && !do_free_range) {
 		err = dmu_free_long_range(rwa->os, drro->drr_object,
 		    (drro->drr_maxblkid + 1) * doi->doi_data_block_size,
 		    DMU_OBJECT_END);
 		if (err != 0)
 			return (SET_ERROR(EINVAL));
 	}
 	return (0);
 }
 
 noinline static int
 receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
     void *data)
 {
 	dmu_object_info_t doi;
 	dmu_tx_t *tx;
 	int err;
 	uint32_t new_blksz = drro->drr_blksz;
 	uint8_t dn_slots = drro->drr_dn_slots != 0 ?
 	    drro->drr_dn_slots : DNODE_MIN_SLOTS;
 
 	if (drro->drr_type == DMU_OT_NONE ||
 	    !DMU_OT_IS_VALID(drro->drr_type) ||
 	    !DMU_OT_IS_VALID(drro->drr_bonustype) ||
 	    drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
 	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
 	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
 	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
 	    drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
 	    drro->drr_bonuslen >
 	    DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) ||
 	    dn_slots >
 	    (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (rwa->raw) {
 		/*
 		 * We should have received a DRR_OBJECT_RANGE record
 		 * containing this block and stored it in rwa.
 		 */
 		if (drro->drr_object < rwa->or_firstobj ||
 		    drro->drr_object >= rwa->or_firstobj + rwa->or_numslots ||
 		    drro->drr_raw_bonuslen < drro->drr_bonuslen ||
 		    drro->drr_indblkshift > SPA_MAXBLOCKSHIFT ||
 		    drro->drr_nlevels > DN_MAX_LEVELS ||
 		    drro->drr_nblkptr > DN_MAX_NBLKPTR ||
 		    DN_SLOTS_TO_BONUSLEN(dn_slots) <
 		    drro->drr_raw_bonuslen)
 			return (SET_ERROR(EINVAL));
 	} else {
 		/*
 		 * The DRR_OBJECT_SPILL flag is valid when the DRR_BEGIN
 		 * record indicates this by setting DRR_FLAG_SPILL_BLOCK.
 		 */
 		if (((drro->drr_flags & ~(DRR_OBJECT_SPILL))) ||
 		    (!rwa->spill && DRR_OBJECT_HAS_SPILL(drro->drr_flags))) {
 			return (SET_ERROR(EINVAL));
 		}
 
 		if (drro->drr_raw_bonuslen != 0 || drro->drr_nblkptr != 0 ||
 		    drro->drr_indblkshift != 0 || drro->drr_nlevels != 0) {
 			return (SET_ERROR(EINVAL));
 		}
 	}
 
 	err = dmu_object_info(rwa->os, drro->drr_object, &doi);
 
 	if (err != 0 && err != ENOENT && err != EEXIST)
 		return (SET_ERROR(EINVAL));
 
 	if (drro->drr_object > rwa->max_object)
 		rwa->max_object = drro->drr_object;
 
 	/*
 	 * If we are losing blkptrs or changing the block size this must
 	 * be a new file instance.  We must clear out the previous file
 	 * contents before we can change this type of metadata in the dnode.
 	 * Raw receives will also check that the indirect structure of the
 	 * dnode hasn't changed.
 	 */
 	uint64_t object_to_hold;
 	if (err == 0) {
 		err = receive_handle_existing_object(rwa, drro, &doi, data,
 		    &object_to_hold, &new_blksz);
 		if (err != 0)
 			return (err);
 	} else if (err == EEXIST) {
 		/*
 		 * The object requested is currently an interior slot of a
 		 * multi-slot dnode. This will be resolved when the next txg
 		 * is synced out, since the send stream will have told us
 		 * to free this slot when we freed the associated dnode
 		 * earlier in the stream.
 		 */
 		txg_wait_synced(dmu_objset_pool(rwa->os), 0);
 
 		if (dmu_object_info(rwa->os, drro->drr_object, NULL) != ENOENT)
 			return (SET_ERROR(EINVAL));
 
 		/* object was freed and we are about to allocate a new one */
 		object_to_hold = DMU_NEW_OBJECT;
 	} else {
 		/*
 		 * If the only record in this range so far was DRR_FREEOBJECTS
 		 * with at least one actually freed object, it's possible that
 		 * the block will now be converted to a hole. We need to wait
 		 * for the txg to sync to prevent races.
 		 */
 		if (rwa->or_need_sync == ORNS_YES)
 			txg_wait_synced(dmu_objset_pool(rwa->os), 0);
 
 		/* object is free and we are about to allocate a new one */
 		object_to_hold = DMU_NEW_OBJECT;
 	}
 
 	/* Only relevant for the first object in the range */
 	rwa->or_need_sync = ORNS_NO;
 
 	/*
 	 * If this is a multi-slot dnode there is a chance that this
 	 * object will expand into a slot that is already used by
 	 * another object from the previous snapshot. We must free
 	 * these objects before we attempt to allocate the new dnode.
 	 */
 	if (dn_slots > 1) {
 		boolean_t need_sync = B_FALSE;
 
 		for (uint64_t slot = drro->drr_object + 1;
 		    slot < drro->drr_object + dn_slots;
 		    slot++) {
 			dmu_object_info_t slot_doi;
 
 			err = dmu_object_info(rwa->os, slot, &slot_doi);
 			if (err == ENOENT || err == EEXIST)
 				continue;
 			else if (err != 0)
 				return (err);
 
 			err = dmu_free_long_object(rwa->os, slot);
 			if (err != 0)
 				return (err);
 
 			need_sync = B_TRUE;
 		}
 
 		if (need_sync)
 			txg_wait_synced(dmu_objset_pool(rwa->os), 0);
 	}
 
 	tx = dmu_tx_create(rwa->os);
 	dmu_tx_hold_bonus(tx, object_to_hold);
 	dmu_tx_hold_write(tx, object_to_hold, 0, 0);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
 		return (err);
 	}
 
 	if (object_to_hold == DMU_NEW_OBJECT) {
 		/* Currently free, wants to be allocated */
 		err = dmu_object_claim_dnsize(rwa->os, drro->drr_object,
 		    drro->drr_type, new_blksz,
 		    drro->drr_bonustype, drro->drr_bonuslen,
 		    dn_slots << DNODE_SHIFT, tx);
 	} else if (drro->drr_type != doi.doi_type ||
 	    new_blksz != doi.doi_data_block_size ||
 	    drro->drr_bonustype != doi.doi_bonus_type ||
 	    drro->drr_bonuslen != doi.doi_bonus_size) {
 		/* Currently allocated, but with different properties */
 		err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object,
 		    drro->drr_type, new_blksz,
 		    drro->drr_bonustype, drro->drr_bonuslen,
 		    dn_slots << DNODE_SHIFT, rwa->spill ?
 		    DRR_OBJECT_HAS_SPILL(drro->drr_flags) : B_FALSE, tx);
 	} else if (rwa->spill && !DRR_OBJECT_HAS_SPILL(drro->drr_flags)) {
 		/*
 		 * Currently allocated, the existing version of this object
 		 * may reference a spill block that is no longer allocated
 		 * at the source and needs to be freed.
 		 */
 		err = dmu_object_rm_spill(rwa->os, drro->drr_object, tx);
 	}
 
 	if (err != 0) {
 		dmu_tx_commit(tx);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (rwa->or_crypt_params_present) {
 		/*
 		 * Set the crypt params for the buffer associated with this
 		 * range of dnodes.  This causes the blkptr_t to have the
 		 * same crypt params (byteorder, salt, iv, mac) as on the
 		 * sending side.
 		 *
 		 * Since we are committing this tx now, it is possible for
 		 * the dnode block to end up on-disk with the incorrect MAC,
 		 * if subsequent objects in this block are received in a
 		 * different txg.  However, since the dataset is marked as
 		 * inconsistent, no code paths will do a non-raw read (or
 		 * decrypt the block / verify the MAC). The receive code and
 		 * scrub code can safely do raw reads and verify the
 		 * checksum.  They don't need to verify the MAC.
 		 */
 		dmu_buf_t *db = NULL;
 		uint64_t offset = rwa->or_firstobj * DNODE_MIN_SIZE;
 
 		err = dmu_buf_hold_by_dnode(DMU_META_DNODE(rwa->os),
 		    offset, FTAG, &db, DMU_READ_PREFETCH | DMU_READ_NO_DECRYPT);
 		if (err != 0) {
 			dmu_tx_commit(tx);
 			return (SET_ERROR(EINVAL));
 		}
 
 		dmu_buf_set_crypt_params(db, rwa->or_byteorder,
 		    rwa->or_salt, rwa->or_iv, rwa->or_mac, tx);
 
 		dmu_buf_rele(db, FTAG);
 
 		rwa->or_crypt_params_present = B_FALSE;
 	}
 
 	dmu_object_set_checksum(rwa->os, drro->drr_object,
 	    drro->drr_checksumtype, tx);
 	dmu_object_set_compress(rwa->os, drro->drr_object,
 	    drro->drr_compress, tx);
 
 	/* handle more restrictive dnode structuring for raw recvs */
 	if (rwa->raw) {
 		/*
 		 * Set the indirect block size, block shift, nlevels.
 		 * This will not fail because we ensured all of the
 		 * blocks were freed earlier if this is a new object.
 		 * For non-new objects block size and indirect block
 		 * shift cannot change and nlevels can only increase.
 		 */
 		ASSERT3U(new_blksz, ==, drro->drr_blksz);
 		VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object,
 		    drro->drr_blksz, drro->drr_indblkshift, tx));
 		VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object,
 		    drro->drr_nlevels, tx));
 
 		/*
 		 * Set the maxblkid. This will always succeed because
 		 * we freed all blocks beyond the new maxblkid above.
 		 */
 		VERIFY0(dmu_object_set_maxblkid(rwa->os, drro->drr_object,
 		    drro->drr_maxblkid, tx));
 	}
 
 	if (data != NULL) {
 		dmu_buf_t *db;
 		dnode_t *dn;
 		uint32_t flags = DMU_READ_NO_PREFETCH;
 
 		if (rwa->raw)
 			flags |= DMU_READ_NO_DECRYPT;
 
 		VERIFY0(dnode_hold(rwa->os, drro->drr_object, FTAG, &dn));
 		VERIFY0(dmu_bonus_hold_by_dnode(dn, FTAG, &db, flags));
 
 		dmu_buf_will_dirty(db, tx);
 
 		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
 		memcpy(db->db_data, data, DRR_OBJECT_PAYLOAD_SIZE(drro));
 
 		/*
 		 * Raw bonus buffers have their byteorder determined by the
 		 * DRR_OBJECT_RANGE record.
 		 */
 		if (rwa->byteswap && !rwa->raw) {
 			dmu_object_byteswap_t byteswap =
 			    DMU_OT_BYTESWAP(drro->drr_bonustype);
 			dmu_ot_byteswap[byteswap].ob_func(db->db_data,
 			    DRR_OBJECT_PAYLOAD_SIZE(drro));
 		}
 		dmu_buf_rele(db, FTAG);
 		dnode_rele(dn, FTAG);
 	}
 
 	/*
 	 * If the receive fails, we want the resume stream to start with the
 	 * same record that we last successfully received. There is no way to
 	 * request resume from the object record, but we can benefit from the
 	 * fact that sender always sends object record before anything else,
 	 * after which it will "resend" data at offset 0 and resume normally.
 	 */
 	save_resume_state(rwa, drro->drr_object, 0, tx);
 
 	dmu_tx_commit(tx);
 
 	return (0);
 }
 
 noinline static int
 receive_freeobjects(struct receive_writer_arg *rwa,
     struct drr_freeobjects *drrfo)
 {
 	uint64_t obj;
 	int next_err = 0;
 
 	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
 		return (SET_ERROR(EINVAL));
 
 	for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj;
 	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs &&
 	    obj < DN_MAX_OBJECT && next_err == 0;
 	    next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
 		dmu_object_info_t doi;
 		int err;
 
 		err = dmu_object_info(rwa->os, obj, &doi);
 		if (err == ENOENT)
 			continue;
 		else if (err != 0)
 			return (err);
 
 		err = dmu_free_long_object(rwa->os, obj);
 
 		if (err != 0)
 			return (err);
 
 		if (rwa->or_need_sync == ORNS_MAYBE)
 			rwa->or_need_sync = ORNS_YES;
 	}
 	if (next_err != ESRCH)
 		return (next_err);
 	return (0);
 }
 
 /*
  * Note: if this fails, the caller will clean up any records left on the
  * rwa->write_batch list.
  */
 static int
 flush_write_batch_impl(struct receive_writer_arg *rwa)
 {
 	dnode_t *dn;
 	int err;
 
 	if (dnode_hold(rwa->os, rwa->last_object, FTAG, &dn) != 0)
 		return (SET_ERROR(EINVAL));
 
 	struct receive_record_arg *last_rrd = list_tail(&rwa->write_batch);
 	struct drr_write *last_drrw = &last_rrd->header.drr_u.drr_write;
 
 	struct receive_record_arg *first_rrd = list_head(&rwa->write_batch);
 	struct drr_write *first_drrw = &first_rrd->header.drr_u.drr_write;
 
 	ASSERT3U(rwa->last_object, ==, last_drrw->drr_object);
 	ASSERT3U(rwa->last_offset, ==, last_drrw->drr_offset);
 
 	dmu_tx_t *tx = dmu_tx_create(rwa->os);
 	dmu_tx_hold_write_by_dnode(tx, dn, first_drrw->drr_offset,
 	    last_drrw->drr_offset - first_drrw->drr_offset +
 	    last_drrw->drr_logical_size);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
 		dnode_rele(dn, FTAG);
 		return (err);
 	}
 
 	struct receive_record_arg *rrd;
 	while ((rrd = list_head(&rwa->write_batch)) != NULL) {
 		struct drr_write *drrw = &rrd->header.drr_u.drr_write;
 		abd_t *abd = rrd->abd;
 
 		ASSERT3U(drrw->drr_object, ==, rwa->last_object);
 
 		if (drrw->drr_logical_size != dn->dn_datablksz) {
 			/*
 			 * The WRITE record is larger than the object's block
 			 * size.  We must be receiving an incremental
 			 * large-block stream into a dataset that previously did
 			 * a non-large-block receive.  Lightweight writes must
 			 * be exactly one block, so we need to decompress the
 			 * data (if compressed) and do a normal dmu_write().
 			 */
 			ASSERT3U(drrw->drr_logical_size, >, dn->dn_datablksz);
 			if (DRR_WRITE_COMPRESSED(drrw)) {
 				abd_t *decomp_abd =
 				    abd_alloc_linear(drrw->drr_logical_size,
 				    B_FALSE);
 
 				err = zio_decompress_data(
 				    drrw->drr_compressiontype,
 				    abd, abd_to_buf(decomp_abd),
 				    abd_get_size(abd),
 				    abd_get_size(decomp_abd), NULL);
 
 				if (err == 0) {
 					dmu_write_by_dnode(dn,
 					    drrw->drr_offset,
 					    drrw->drr_logical_size,
 					    abd_to_buf(decomp_abd), tx);
 				}
 				abd_free(decomp_abd);
 			} else {
 				dmu_write_by_dnode(dn,
 				    drrw->drr_offset,
 				    drrw->drr_logical_size,
 				    abd_to_buf(abd), tx);
 			}
 			if (err == 0)
 				abd_free(abd);
 		} else {
 			zio_prop_t zp = {0};
 			dmu_write_policy(rwa->os, dn, 0, 0, &zp);
 
 			zio_flag_t zio_flags = 0;
 
 			if (rwa->raw) {
 				zp.zp_encrypt = B_TRUE;
 				zp.zp_compress = drrw->drr_compressiontype;
 				zp.zp_byteorder = ZFS_HOST_BYTEORDER ^
 				    !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^
 				    rwa->byteswap;
 				memcpy(zp.zp_salt, drrw->drr_salt,
 				    ZIO_DATA_SALT_LEN);
 				memcpy(zp.zp_iv, drrw->drr_iv,
 				    ZIO_DATA_IV_LEN);
 				memcpy(zp.zp_mac, drrw->drr_mac,
 				    ZIO_DATA_MAC_LEN);
 				if (DMU_OT_IS_ENCRYPTED(zp.zp_type)) {
 					zp.zp_nopwrite = B_FALSE;
 					zp.zp_copies = MIN(zp.zp_copies,
 					    SPA_DVAS_PER_BP - 1);
 				}
 				zio_flags |= ZIO_FLAG_RAW;
 			} else if (DRR_WRITE_COMPRESSED(drrw)) {
 				ASSERT3U(drrw->drr_compressed_size, >, 0);
 				ASSERT3U(drrw->drr_logical_size, >=,
 				    drrw->drr_compressed_size);
 				zp.zp_compress = drrw->drr_compressiontype;
 				zio_flags |= ZIO_FLAG_RAW_COMPRESS;
 			} else if (rwa->byteswap) {
 				/*
 				 * Note: compressed blocks never need to be
 				 * byteswapped, because WRITE records for
 				 * metadata blocks are never compressed. The
 				 * exception is raw streams, which are written
 				 * in the original byteorder, and the byteorder
 				 * bit is preserved in the BP by setting
 				 * zp_byteorder above.
 				 */
 				dmu_object_byteswap_t byteswap =
 				    DMU_OT_BYTESWAP(drrw->drr_type);
 				dmu_ot_byteswap[byteswap].ob_func(
 				    abd_to_buf(abd),
 				    DRR_WRITE_PAYLOAD_SIZE(drrw));
 			}
 
 			/*
 			 * Since this data can't be read until the receive
 			 * completes, we can do a "lightweight" write for
 			 * improved performance.
 			 */
 			err = dmu_lightweight_write_by_dnode(dn,
 			    drrw->drr_offset, abd, &zp, zio_flags, tx);
 		}
 
 		if (err != 0) {
 			/*
 			 * This rrd is left on the list, so the caller will
 			 * free it (and the abd).
 			 */
 			break;
 		}
 
 		/*
 		 * Note: If the receive fails, we want the resume stream to
 		 * start with the same record that we last successfully
 		 * received (as opposed to the next record), so that we can
 		 * verify that we are resuming from the correct location.
 		 */
 		save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx);
 
 		list_remove(&rwa->write_batch, rrd);
 		kmem_free(rrd, sizeof (*rrd));
 	}
 
 	dmu_tx_commit(tx);
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 noinline static int
 flush_write_batch(struct receive_writer_arg *rwa)
 {
 	if (list_is_empty(&rwa->write_batch))
 		return (0);
 	int err = rwa->err;
 	if (err == 0)
 		err = flush_write_batch_impl(rwa);
 	if (err != 0) {
 		struct receive_record_arg *rrd;
 		while ((rrd = list_remove_head(&rwa->write_batch)) != NULL) {
 			abd_free(rrd->abd);
 			kmem_free(rrd, sizeof (*rrd));
 		}
 	}
 	ASSERT(list_is_empty(&rwa->write_batch));
 	return (err);
 }
 
 noinline static int
 receive_process_write_record(struct receive_writer_arg *rwa,
     struct receive_record_arg *rrd)
 {
 	int err = 0;
 
 	ASSERT3U(rrd->header.drr_type, ==, DRR_WRITE);
 	struct drr_write *drrw = &rrd->header.drr_u.drr_write;
 
 	if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset ||
 	    !DMU_OT_IS_VALID(drrw->drr_type))
 		return (SET_ERROR(EINVAL));
 
 	if (rwa->heal) {
 		blkptr_t *bp;
 		dmu_buf_t *dbp;
 		int flags = DB_RF_CANFAIL;
 
 		if (rwa->raw)
 			flags |= DB_RF_NO_DECRYPT;
 
 		if (rwa->byteswap) {
 			dmu_object_byteswap_t byteswap =
 			    DMU_OT_BYTESWAP(drrw->drr_type);
 			dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(rrd->abd),
 			    DRR_WRITE_PAYLOAD_SIZE(drrw));
 		}
 
 		err = dmu_buf_hold_noread(rwa->os, drrw->drr_object,
 		    drrw->drr_offset, FTAG, &dbp);
 		if (err != 0)
 			return (err);
 
 		/* Try to read the object to see if it needs healing */
 		err = dbuf_read((dmu_buf_impl_t *)dbp, NULL, flags);
 		/*
 		 * We only try to heal when dbuf_read() returns a ECKSUMs.
 		 * Other errors (even EIO) get returned to caller.
 		 * EIO indicates that the device is not present/accessible,
 		 * so writing to it will likely fail.
 		 * If the block is healthy, we don't want to overwrite it
 		 * unnecessarily.
 		 */
 		if (err != ECKSUM) {
 			dmu_buf_rele(dbp, FTAG);
 			return (err);
 		}
 		/* Make sure the on-disk block and recv record sizes match */
 		if (drrw->drr_logical_size != dbp->db_size) {
 			err = ENOTSUP;
 			dmu_buf_rele(dbp, FTAG);
 			return (err);
 		}
 		/* Get the block pointer for the corrupted block */
 		bp = dmu_buf_get_blkptr(dbp);
 		err = do_corrective_recv(rwa, drrw, rrd, bp);
 		dmu_buf_rele(dbp, FTAG);
 		return (err);
 	}
 
 	/*
 	 * For resuming to work, records must be in increasing order
 	 * by (object, offset).
 	 */
 	if (drrw->drr_object < rwa->last_object ||
 	    (drrw->drr_object == rwa->last_object &&
 	    drrw->drr_offset < rwa->last_offset)) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	struct receive_record_arg *first_rrd = list_head(&rwa->write_batch);
 	struct drr_write *first_drrw = &first_rrd->header.drr_u.drr_write;
 	uint64_t batch_size =
 	    MIN(zfs_recv_write_batch_size, DMU_MAX_ACCESS / 2);
 	if (first_rrd != NULL &&
 	    (drrw->drr_object != first_drrw->drr_object ||
 	    drrw->drr_offset >= first_drrw->drr_offset + batch_size)) {
 		err = flush_write_batch(rwa);
 		if (err != 0)
 			return (err);
 	}
 
 	rwa->last_object = drrw->drr_object;
 	rwa->last_offset = drrw->drr_offset;
 
 	if (rwa->last_object > rwa->max_object)
 		rwa->max_object = rwa->last_object;
 
 	list_insert_tail(&rwa->write_batch, rrd);
 	/*
 	 * Return EAGAIN to indicate that we will use this rrd again,
 	 * so the caller should not free it
 	 */
 	return (EAGAIN);
 }
 
 static int
 receive_write_embedded(struct receive_writer_arg *rwa,
     struct drr_write_embedded *drrwe, void *data)
 {
 	dmu_tx_t *tx;
 	int err;
 
 	if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset)
 		return (SET_ERROR(EINVAL));
 
 	if (drrwe->drr_psize > BPE_PAYLOAD_SIZE)
 		return (SET_ERROR(EINVAL));
 
 	if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES)
 		return (SET_ERROR(EINVAL));
 	if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
 		return (SET_ERROR(EINVAL));
 	if (rwa->raw)
 		return (SET_ERROR(EINVAL));
 
 	if (drrwe->drr_object > rwa->max_object)
 		rwa->max_object = drrwe->drr_object;
 
 	tx = dmu_tx_create(rwa->os);
 
 	dmu_tx_hold_write(tx, drrwe->drr_object,
 	    drrwe->drr_offset, drrwe->drr_length);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
 		return (err);
 	}
 
 	dmu_write_embedded(rwa->os, drrwe->drr_object,
 	    drrwe->drr_offset, data, drrwe->drr_etype,
 	    drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize,
 	    rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx);
 
 	/* See comment in restore_write. */
 	save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx);
 	dmu_tx_commit(tx);
 	return (0);
 }
 
 static int
 receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
     abd_t *abd)
 {
 	dmu_buf_t *db, *db_spill;
 	int err;
 
 	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
 	    drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * This is an unmodified spill block which was added to the stream
 	 * to resolve an issue with incorrectly removing spill blocks.  It
 	 * should be ignored by current versions of the code which support
 	 * the DRR_FLAG_SPILL_BLOCK flag.
 	 */
 	if (rwa->spill && DRR_SPILL_IS_UNMODIFIED(drrs->drr_flags)) {
 		abd_free(abd);
 		return (0);
 	}
 
 	if (rwa->raw) {
 		if (!DMU_OT_IS_VALID(drrs->drr_type) ||
 		    drrs->drr_compressiontype >= ZIO_COMPRESS_FUNCTIONS ||
 		    drrs->drr_compressed_size == 0)
 			return (SET_ERROR(EINVAL));
 	}
 
 	if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
 	if (drrs->drr_object > rwa->max_object)
 		rwa->max_object = drrs->drr_object;
 
 	VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
 	if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT, FTAG,
 	    &db_spill)) != 0) {
 		dmu_buf_rele(db, FTAG);
 		return (err);
 	}
 
 	dmu_tx_t *tx = dmu_tx_create(rwa->os);
 
 	dmu_tx_hold_spill(tx, db->db_object);
 
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_buf_rele(db, FTAG);
 		dmu_buf_rele(db_spill, FTAG);
 		dmu_tx_abort(tx);
 		return (err);
 	}
 
 	/*
 	 * Spill blocks may both grow and shrink.  When a change in size
 	 * occurs any existing dbuf must be updated to match the logical
 	 * size of the provided arc_buf_t.
 	 */
 	if (db_spill->db_size != drrs->drr_length) {
 		dmu_buf_will_fill(db_spill, tx, B_FALSE);
 		VERIFY0(dbuf_spill_set_blksz(db_spill,
 		    drrs->drr_length, tx));
 	}
 
 	arc_buf_t *abuf;
 	if (rwa->raw) {
 		boolean_t byteorder = ZFS_HOST_BYTEORDER ^
 		    !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^
 		    rwa->byteswap;
 
 		abuf = arc_loan_raw_buf(dmu_objset_spa(rwa->os),
 		    drrs->drr_object, byteorder, drrs->drr_salt,
 		    drrs->drr_iv, drrs->drr_mac, drrs->drr_type,
 		    drrs->drr_compressed_size, drrs->drr_length,
 		    drrs->drr_compressiontype, 0);
 	} else {
 		abuf = arc_loan_buf(dmu_objset_spa(rwa->os),
 		    DMU_OT_IS_METADATA(drrs->drr_type),
 		    drrs->drr_length);
 		if (rwa->byteswap) {
 			dmu_object_byteswap_t byteswap =
 			    DMU_OT_BYTESWAP(drrs->drr_type);
 			dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(abd),
 			    DRR_SPILL_PAYLOAD_SIZE(drrs));
 		}
 	}
 
 	memcpy(abuf->b_data, abd_to_buf(abd), DRR_SPILL_PAYLOAD_SIZE(drrs));
 	abd_free(abd);
 	dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx);
 
 	dmu_buf_rele(db, FTAG);
 	dmu_buf_rele(db_spill, FTAG);
 
 	dmu_tx_commit(tx);
 	return (0);
 }
 
 noinline static int
 receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
 {
 	int err;
 
 	if (drrf->drr_length != -1ULL &&
 	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
 		return (SET_ERROR(EINVAL));
 
 	if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
 	if (drrf->drr_object > rwa->max_object)
 		rwa->max_object = drrf->drr_object;
 
 	err = dmu_free_long_range(rwa->os, drrf->drr_object,
 	    drrf->drr_offset, drrf->drr_length);
 
 	return (err);
 }
 
 static int
 receive_object_range(struct receive_writer_arg *rwa,
     struct drr_object_range *drror)
 {
 	/*
 	 * By default, we assume this block is in our native format
 	 * (ZFS_HOST_BYTEORDER). We then take into account whether
 	 * the send stream is byteswapped (rwa->byteswap). Finally,
 	 * we need to byteswap again if this particular block was
 	 * in non-native format on the send side.
 	 */
 	boolean_t byteorder = ZFS_HOST_BYTEORDER ^ rwa->byteswap ^
 	    !!DRR_IS_RAW_BYTESWAPPED(drror->drr_flags);
 
 	/*
 	 * Since dnode block sizes are constant, we should not need to worry
 	 * about making sure that the dnode block size is the same on the
 	 * sending and receiving sides for the time being. For non-raw sends,
 	 * this does not matter (and in fact we do not send a DRR_OBJECT_RANGE
 	 * record at all). Raw sends require this record type because the
 	 * encryption parameters are used to protect an entire block of bonus
 	 * buffers. If the size of dnode blocks ever becomes variable,
 	 * handling will need to be added to ensure that dnode block sizes
 	 * match on the sending and receiving side.
 	 */
 	if (drror->drr_numslots != DNODES_PER_BLOCK ||
 	    P2PHASE(drror->drr_firstobj, DNODES_PER_BLOCK) != 0 ||
 	    !rwa->raw)
 		return (SET_ERROR(EINVAL));
 
 	if (drror->drr_firstobj > rwa->max_object)
 		rwa->max_object = drror->drr_firstobj;
 
 	/*
 	 * The DRR_OBJECT_RANGE handling must be deferred to receive_object()
 	 * so that the block of dnodes is not written out when it's empty,
 	 * and converted to a HOLE BP.
 	 */
 	rwa->or_crypt_params_present = B_TRUE;
 	rwa->or_firstobj = drror->drr_firstobj;
 	rwa->or_numslots = drror->drr_numslots;
 	memcpy(rwa->or_salt, drror->drr_salt, ZIO_DATA_SALT_LEN);
 	memcpy(rwa->or_iv, drror->drr_iv, ZIO_DATA_IV_LEN);
 	memcpy(rwa->or_mac, drror->drr_mac, ZIO_DATA_MAC_LEN);
 	rwa->or_byteorder = byteorder;
 
 	rwa->or_need_sync = ORNS_MAYBE;
 
 	return (0);
 }
 
 /*
  * Until we have the ability to redact large ranges of data efficiently, we
  * process these records as frees.
  */
 noinline static int
 receive_redact(struct receive_writer_arg *rwa, struct drr_redact *drrr)
 {
 	struct drr_free drrf = {0};
 	drrf.drr_length = drrr->drr_length;
 	drrf.drr_object = drrr->drr_object;
 	drrf.drr_offset = drrr->drr_offset;
 	drrf.drr_toguid = drrr->drr_toguid;
 	return (receive_free(rwa, &drrf));
 }
 
 /* used to destroy the drc_ds on error */
 static void
 dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
 {
 	dsl_dataset_t *ds = drc->drc_ds;
 	ds_hold_flags_t dsflags;
 
 	dsflags = (drc->drc_raw) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT;
 	/*
 	 * Wait for the txg sync before cleaning up the receive. For
 	 * resumable receives, this ensures that our resume state has
 	 * been written out to disk. For raw receives, this ensures
 	 * that the user accounting code will not attempt to do anything
 	 * after we stopped receiving the dataset.
 	 */
 	txg_wait_synced(ds->ds_dir->dd_pool, 0);
 	ds->ds_objset->os_raw_receive = B_FALSE;
 
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 	if (drc->drc_resumable && drc->drc_should_save &&
 	    !BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) {
 		rrw_exit(&ds->ds_bp_rwlock, FTAG);
 		dsl_dataset_disown(ds, dsflags, dmu_recv_tag);
 	} else {
 		char name[ZFS_MAX_DATASET_NAME_LEN];
 		rrw_exit(&ds->ds_bp_rwlock, FTAG);
 		dsl_dataset_name(ds, name);
 		dsl_dataset_disown(ds, dsflags, dmu_recv_tag);
 		if (!drc->drc_heal)
 			(void) dsl_destroy_head(name);
 	}
 }
 
 static void
 receive_cksum(dmu_recv_cookie_t *drc, int len, void *buf)
 {
 	if (drc->drc_byteswap) {
 		(void) fletcher_4_incremental_byteswap(buf, len,
 		    &drc->drc_cksum);
 	} else {
 		(void) fletcher_4_incremental_native(buf, len, &drc->drc_cksum);
 	}
 }
 
 /*
  * Read the payload into a buffer of size len, and update the current record's
  * payload field.
  * Allocate drc->drc_next_rrd and read the next record's header into
  * drc->drc_next_rrd->header.
  * Verify checksum of payload and next record.
  */
 static int
 receive_read_payload_and_next_header(dmu_recv_cookie_t *drc, int len, void *buf)
 {
 	int err;
 
 	if (len != 0) {
 		ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
 		err = receive_read(drc, len, buf);
 		if (err != 0)
 			return (err);
 		receive_cksum(drc, len, buf);
 
 		/* note: rrd is NULL when reading the begin record's payload */
 		if (drc->drc_rrd != NULL) {
 			drc->drc_rrd->payload = buf;
 			drc->drc_rrd->payload_size = len;
 			drc->drc_rrd->bytes_read = drc->drc_bytes_read;
 		}
 	} else {
 		ASSERT3P(buf, ==, NULL);
 	}
 
 	drc->drc_prev_cksum = drc->drc_cksum;
 
 	drc->drc_next_rrd = kmem_zalloc(sizeof (*drc->drc_next_rrd), KM_SLEEP);
 	err = receive_read(drc, sizeof (drc->drc_next_rrd->header),
 	    &drc->drc_next_rrd->header);
 	drc->drc_next_rrd->bytes_read = drc->drc_bytes_read;
 
 	if (err != 0) {
 		kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
 		drc->drc_next_rrd = NULL;
 		return (err);
 	}
 	if (drc->drc_next_rrd->header.drr_type == DRR_BEGIN) {
 		kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
 		drc->drc_next_rrd = NULL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Note: checksum is of everything up to but not including the
 	 * checksum itself.
 	 */
 	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
 	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
 	receive_cksum(drc,
 	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
 	    &drc->drc_next_rrd->header);
 
 	zio_cksum_t cksum_orig =
 	    drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum;
 	zio_cksum_t *cksump =
 	    &drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum;
 
 	if (drc->drc_byteswap)
 		byteswap_record(&drc->drc_next_rrd->header);
 
 	if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) &&
 	    !ZIO_CHECKSUM_EQUAL(drc->drc_cksum, *cksump)) {
 		kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
 		drc->drc_next_rrd = NULL;
 		return (SET_ERROR(ECKSUM));
 	}
 
 	receive_cksum(drc, sizeof (cksum_orig), &cksum_orig);
 
 	return (0);
 }
 
 /*
  * Issue the prefetch reads for any necessary indirect blocks.
  *
  * We use the object ignore list to tell us whether or not to issue prefetches
  * for a given object.  We do this for both correctness (in case the blocksize
  * of an object has changed) and performance (if the object doesn't exist, don't
  * needlessly try to issue prefetches).  We also trim the list as we go through
  * the stream to prevent it from growing to an unbounded size.
  *
  * The object numbers within will always be in sorted order, and any write
  * records we see will also be in sorted order, but they're not sorted with
  * respect to each other (i.e. we can get several object records before
  * receiving each object's write records).  As a result, once we've reached a
  * given object number, we can safely remove any reference to lower object
  * numbers in the ignore list. In practice, we receive up to 32 object records
  * before receiving write records, so the list can have up to 32 nodes in it.
  */
 static void
 receive_read_prefetch(dmu_recv_cookie_t *drc, uint64_t object, uint64_t offset,
     uint64_t length)
 {
 	if (!objlist_exists(drc->drc_ignore_objlist, object)) {
 		dmu_prefetch(drc->drc_os, object, 1, offset, length,
 		    ZIO_PRIORITY_SYNC_READ);
 	}
 }
 
 /*
  * Read records off the stream, issuing any necessary prefetches.
  */
 static int
 receive_read_record(dmu_recv_cookie_t *drc)
 {
 	int err;
 
 	switch (drc->drc_rrd->header.drr_type) {
 	case DRR_OBJECT:
 	{
 		struct drr_object *drro =
 		    &drc->drc_rrd->header.drr_u.drr_object;
 		uint32_t size = DRR_OBJECT_PAYLOAD_SIZE(drro);
 		void *buf = NULL;
 		dmu_object_info_t doi;
 
 		if (size != 0)
 			buf = kmem_zalloc(size, KM_SLEEP);
 
 		err = receive_read_payload_and_next_header(drc, size, buf);
 		if (err != 0) {
 			kmem_free(buf, size);
 			return (err);
 		}
 		err = dmu_object_info(drc->drc_os, drro->drr_object, &doi);
 		/*
 		 * See receive_read_prefetch for an explanation why we're
 		 * storing this object in the ignore_obj_list.
 		 */
 		if (err == ENOENT || err == EEXIST ||
 		    (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
 			objlist_insert(drc->drc_ignore_objlist,
 			    drro->drr_object);
 			err = 0;
 		}
 		return (err);
 	}
 	case DRR_FREEOBJECTS:
 	{
 		err = receive_read_payload_and_next_header(drc, 0, NULL);
 		return (err);
 	}
 	case DRR_WRITE:
 	{
 		struct drr_write *drrw = &drc->drc_rrd->header.drr_u.drr_write;
 		int size = DRR_WRITE_PAYLOAD_SIZE(drrw);
 		abd_t *abd = abd_alloc_linear(size, B_FALSE);
 		err = receive_read_payload_and_next_header(drc, size,
 		    abd_to_buf(abd));
 		if (err != 0) {
 			abd_free(abd);
 			return (err);
 		}
 		drc->drc_rrd->abd = abd;
 		receive_read_prefetch(drc, drrw->drr_object, drrw->drr_offset,
 		    drrw->drr_logical_size);
 		return (err);
 	}
 	case DRR_WRITE_EMBEDDED:
 	{
 		struct drr_write_embedded *drrwe =
 		    &drc->drc_rrd->header.drr_u.drr_write_embedded;
 		uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8);
 		void *buf = kmem_zalloc(size, KM_SLEEP);
 
 		err = receive_read_payload_and_next_header(drc, size, buf);
 		if (err != 0) {
 			kmem_free(buf, size);
 			return (err);
 		}
 
 		receive_read_prefetch(drc, drrwe->drr_object, drrwe->drr_offset,
 		    drrwe->drr_length);
 		return (err);
 	}
 	case DRR_FREE:
 	case DRR_REDACT:
 	{
 		/*
 		 * It might be beneficial to prefetch indirect blocks here, but
 		 * we don't really have the data to decide for sure.
 		 */
 		err = receive_read_payload_and_next_header(drc, 0, NULL);
 		return (err);
 	}
 	case DRR_END:
 	{
 		struct drr_end *drre = &drc->drc_rrd->header.drr_u.drr_end;
 		if (!ZIO_CHECKSUM_EQUAL(drc->drc_prev_cksum,
 		    drre->drr_checksum))
 			return (SET_ERROR(ECKSUM));
 		return (0);
 	}
 	case DRR_SPILL:
 	{
 		struct drr_spill *drrs = &drc->drc_rrd->header.drr_u.drr_spill;
 		int size = DRR_SPILL_PAYLOAD_SIZE(drrs);
 		abd_t *abd = abd_alloc_linear(size, B_FALSE);
 		err = receive_read_payload_and_next_header(drc, size,
 		    abd_to_buf(abd));
 		if (err != 0)
 			abd_free(abd);
 		else
 			drc->drc_rrd->abd = abd;
 		return (err);
 	}
 	case DRR_OBJECT_RANGE:
 	{
 		err = receive_read_payload_and_next_header(drc, 0, NULL);
 		return (err);
 
 	}
 	default:
 		return (SET_ERROR(EINVAL));
 	}
 }
 
 
 
 static void
 dprintf_drr(struct receive_record_arg *rrd, int err)
 {
 #ifdef ZFS_DEBUG
 	switch (rrd->header.drr_type) {
 	case DRR_OBJECT:
 	{
 		struct drr_object *drro = &rrd->header.drr_u.drr_object;
 		dprintf("drr_type = OBJECT obj = %llu type = %u "
 		    "bonustype = %u blksz = %u bonuslen = %u cksumtype = %u "
 		    "compress = %u dn_slots = %u err = %d\n",
 		    (u_longlong_t)drro->drr_object, drro->drr_type,
 		    drro->drr_bonustype, drro->drr_blksz, drro->drr_bonuslen,
 		    drro->drr_checksumtype, drro->drr_compress,
 		    drro->drr_dn_slots, err);
 		break;
 	}
 	case DRR_FREEOBJECTS:
 	{
 		struct drr_freeobjects *drrfo =
 		    &rrd->header.drr_u.drr_freeobjects;
 		dprintf("drr_type = FREEOBJECTS firstobj = %llu "
 		    "numobjs = %llu err = %d\n",
 		    (u_longlong_t)drrfo->drr_firstobj,
 		    (u_longlong_t)drrfo->drr_numobjs, err);
 		break;
 	}
 	case DRR_WRITE:
 	{
 		struct drr_write *drrw = &rrd->header.drr_u.drr_write;
 		dprintf("drr_type = WRITE obj = %llu type = %u offset = %llu "
 		    "lsize = %llu cksumtype = %u flags = %u "
 		    "compress = %u psize = %llu err = %d\n",
 		    (u_longlong_t)drrw->drr_object, drrw->drr_type,
 		    (u_longlong_t)drrw->drr_offset,
 		    (u_longlong_t)drrw->drr_logical_size,
 		    drrw->drr_checksumtype, drrw->drr_flags,
 		    drrw->drr_compressiontype,
 		    (u_longlong_t)drrw->drr_compressed_size, err);
 		break;
 	}
 	case DRR_WRITE_BYREF:
 	{
 		struct drr_write_byref *drrwbr =
 		    &rrd->header.drr_u.drr_write_byref;
 		dprintf("drr_type = WRITE_BYREF obj = %llu offset = %llu "
 		    "length = %llu toguid = %llx refguid = %llx "
 		    "refobject = %llu refoffset = %llu cksumtype = %u "
 		    "flags = %u err = %d\n",
 		    (u_longlong_t)drrwbr->drr_object,
 		    (u_longlong_t)drrwbr->drr_offset,
 		    (u_longlong_t)drrwbr->drr_length,
 		    (u_longlong_t)drrwbr->drr_toguid,
 		    (u_longlong_t)drrwbr->drr_refguid,
 		    (u_longlong_t)drrwbr->drr_refobject,
 		    (u_longlong_t)drrwbr->drr_refoffset,
 		    drrwbr->drr_checksumtype, drrwbr->drr_flags, err);
 		break;
 	}
 	case DRR_WRITE_EMBEDDED:
 	{
 		struct drr_write_embedded *drrwe =
 		    &rrd->header.drr_u.drr_write_embedded;
 		dprintf("drr_type = WRITE_EMBEDDED obj = %llu offset = %llu "
 		    "length = %llu compress = %u etype = %u lsize = %u "
 		    "psize = %u err = %d\n",
 		    (u_longlong_t)drrwe->drr_object,
 		    (u_longlong_t)drrwe->drr_offset,
 		    (u_longlong_t)drrwe->drr_length,
 		    drrwe->drr_compression, drrwe->drr_etype,
 		    drrwe->drr_lsize, drrwe->drr_psize, err);
 		break;
 	}
 	case DRR_FREE:
 	{
 		struct drr_free *drrf = &rrd->header.drr_u.drr_free;
 		dprintf("drr_type = FREE obj = %llu offset = %llu "
 		    "length = %lld err = %d\n",
 		    (u_longlong_t)drrf->drr_object,
 		    (u_longlong_t)drrf->drr_offset,
 		    (longlong_t)drrf->drr_length,
 		    err);
 		break;
 	}
 	case DRR_SPILL:
 	{
 		struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
 		dprintf("drr_type = SPILL obj = %llu length = %llu "
 		    "err = %d\n", (u_longlong_t)drrs->drr_object,
 		    (u_longlong_t)drrs->drr_length, err);
 		break;
 	}
 	case DRR_OBJECT_RANGE:
 	{
 		struct drr_object_range *drror =
 		    &rrd->header.drr_u.drr_object_range;
 		dprintf("drr_type = OBJECT_RANGE firstobj = %llu "
 		    "numslots = %llu flags = %u err = %d\n",
 		    (u_longlong_t)drror->drr_firstobj,
 		    (u_longlong_t)drror->drr_numslots,
 		    drror->drr_flags, err);
 		break;
 	}
 	default:
 		return;
 	}
 #endif
 }
 
 /*
  * Commit the records to the pool.
  */
 static int
 receive_process_record(struct receive_writer_arg *rwa,
     struct receive_record_arg *rrd)
 {
 	int err;
 
 	/* Processing in order, therefore bytes_read should be increasing. */
 	ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
 	rwa->bytes_read = rrd->bytes_read;
 
 	/* We can only heal write records; other ones get ignored */
 	if (rwa->heal && rrd->header.drr_type != DRR_WRITE) {
 		if (rrd->abd != NULL) {
 			abd_free(rrd->abd);
 			rrd->abd = NULL;
 		} else if (rrd->payload != NULL) {
 			kmem_free(rrd->payload, rrd->payload_size);
 			rrd->payload = NULL;
 		}
 		return (0);
 	}
 
 	if (!rwa->heal && rrd->header.drr_type != DRR_WRITE) {
 		err = flush_write_batch(rwa);
 		if (err != 0) {
 			if (rrd->abd != NULL) {
 				abd_free(rrd->abd);
 				rrd->abd = NULL;
 				rrd->payload = NULL;
 			} else if (rrd->payload != NULL) {
 				kmem_free(rrd->payload, rrd->payload_size);
 				rrd->payload = NULL;
 			}
 
 			return (err);
 		}
 	}
 
 	switch (rrd->header.drr_type) {
 	case DRR_OBJECT:
 	{
 		struct drr_object *drro = &rrd->header.drr_u.drr_object;
 		err = receive_object(rwa, drro, rrd->payload);
 		kmem_free(rrd->payload, rrd->payload_size);
 		rrd->payload = NULL;
 		break;
 	}
 	case DRR_FREEOBJECTS:
 	{
 		struct drr_freeobjects *drrfo =
 		    &rrd->header.drr_u.drr_freeobjects;
 		err = receive_freeobjects(rwa, drrfo);
 		break;
 	}
 	case DRR_WRITE:
 	{
 		err = receive_process_write_record(rwa, rrd);
 		if (rwa->heal) {
 			/*
 			 * If healing - always free the abd after processing
 			 */
 			abd_free(rrd->abd);
 			rrd->abd = NULL;
 		} else if (err != EAGAIN) {
 			/*
 			 * On success, a non-healing
 			 * receive_process_write_record() returns
 			 * EAGAIN to indicate that we do not want to free
 			 * the rrd or arc_buf.
 			 */
 			ASSERT(err != 0);
 			abd_free(rrd->abd);
 			rrd->abd = NULL;
 		}
 		break;
 	}
 	case DRR_WRITE_EMBEDDED:
 	{
 		struct drr_write_embedded *drrwe =
 		    &rrd->header.drr_u.drr_write_embedded;
 		err = receive_write_embedded(rwa, drrwe, rrd->payload);
 		kmem_free(rrd->payload, rrd->payload_size);
 		rrd->payload = NULL;
 		break;
 	}
 	case DRR_FREE:
 	{
 		struct drr_free *drrf = &rrd->header.drr_u.drr_free;
 		err = receive_free(rwa, drrf);
 		break;
 	}
 	case DRR_SPILL:
 	{
 		struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
 		err = receive_spill(rwa, drrs, rrd->abd);
 		if (err != 0)
 			abd_free(rrd->abd);
 		rrd->abd = NULL;
 		rrd->payload = NULL;
 		break;
 	}
 	case DRR_OBJECT_RANGE:
 	{
 		struct drr_object_range *drror =
 		    &rrd->header.drr_u.drr_object_range;
 		err = receive_object_range(rwa, drror);
 		break;
 	}
 	case DRR_REDACT:
 	{
 		struct drr_redact *drrr = &rrd->header.drr_u.drr_redact;
 		err = receive_redact(rwa, drrr);
 		break;
 	}
 	default:
 		err = (SET_ERROR(EINVAL));
 	}
 
 	if (err != 0)
 		dprintf_drr(rrd, err);
 
 	return (err);
 }
 
 /*
  * dmu_recv_stream's worker thread; pull records off the queue, and then call
  * receive_process_record  When we're done, signal the main thread and exit.
  */
 static __attribute__((noreturn)) void
 receive_writer_thread(void *arg)
 {
 	struct receive_writer_arg *rwa = arg;
 	struct receive_record_arg *rrd;
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker;
 	    rrd = bqueue_dequeue(&rwa->q)) {
 		/*
 		 * If there's an error, the main thread will stop putting things
 		 * on the queue, but we need to clear everything in it before we
 		 * can exit.
 		 */
 		int err = 0;
 		if (rwa->err == 0) {
 			err = receive_process_record(rwa, rrd);
 		} else if (rrd->abd != NULL) {
 			abd_free(rrd->abd);
 			rrd->abd = NULL;
 			rrd->payload = NULL;
 		} else if (rrd->payload != NULL) {
 			kmem_free(rrd->payload, rrd->payload_size);
 			rrd->payload = NULL;
 		}
 		/*
 		 * EAGAIN indicates that this record has been saved (on
 		 * raw->write_batch), and will be used again, so we don't
 		 * free it.
 		 * When healing data we always need to free the record.
 		 */
 		if (err != EAGAIN || rwa->heal) {
 			if (rwa->err == 0)
 				rwa->err = err;
 			kmem_free(rrd, sizeof (*rrd));
 		}
 	}
 	kmem_free(rrd, sizeof (*rrd));
 
 	if (rwa->heal) {
 		zio_wait(rwa->heal_pio);
 	} else {
 		int err = flush_write_batch(rwa);
 		if (rwa->err == 0)
 			rwa->err = err;
 	}
 	mutex_enter(&rwa->mutex);
 	rwa->done = B_TRUE;
 	cv_signal(&rwa->cv);
 	mutex_exit(&rwa->mutex);
 	spl_fstrans_unmark(cookie);
 	thread_exit();
 }
 
 static int
 resume_check(dmu_recv_cookie_t *drc, nvlist_t *begin_nvl)
 {
 	uint64_t val;
 	objset_t *mos = dmu_objset_pool(drc->drc_os)->dp_meta_objset;
 	uint64_t dsobj = dmu_objset_id(drc->drc_os);
 	uint64_t resume_obj, resume_off;
 
 	if (nvlist_lookup_uint64(begin_nvl,
 	    "resume_object", &resume_obj) != 0 ||
 	    nvlist_lookup_uint64(begin_nvl,
 	    "resume_offset", &resume_off) != 0) {
 		return (SET_ERROR(EINVAL));
 	}
 	VERIFY0(zap_lookup(mos, dsobj,
 	    DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val));
 	if (resume_obj != val)
 		return (SET_ERROR(EINVAL));
 	VERIFY0(zap_lookup(mos, dsobj,
 	    DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val));
 	if (resume_off != val)
 		return (SET_ERROR(EINVAL));
 
 	return (0);
 }
 
 /*
  * Read in the stream's records, one by one, and apply them to the pool.  There
  * are two threads involved; the thread that calls this function will spin up a
  * worker thread, read the records off the stream one by one, and issue
  * prefetches for any necessary indirect blocks.  It will then push the records
  * onto an internal blocking queue.  The worker thread will pull the records off
  * the queue, and actually write the data into the DMU.  This way, the worker
  * thread doesn't have to wait for reads to complete, since everything it needs
  * (the indirect blocks) will be prefetched.
  *
  * NB: callers *must* call dmu_recv_end() if this succeeds.
  */
 int
 dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp)
 {
 	int err = 0;
 	struct receive_writer_arg *rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP);
 
 	if (dsl_dataset_has_resume_receive_state(drc->drc_ds)) {
 		uint64_t bytes = 0;
 		(void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset,
 		    drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES,
 		    sizeof (bytes), 1, &bytes);
 		drc->drc_bytes_read += bytes;
 	}
 
 	drc->drc_ignore_objlist = objlist_create();
 
 	/* these were verified in dmu_recv_begin */
 	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
 	    DMU_SUBSTREAM);
 	ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);
 
 	ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
 	ASSERT0(drc->drc_os->os_encrypted &&
 	    (drc->drc_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA));
 
 	/* handle DSL encryption key payload */
 	if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) {
 		nvlist_t *keynvl = NULL;
 
 		ASSERT(drc->drc_os->os_encrypted);
 		ASSERT(drc->drc_raw);
 
 		err = nvlist_lookup_nvlist(drc->drc_begin_nvl, "crypt_keydata",
 		    &keynvl);
 		if (err != 0)
 			goto out;
 
 		if (!drc->drc_heal) {
 			/*
 			 * If this is a new dataset we set the key immediately.
 			 * Otherwise we don't want to change the key until we
 			 * are sure the rest of the receive succeeded so we
 			 * stash the keynvl away until then.
 			 */
 			err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa),
 			    drc->drc_ds->ds_object, drc->drc_fromsnapobj,
 			    drc->drc_drrb->drr_type, keynvl, drc->drc_newfs);
 			if (err != 0)
 				goto out;
 		}
 
 		/* see comment in dmu_recv_end_sync() */
 		drc->drc_ivset_guid = 0;
 		(void) nvlist_lookup_uint64(keynvl, "to_ivset_guid",
 		    &drc->drc_ivset_guid);
 
 		if (!drc->drc_newfs)
 			drc->drc_keynvl = fnvlist_dup(keynvl);
 	}
 
 	if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) {
 		err = resume_check(drc, drc->drc_begin_nvl);
 		if (err != 0)
 			goto out;
 	}
 
 	/*
 	 * For compatibility with recursive send streams, we do this here,
 	 * rather than in dmu_recv_begin. If we pull the next header too
 	 * early, and it's the END record, we break the `recv_skip` logic.
 	 */
 	if (drc->drc_drr_begin->drr_payloadlen == 0) {
 		err = receive_read_payload_and_next_header(drc, 0, NULL);
 		if (err != 0)
 			goto out;
 	}
 
 	/*
 	 * If we failed before this point we will clean up any new resume
 	 * state that was created. Now that we've gotten past the initial
 	 * checks we are ok to retain that resume state.
 	 */
 	drc->drc_should_save = B_TRUE;
 
 	(void) bqueue_init(&rwa->q, zfs_recv_queue_ff,
 	    MAX(zfs_recv_queue_length, 2 * zfs_max_recordsize),
 	    offsetof(struct receive_record_arg, node));
 	cv_init(&rwa->cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL);
 	rwa->os = drc->drc_os;
 	rwa->byteswap = drc->drc_byteswap;
 	rwa->heal = drc->drc_heal;
 	rwa->tofs = drc->drc_tofs;
 	rwa->resumable = drc->drc_resumable;
 	rwa->raw = drc->drc_raw;
 	rwa->spill = drc->drc_spill;
 	rwa->full = (drc->drc_drr_begin->drr_u.drr_begin.drr_fromguid == 0);
 	rwa->os->os_raw_receive = drc->drc_raw;
 	if (drc->drc_heal) {
 		rwa->heal_pio = zio_root(drc->drc_os->os_spa, NULL, NULL,
 		    ZIO_FLAG_GODFATHER);
 	}
 	list_create(&rwa->write_batch, sizeof (struct receive_record_arg),
 	    offsetof(struct receive_record_arg, node.bqn_node));
 
 	(void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc,
 	    TS_RUN, minclsyspri);
 	/*
 	 * We're reading rwa->err without locks, which is safe since we are the
 	 * only reader, and the worker thread is the only writer.  It's ok if we
 	 * miss a write for an iteration or two of the loop, since the writer
 	 * thread will keep freeing records we send it until we send it an eos
 	 * marker.
 	 *
 	 * We can leave this loop in 3 ways:  First, if rwa->err is
 	 * non-zero.  In that case, the writer thread will free the rrd we just
 	 * pushed.  Second, if  we're interrupted; in that case, either it's the
 	 * first loop and drc->drc_rrd was never allocated, or it's later, and
 	 * drc->drc_rrd has been handed off to the writer thread who will free
 	 * it.  Finally, if receive_read_record fails or we're at the end of the
 	 * stream, then we free drc->drc_rrd and exit.
 	 */
 	while (rwa->err == 0) {
 		if (issig(JUSTLOOKING) && issig(FORREAL)) {
 			err = SET_ERROR(EINTR);
 			break;
 		}
 
 		ASSERT3P(drc->drc_rrd, ==, NULL);
 		drc->drc_rrd = drc->drc_next_rrd;
 		drc->drc_next_rrd = NULL;
 		/* Allocates and loads header into drc->drc_next_rrd */
 		err = receive_read_record(drc);
 
 		if (drc->drc_rrd->header.drr_type == DRR_END || err != 0) {
 			kmem_free(drc->drc_rrd, sizeof (*drc->drc_rrd));
 			drc->drc_rrd = NULL;
 			break;
 		}
 
 		bqueue_enqueue(&rwa->q, drc->drc_rrd,
 		    sizeof (struct receive_record_arg) +
 		    drc->drc_rrd->payload_size);
 		drc->drc_rrd = NULL;
 	}
 
 	ASSERT3P(drc->drc_rrd, ==, NULL);
 	drc->drc_rrd = kmem_zalloc(sizeof (*drc->drc_rrd), KM_SLEEP);
 	drc->drc_rrd->eos_marker = B_TRUE;
 	bqueue_enqueue_flush(&rwa->q, drc->drc_rrd, 1);
 
 	mutex_enter(&rwa->mutex);
 	while (!rwa->done) {
 		/*
 		 * We need to use cv_wait_sig() so that any process that may
 		 * be sleeping here can still fork.
 		 */
 		(void) cv_wait_sig(&rwa->cv, &rwa->mutex);
 	}
 	mutex_exit(&rwa->mutex);
 
 	/*
 	 * If we are receiving a full stream as a clone, all object IDs which
 	 * are greater than the maximum ID referenced in the stream are
 	 * by definition unused and must be freed.
 	 */
 	if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) {
 		uint64_t obj = rwa->max_object + 1;
 		int free_err = 0;
 		int next_err = 0;
 
 		while (next_err == 0) {
 			free_err = dmu_free_long_object(rwa->os, obj);
 			if (free_err != 0 && free_err != ENOENT)
 				break;
 
 			next_err = dmu_object_next(rwa->os, &obj, FALSE, 0);
 		}
 
 		if (err == 0) {
 			if (free_err != 0 && free_err != ENOENT)
 				err = free_err;
 			else if (next_err != ESRCH)
 				err = next_err;
 		}
 	}
 
 	cv_destroy(&rwa->cv);
 	mutex_destroy(&rwa->mutex);
 	bqueue_destroy(&rwa->q);
 	list_destroy(&rwa->write_batch);
 	if (err == 0)
 		err = rwa->err;
 
 out:
 	/*
 	 * If we hit an error before we started the receive_writer_thread
 	 * we need to clean up the next_rrd we create by processing the
 	 * DRR_BEGIN record.
 	 */
 	if (drc->drc_next_rrd != NULL)
 		kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
 
 	/*
 	 * The objset will be invalidated by dmu_recv_end() when we do
 	 * dsl_dataset_clone_swap_sync_impl().
 	 */
 	drc->drc_os = NULL;
 
 	kmem_free(rwa, sizeof (*rwa));
 	nvlist_free(drc->drc_begin_nvl);
 
 	if (err != 0) {
 		/*
 		 * Clean up references. If receive is not resumable,
 		 * destroy what we created, so we don't leave it in
 		 * the inconsistent state.
 		 */
 		dmu_recv_cleanup_ds(drc);
 		nvlist_free(drc->drc_keynvl);
 	}
 
 	objlist_destroy(drc->drc_ignore_objlist);
 	drc->drc_ignore_objlist = NULL;
 	*voffp = drc->drc_voff;
 	return (err);
 }
 
 static int
 dmu_recv_end_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_cookie_t *drc = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	int error;
 
 	ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);
 
 	if (drc->drc_heal) {
 		error = 0;
 	} else if (!drc->drc_newfs) {
 		dsl_dataset_t *origin_head;
 
 		error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
 		if (error != 0)
 			return (error);
 		if (drc->drc_force) {
 			/*
 			 * We will destroy any snapshots in tofs (i.e. before
 			 * origin_head) that are after the origin (which is
 			 * the snap before drc_ds, because drc_ds can not
 			 * have any snaps of its own).
 			 */
 			uint64_t obj;
 
 			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
 			while (obj !=
 			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
 				dsl_dataset_t *snap;
 				error = dsl_dataset_hold_obj(dp, obj, FTAG,
 				    &snap);
 				if (error != 0)
 					break;
 				if (snap->ds_dir != origin_head->ds_dir)
 					error = SET_ERROR(EINVAL);
 				if (error == 0)  {
 					error = dsl_destroy_snapshot_check_impl(
 					    snap, B_FALSE);
 				}
 				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
 				dsl_dataset_rele(snap, FTAG);
 				if (error != 0)
 					break;
 			}
 			if (error != 0) {
 				dsl_dataset_rele(origin_head, FTAG);
 				return (error);
 			}
 		}
 		if (drc->drc_keynvl != NULL) {
 			error = dsl_crypto_recv_raw_key_check(drc->drc_ds,
 			    drc->drc_keynvl, tx);
 			if (error != 0) {
 				dsl_dataset_rele(origin_head, FTAG);
 				return (error);
 			}
 		}
 
 		error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
 		    origin_head, drc->drc_force, drc->drc_owner, tx);
 		if (error != 0) {
 			dsl_dataset_rele(origin_head, FTAG);
 			return (error);
 		}
 		error = dsl_dataset_snapshot_check_impl(origin_head,
 		    drc->drc_tosnap, tx, B_TRUE, 1,
 		    drc->drc_cred, drc->drc_proc);
 		dsl_dataset_rele(origin_head, FTAG);
 		if (error != 0)
 			return (error);
 
 		error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
 	} else {
 		error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
 		    drc->drc_tosnap, tx, B_TRUE, 1,
 		    drc->drc_cred, drc->drc_proc);
 	}
 	return (error);
 }
 
 static void
 dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_cookie_t *drc = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	boolean_t encrypted = drc->drc_ds->ds_dir->dd_crypto_obj != 0;
 	uint64_t newsnapobj = 0;
 
 	spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
 	    tx, "snap=%s", drc->drc_tosnap);
 	drc->drc_ds->ds_objset->os_raw_receive = B_FALSE;
 
 	if (drc->drc_heal) {
 		if (drc->drc_keynvl != NULL) {
 			nvlist_free(drc->drc_keynvl);
 			drc->drc_keynvl = NULL;
 		}
 	} else if (!drc->drc_newfs) {
 		dsl_dataset_t *origin_head;
 
 		VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
 		    &origin_head));
 
 		if (drc->drc_force) {
 			/*
 			 * Destroy any snapshots of drc_tofs (origin_head)
 			 * after the origin (the snap before drc_ds).
 			 */
 			uint64_t obj;
 
 			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
 			while (obj !=
 			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
 				dsl_dataset_t *snap;
 				VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
 				    &snap));
 				ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
 				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
 				dsl_destroy_snapshot_sync_impl(snap,
 				    B_FALSE, tx);
 				dsl_dataset_rele(snap, FTAG);
 			}
 		}
 		if (drc->drc_keynvl != NULL) {
 			dsl_crypto_recv_raw_key_sync(drc->drc_ds,
 			    drc->drc_keynvl, tx);
 			nvlist_free(drc->drc_keynvl);
 			drc->drc_keynvl = NULL;
 		}
 
 		VERIFY3P(drc->drc_ds->ds_prev, ==,
 		    origin_head->ds_prev);
 
 		dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
 		    origin_head, tx);
 		/*
 		 * The objset was evicted by dsl_dataset_clone_swap_sync_impl,
 		 * so drc_os is no longer valid.
 		 */
 		drc->drc_os = NULL;
 
 		dsl_dataset_snapshot_sync_impl(origin_head,
 		    drc->drc_tosnap, tx);
 
 		/* set snapshot's creation time and guid */
 		dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
 		dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
 		    drc->drc_drrb->drr_creation_time;
 		dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
 		    drc->drc_drrb->drr_toguid;
 		dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
 		    ~DS_FLAG_INCONSISTENT;
 
 		dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
 		dsl_dataset_phys(origin_head)->ds_flags &=
 		    ~DS_FLAG_INCONSISTENT;
 
 		newsnapobj =
 		    dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
 
 		dsl_dataset_rele(origin_head, FTAG);
 		dsl_destroy_head_sync_impl(drc->drc_ds, tx);
 
 		if (drc->drc_owner != NULL)
 			VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
 	} else {
 		dsl_dataset_t *ds = drc->drc_ds;
 
 		dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);
 
 		/* set snapshot's creation time and guid */
 		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 		dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
 		    drc->drc_drrb->drr_creation_time;
 		dsl_dataset_phys(ds->ds_prev)->ds_guid =
 		    drc->drc_drrb->drr_toguid;
 		dsl_dataset_phys(ds->ds_prev)->ds_flags &=
 		    ~DS_FLAG_INCONSISTENT;
 
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
 		if (dsl_dataset_has_resume_receive_state(ds)) {
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_FROMGUID, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_OBJECT, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_OFFSET, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_BYTES, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_TOGUID, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_TONAME, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, tx);
 		}
 		newsnapobj =
 		    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
 	}
 
 	/*
 	 * If this is a raw receive, the crypt_keydata nvlist will include
 	 * a to_ivset_guid for us to set on the new snapshot. This value
 	 * will override the value generated by the snapshot code. However,
 	 * this value may not be present, because older implementations of
 	 * the raw send code did not include this value, and we are still
 	 * allowed to receive them if the zfs_disable_ivset_guid_check
 	 * tunable is set, in which case we will leave the newly-generated
 	 * value.
 	 */
 	if (!drc->drc_heal && drc->drc_raw && drc->drc_ivset_guid != 0) {
 		dmu_object_zapify(dp->dp_meta_objset, newsnapobj,
 		    DMU_OT_DSL_DATASET, tx);
 		VERIFY0(zap_update(dp->dp_meta_objset, newsnapobj,
 		    DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1,
 		    &drc->drc_ivset_guid, tx));
 	}
 
 	/*
 	 * Release the hold from dmu_recv_begin.  This must be done before
 	 * we return to open context, so that when we free the dataset's dnode
 	 * we can evict its bonus buffer. Since the dataset may be destroyed
 	 * at this point (and therefore won't have a valid pointer to the spa)
 	 * we release the key mapping manually here while we do have a valid
 	 * pointer, if it exists.
 	 */
 	if (!drc->drc_raw && encrypted) {
 		(void) spa_keystore_remove_mapping(dmu_tx_pool(tx)->dp_spa,
 		    drc->drc_ds->ds_object, drc->drc_ds);
 	}
 	dsl_dataset_disown(drc->drc_ds, 0, dmu_recv_tag);
 	drc->drc_ds = NULL;
 }
 
 static int dmu_recv_end_modified_blocks = 3;
 
 static int
 dmu_recv_existing_end(dmu_recv_cookie_t *drc)
 {
 #ifdef _KERNEL
 	/*
 	 * We will be destroying the ds; make sure its origin is unmounted if
 	 * necessary.
 	 */
 	char name[ZFS_MAX_DATASET_NAME_LEN];
 	dsl_dataset_name(drc->drc_ds, name);
 	zfs_destroy_unmount_origin(name);
 #endif
 
 	return (dsl_sync_task(drc->drc_tofs,
 	    dmu_recv_end_check, dmu_recv_end_sync, drc,
 	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
 }
 
 static int
 dmu_recv_new_end(dmu_recv_cookie_t *drc)
 {
 	return (dsl_sync_task(drc->drc_tofs,
 	    dmu_recv_end_check, dmu_recv_end_sync, drc,
 	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
 }
 
 int
 dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
 {
 	int error;
 
 	drc->drc_owner = owner;
 
 	if (drc->drc_newfs)
 		error = dmu_recv_new_end(drc);
 	else
 		error = dmu_recv_existing_end(drc);
 
 	if (error != 0) {
 		dmu_recv_cleanup_ds(drc);
 		nvlist_free(drc->drc_keynvl);
 	} else if (!drc->drc_heal) {
 		if (drc->drc_newfs) {
 			zvol_create_minor(drc->drc_tofs);
 		}
 		char *snapname = kmem_asprintf("%s@%s",
 		    drc->drc_tofs, drc->drc_tosnap);
 		zvol_create_minor(snapname);
 		kmem_strfree(snapname);
 	}
 	return (error);
 }
 
 /*
  * Return TRUE if this objset is currently being received into.
  */
 boolean_t
 dmu_objset_is_receiving(objset_t *os)
 {
 	return (os->os_dsl_dataset != NULL &&
 	    os->os_dsl_dataset->ds_owner == dmu_recv_tag);
 }
 
 ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_length, UINT, ZMOD_RW,
 	"Maximum receive queue length");
 
 ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_ff, UINT, ZMOD_RW,
 	"Receive queue fill fraction");
 
 ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, write_batch_size, UINT, ZMOD_RW,
 	"Maximum amount of writes to batch into one transaction");
 
 ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, best_effort_corrective, INT, ZMOD_RW,
 	"Ignore errors during corrective receive");
 /* END CSTYLED */
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index 37c68528bf95..b6cc2f0a5e91 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -1,3122 +1,3122 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  * Copyright 2014 HybridCluster. All rights reserved.
  * Copyright 2016 RackTop Systems.
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
  * Copyright (c) 2019, Klara Inc.
  * Copyright (c) 2019, Allan Jude
  */
 
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/spa_impl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zio_checksum.h>
 #include <sys/zfs_znode.h>
 #include <zfs_fletcher.h>
 #include <sys/avl.h>
 #include <sys/ddt.h>
 #include <sys/zfs_onexit.h>
 #include <sys/dmu_send.h>
 #include <sys/dmu_recv.h>
 #include <sys/dsl_destroy.h>
 #include <sys/blkptr.h>
 #include <sys/dsl_bookmark.h>
 #include <sys/zfeature.h>
 #include <sys/bqueue.h>
 #include <sys/zvol.h>
 #include <sys/policy.h>
 #include <sys/objlist.h>
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
 
 /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
 static int zfs_send_corrupt_data = B_FALSE;
 /*
  * This tunable controls the amount of data (measured in bytes) that will be
  * prefetched by zfs send.  If the main thread is blocking on reads that haven't
  * completed, this variable might need to be increased.  If instead the main
  * thread is issuing new reads because the prefetches have fallen out of the
  * cache, this may need to be decreased.
  */
 static uint_t zfs_send_queue_length = SPA_MAXBLOCKSIZE;
 /*
  * This tunable controls the length of the queues that zfs send worker threads
  * use to communicate.  If the send_main_thread is blocking on these queues,
  * this variable may need to be increased.  If there is a significant slowdown
  * at the start of a send as these threads consume all the available IO
  * resources, this variable may need to be decreased.
  */
 static uint_t zfs_send_no_prefetch_queue_length = 1024 * 1024;
 /*
  * These tunables control the fill fraction of the queues by zfs send.  The fill
  * fraction controls the frequency with which threads have to be cv_signaled.
  * If a lot of cpu time is being spent on cv_signal, then these should be tuned
  * down.  If the queues empty before the signalled thread can catch up, then
  * these should be tuned up.
  */
 static uint_t zfs_send_queue_ff = 20;
 static uint_t zfs_send_no_prefetch_queue_ff = 20;
 
 /*
  * Use this to override the recordsize calculation for fast zfs send estimates.
  */
 static uint_t zfs_override_estimate_recordsize = 0;
 
 /* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
 static const boolean_t zfs_send_set_freerecords_bit = B_TRUE;
 
 /* Set this tunable to FALSE is disable sending unmodified spill blocks. */
 static int zfs_send_unmodified_spill_blocks = B_TRUE;
 
 static inline boolean_t
 overflow_multiply(uint64_t a, uint64_t b, uint64_t *c)
 {
 	uint64_t temp = a * b;
 	if (b != 0 && temp / b != a)
 		return (B_FALSE);
 	*c = temp;
 	return (B_TRUE);
 }
 
 struct send_thread_arg {
 	bqueue_t	q;
 	objset_t	*os;		/* Objset to traverse */
 	uint64_t	fromtxg;	/* Traverse from this txg */
 	int		flags;		/* flags to pass to traverse_dataset */
 	int		error_code;
 	boolean_t	cancel;
 	zbookmark_phys_t resume;
 	uint64_t	*num_blocks_visited;
 };
 
 struct redact_list_thread_arg {
 	boolean_t		cancel;
 	bqueue_t		q;
 	zbookmark_phys_t	resume;
 	redaction_list_t	*rl;
 	boolean_t		mark_redact;
 	int			error_code;
 	uint64_t		*num_blocks_visited;
 };
 
 struct send_merge_thread_arg {
 	bqueue_t			q;
 	objset_t			*os;
 	struct redact_list_thread_arg	*from_arg;
 	struct send_thread_arg		*to_arg;
 	struct redact_list_thread_arg	*redact_arg;
 	int				error;
 	boolean_t			cancel;
 };
 
 struct send_range {
 	boolean_t		eos_marker; /* Marks the end of the stream */
 	uint64_t		object;
 	uint64_t		start_blkid;
 	uint64_t		end_blkid;
 	bqueue_node_t		ln;
 	enum type {DATA, HOLE, OBJECT, OBJECT_RANGE, REDACT,
 	    PREVIOUSLY_REDACTED} type;
 	union {
 		struct srd {
 			dmu_object_type_t	obj_type;
 			uint32_t		datablksz; // logical size
 			uint32_t		datasz; // payload size
 			blkptr_t		bp;
 			arc_buf_t		*abuf;
 			abd_t			*abd;
 			kmutex_t		lock;
 			kcondvar_t		cv;
 			boolean_t		io_outstanding;
 			boolean_t		io_compressed;
 			int			io_err;
 		} data;
 		struct srh {
 			uint32_t		datablksz;
 		} hole;
 		struct sro {
 			/*
 			 * This is a pointer because embedding it in the
 			 * struct causes these structures to be massively larger
 			 * for all range types; this makes the code much less
 			 * memory efficient.
 			 */
 			dnode_phys_t		*dnp;
 			blkptr_t		bp;
 		} object;
 		struct srr {
 			uint32_t		datablksz;
 		} redact;
 		struct sror {
 			blkptr_t		bp;
 		} object_range;
 	} sru;
 };
 
 /*
  * The list of data whose inclusion in a send stream can be pending from
  * one call to backup_cb to another.  Multiple calls to dump_free(),
  * dump_freeobjects(), and dump_redact() can be aggregated into a single
  * DRR_FREE, DRR_FREEOBJECTS, or DRR_REDACT replay record.
  */
 typedef enum {
 	PENDING_NONE,
 	PENDING_FREE,
 	PENDING_FREEOBJECTS,
 	PENDING_REDACT
 } dmu_pendop_t;
 
 typedef struct dmu_send_cookie {
 	dmu_replay_record_t *dsc_drr;
 	dmu_send_outparams_t *dsc_dso;
 	offset_t *dsc_off;
 	objset_t *dsc_os;
 	zio_cksum_t dsc_zc;
 	uint64_t dsc_toguid;
 	uint64_t dsc_fromtxg;
 	int dsc_err;
 	dmu_pendop_t dsc_pending_op;
 	uint64_t dsc_featureflags;
 	uint64_t dsc_last_data_object;
 	uint64_t dsc_last_data_offset;
 	uint64_t dsc_resume_object;
 	uint64_t dsc_resume_offset;
 	boolean_t dsc_sent_begin;
 	boolean_t dsc_sent_end;
 } dmu_send_cookie_t;
 
 static int do_dump(dmu_send_cookie_t *dscp, struct send_range *range);
 
 static void
 range_free(struct send_range *range)
 {
 	if (range->type == OBJECT) {
 		size_t size = sizeof (dnode_phys_t) *
 		    (range->sru.object.dnp->dn_extra_slots + 1);
 		kmem_free(range->sru.object.dnp, size);
 	} else if (range->type == DATA) {
 		mutex_enter(&range->sru.data.lock);
 		while (range->sru.data.io_outstanding)
 			cv_wait(&range->sru.data.cv, &range->sru.data.lock);
 		if (range->sru.data.abd != NULL)
 			abd_free(range->sru.data.abd);
 		if (range->sru.data.abuf != NULL) {
 			arc_buf_destroy(range->sru.data.abuf,
 			    &range->sru.data.abuf);
 		}
 		mutex_exit(&range->sru.data.lock);
 
 		cv_destroy(&range->sru.data.cv);
 		mutex_destroy(&range->sru.data.lock);
 	}
 	kmem_free(range, sizeof (*range));
 }
 
 /*
  * For all record types except BEGIN, fill in the checksum (overlaid in
  * drr_u.drr_checksum.drr_checksum).  The checksum verifies everything
  * up to the start of the checksum itself.
  */
 static int
 dump_record(dmu_send_cookie_t *dscp, void *payload, int payload_len)
 {
 	dmu_send_outparams_t *dso = dscp->dsc_dso;
 	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
 	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
 	(void) fletcher_4_incremental_native(dscp->dsc_drr,
 	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
 	    &dscp->dsc_zc);
 	if (dscp->dsc_drr->drr_type == DRR_BEGIN) {
 		dscp->dsc_sent_begin = B_TRUE;
 	} else {
 		ASSERT(ZIO_CHECKSUM_IS_ZERO(&dscp->dsc_drr->drr_u.
 		    drr_checksum.drr_checksum));
 		dscp->dsc_drr->drr_u.drr_checksum.drr_checksum = dscp->dsc_zc;
 	}
 	if (dscp->dsc_drr->drr_type == DRR_END) {
 		dscp->dsc_sent_end = B_TRUE;
 	}
 	(void) fletcher_4_incremental_native(&dscp->dsc_drr->
 	    drr_u.drr_checksum.drr_checksum,
 	    sizeof (zio_cksum_t), &dscp->dsc_zc);
 	*dscp->dsc_off += sizeof (dmu_replay_record_t);
 	dscp->dsc_err = dso->dso_outfunc(dscp->dsc_os, dscp->dsc_drr,
 	    sizeof (dmu_replay_record_t), dso->dso_arg);
 	if (dscp->dsc_err != 0)
 		return (SET_ERROR(EINTR));
 	if (payload_len != 0) {
 		*dscp->dsc_off += payload_len;
 		/*
 		 * payload is null when dso_dryrun == B_TRUE (i.e. when we're
 		 * doing a send size calculation)
 		 */
 		if (payload != NULL) {
 			(void) fletcher_4_incremental_native(
 			    payload, payload_len, &dscp->dsc_zc);
 		}
 
 		/*
 		 * The code does not rely on this (len being a multiple of 8).
 		 * We keep this assertion because of the corresponding assertion
 		 * in receive_read().  Keeping this assertion ensures that we do
 		 * not inadvertently break backwards compatibility (causing the
 		 * assertion in receive_read() to trigger on old software).
 		 *
 		 * Raw sends cannot be received on old software, and so can
 		 * bypass this assertion.
 		 */
 
 		ASSERT((payload_len % 8 == 0) ||
 		    (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW));
 
 		dscp->dsc_err = dso->dso_outfunc(dscp->dsc_os, payload,
 		    payload_len, dso->dso_arg);
 		if (dscp->dsc_err != 0)
 			return (SET_ERROR(EINTR));
 	}
 	return (0);
 }
 
 /*
  * Fill in the drr_free struct, or perform aggregation if the previous record is
  * also a free record, and the two are adjacent.
  *
  * Note that we send free records even for a full send, because we want to be
  * able to receive a full send as a clone, which requires a list of all the free
  * and freeobject records that were generated on the source.
  */
 static int
 dump_free(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset,
     uint64_t length)
 {
 	struct drr_free *drrf = &(dscp->dsc_drr->drr_u.drr_free);
 
 	/*
 	 * When we receive a free record, dbuf_free_range() assumes
 	 * that the receiving system doesn't have any dbufs in the range
 	 * being freed.  This is always true because there is a one-record
 	 * constraint: we only send one WRITE record for any given
 	 * object,offset.  We know that the one-record constraint is
 	 * true because we always send data in increasing order by
 	 * object,offset.
 	 *
 	 * If the increasing-order constraint ever changes, we should find
 	 * another way to assert that the one-record constraint is still
 	 * satisfied.
 	 */
 	ASSERT(object > dscp->dsc_last_data_object ||
 	    (object == dscp->dsc_last_data_object &&
 	    offset > dscp->dsc_last_data_offset));
 
 	/*
 	 * If there is a pending op, but it's not PENDING_FREE, push it out,
 	 * since free block aggregation can only be done for blocks of the
 	 * same type (i.e., DRR_FREE records can only be aggregated with
 	 * other DRR_FREE records.  DRR_FREEOBJECTS records can only be
 	 * aggregated with other DRR_FREEOBJECTS records).
 	 */
 	if (dscp->dsc_pending_op != PENDING_NONE &&
 	    dscp->dsc_pending_op != PENDING_FREE) {
 		if (dump_record(dscp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 		dscp->dsc_pending_op = PENDING_NONE;
 	}
 
 	if (dscp->dsc_pending_op == PENDING_FREE) {
 		/*
 		 * Check to see whether this free block can be aggregated
 		 * with pending one.
 		 */
 		if (drrf->drr_object == object && drrf->drr_offset +
 		    drrf->drr_length == offset) {
 			if (offset + length < offset || length == UINT64_MAX)
 				drrf->drr_length = UINT64_MAX;
 			else
 				drrf->drr_length += length;
 			return (0);
 		} else {
 			/* not a continuation.  Push out pending record */
 			if (dump_record(dscp, NULL, 0) != 0)
 				return (SET_ERROR(EINTR));
 			dscp->dsc_pending_op = PENDING_NONE;
 		}
 	}
 	/* create a FREE record and make it pending */
 	memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
 	dscp->dsc_drr->drr_type = DRR_FREE;
 	drrf->drr_object = object;
 	drrf->drr_offset = offset;
 	if (offset + length < offset)
 		drrf->drr_length = DMU_OBJECT_END;
 	else
 		drrf->drr_length = length;
 	drrf->drr_toguid = dscp->dsc_toguid;
 	if (length == DMU_OBJECT_END) {
 		if (dump_record(dscp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 	} else {
 		dscp->dsc_pending_op = PENDING_FREE;
 	}
 
 	return (0);
 }
 
 /*
  * Fill in the drr_redact struct, or perform aggregation if the previous record
  * is also a redaction record, and the two are adjacent.
  */
 static int
 dump_redact(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset,
     uint64_t length)
 {
 	struct drr_redact *drrr = &dscp->dsc_drr->drr_u.drr_redact;
 
 	/*
 	 * If there is a pending op, but it's not PENDING_REDACT, push it out,
 	 * since free block aggregation can only be done for blocks of the
 	 * same type (i.e., DRR_REDACT records can only be aggregated with
 	 * other DRR_REDACT records).
 	 */
 	if (dscp->dsc_pending_op != PENDING_NONE &&
 	    dscp->dsc_pending_op != PENDING_REDACT) {
 		if (dump_record(dscp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 		dscp->dsc_pending_op = PENDING_NONE;
 	}
 
 	if (dscp->dsc_pending_op == PENDING_REDACT) {
 		/*
 		 * Check to see whether this redacted block can be aggregated
 		 * with pending one.
 		 */
 		if (drrr->drr_object == object && drrr->drr_offset +
 		    drrr->drr_length == offset) {
 			drrr->drr_length += length;
 			return (0);
 		} else {
 			/* not a continuation.  Push out pending record */
 			if (dump_record(dscp, NULL, 0) != 0)
 				return (SET_ERROR(EINTR));
 			dscp->dsc_pending_op = PENDING_NONE;
 		}
 	}
 	/* create a REDACT record and make it pending */
 	memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
 	dscp->dsc_drr->drr_type = DRR_REDACT;
 	drrr->drr_object = object;
 	drrr->drr_offset = offset;
 	drrr->drr_length = length;
 	drrr->drr_toguid = dscp->dsc_toguid;
 	dscp->dsc_pending_op = PENDING_REDACT;
 
 	return (0);
 }
 
 static int
 dmu_dump_write(dmu_send_cookie_t *dscp, dmu_object_type_t type, uint64_t object,
     uint64_t offset, int lsize, int psize, const blkptr_t *bp,
     boolean_t io_compressed, void *data)
 {
 	uint64_t payload_size;
 	boolean_t raw = (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW);
 	struct drr_write *drrw = &(dscp->dsc_drr->drr_u.drr_write);
 
 	/*
 	 * We send data in increasing object, offset order.
 	 * See comment in dump_free() for details.
 	 */
 	ASSERT(object > dscp->dsc_last_data_object ||
 	    (object == dscp->dsc_last_data_object &&
 	    offset > dscp->dsc_last_data_offset));
 	dscp->dsc_last_data_object = object;
 	dscp->dsc_last_data_offset = offset + lsize - 1;
 
 	/*
 	 * If there is any kind of pending aggregation (currently either
 	 * a grouping of free objects or free blocks), push it out to
 	 * the stream, since aggregation can't be done across operations
 	 * of different types.
 	 */
 	if (dscp->dsc_pending_op != PENDING_NONE) {
 		if (dump_record(dscp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 		dscp->dsc_pending_op = PENDING_NONE;
 	}
 	/* write a WRITE record */
 	memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
 	dscp->dsc_drr->drr_type = DRR_WRITE;
 	drrw->drr_object = object;
 	drrw->drr_type = type;
 	drrw->drr_offset = offset;
 	drrw->drr_toguid = dscp->dsc_toguid;
 	drrw->drr_logical_size = lsize;
 
 	/* only set the compression fields if the buf is compressed or raw */
 	boolean_t compressed =
 	    (bp != NULL ? BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
 	    io_compressed : lsize != psize);
 	if (raw || compressed) {
 		ASSERT(bp != NULL);
 		ASSERT(raw || dscp->dsc_featureflags &
 		    DMU_BACKUP_FEATURE_COMPRESSED);
 		ASSERT(!BP_IS_EMBEDDED(bp));
 		ASSERT3S(psize, >, 0);
 
 		if (raw) {
 			ASSERT(BP_IS_PROTECTED(bp));
 
 			/*
 			 * This is a raw protected block so we need to pass
 			 * along everything the receiving side will need to
 			 * interpret this block, including the byteswap, salt,
 			 * IV, and MAC.
 			 */
 			if (BP_SHOULD_BYTESWAP(bp))
 				drrw->drr_flags |= DRR_RAW_BYTESWAP;
 			zio_crypt_decode_params_bp(bp, drrw->drr_salt,
 			    drrw->drr_iv);
 			zio_crypt_decode_mac_bp(bp, drrw->drr_mac);
 		} else {
 			/* this is a compressed block */
 			ASSERT(dscp->dsc_featureflags &
 			    DMU_BACKUP_FEATURE_COMPRESSED);
 			ASSERT(!BP_SHOULD_BYTESWAP(bp));
 			ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)));
 			ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF);
 			ASSERT3S(lsize, >=, psize);
 		}
 
 		/* set fields common to compressed and raw sends */
 		drrw->drr_compressiontype = BP_GET_COMPRESS(bp);
 		drrw->drr_compressed_size = psize;
 		payload_size = drrw->drr_compressed_size;
 	} else {
 		payload_size = drrw->drr_logical_size;
 	}
 
 	if (bp == NULL || BP_IS_EMBEDDED(bp) || (BP_IS_PROTECTED(bp) && !raw)) {
 		/*
 		 * There's no pre-computed checksum for partial-block writes,
 		 * embedded BP's, or encrypted BP's that are being sent as
 		 * plaintext, so (like fletcher4-checksummed blocks) userland
 		 * will have to compute a dedup-capable checksum itself.
 		 */
 		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
 	} else {
 		drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
 		if (zio_checksum_table[drrw->drr_checksumtype].ci_flags &
 		    ZCHECKSUM_FLAG_DEDUP)
 			drrw->drr_flags |= DRR_CHECKSUM_DEDUP;
 		DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
 		DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
 		DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
 		DDK_SET_CRYPT(&drrw->drr_key, BP_IS_PROTECTED(bp));
 		drrw->drr_key.ddk_cksum = bp->blk_cksum;
 	}
 
 	if (dump_record(dscp, data, payload_size) != 0)
 		return (SET_ERROR(EINTR));
 	return (0);
 }
 
 static int
 dump_write_embedded(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset,
     int blksz, const blkptr_t *bp)
 {
 	char buf[BPE_PAYLOAD_SIZE];
 	struct drr_write_embedded *drrw =
 	    &(dscp->dsc_drr->drr_u.drr_write_embedded);
 
 	if (dscp->dsc_pending_op != PENDING_NONE) {
 		if (dump_record(dscp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 		dscp->dsc_pending_op = PENDING_NONE;
 	}
 
 	ASSERT(BP_IS_EMBEDDED(bp));
 
 	memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
 	dscp->dsc_drr->drr_type = DRR_WRITE_EMBEDDED;
 	drrw->drr_object = object;
 	drrw->drr_offset = offset;
 	drrw->drr_length = blksz;
 	drrw->drr_toguid = dscp->dsc_toguid;
 	drrw->drr_compression = BP_GET_COMPRESS(bp);
 	drrw->drr_etype = BPE_GET_ETYPE(bp);
 	drrw->drr_lsize = BPE_GET_LSIZE(bp);
 	drrw->drr_psize = BPE_GET_PSIZE(bp);
 
 	decode_embedded_bp_compressed(bp, buf);
 
 	uint32_t psize = drrw->drr_psize;
 	uint32_t rsize = P2ROUNDUP(psize, 8);
 
 	if (psize != rsize)
 		memset(buf + psize, 0, rsize - psize);
 
 	if (dump_record(dscp, buf, rsize) != 0)
 		return (SET_ERROR(EINTR));
 	return (0);
 }
 
 static int
 dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
     void *data)
 {
 	struct drr_spill *drrs = &(dscp->dsc_drr->drr_u.drr_spill);
 	uint64_t blksz = BP_GET_LSIZE(bp);
 	uint64_t payload_size = blksz;
 
 	if (dscp->dsc_pending_op != PENDING_NONE) {
 		if (dump_record(dscp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 		dscp->dsc_pending_op = PENDING_NONE;
 	}
 
 	/* write a SPILL record */
 	memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
 	dscp->dsc_drr->drr_type = DRR_SPILL;
 	drrs->drr_object = object;
 	drrs->drr_length = blksz;
 	drrs->drr_toguid = dscp->dsc_toguid;
 
 	/* See comment in dump_dnode() for full details */
 	if (zfs_send_unmodified_spill_blocks &&
-	    (bp->blk_birth <= dscp->dsc_fromtxg)) {
+	    (BP_GET_LOGICAL_BIRTH(bp) <= dscp->dsc_fromtxg)) {
 		drrs->drr_flags |= DRR_SPILL_UNMODIFIED;
 	}
 
 	/* handle raw send fields */
 	if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) {
 		ASSERT(BP_IS_PROTECTED(bp));
 
 		if (BP_SHOULD_BYTESWAP(bp))
 			drrs->drr_flags |= DRR_RAW_BYTESWAP;
 		drrs->drr_compressiontype = BP_GET_COMPRESS(bp);
 		drrs->drr_compressed_size = BP_GET_PSIZE(bp);
 		zio_crypt_decode_params_bp(bp, drrs->drr_salt, drrs->drr_iv);
 		zio_crypt_decode_mac_bp(bp, drrs->drr_mac);
 		payload_size = drrs->drr_compressed_size;
 	}
 
 	if (dump_record(dscp, data, payload_size) != 0)
 		return (SET_ERROR(EINTR));
 	return (0);
 }
 
 static int
 dump_freeobjects(dmu_send_cookie_t *dscp, uint64_t firstobj, uint64_t numobjs)
 {
 	struct drr_freeobjects *drrfo = &(dscp->dsc_drr->drr_u.drr_freeobjects);
 	uint64_t maxobj = DNODES_PER_BLOCK *
 	    (DMU_META_DNODE(dscp->dsc_os)->dn_maxblkid + 1);
 
 	/*
 	 * ZoL < 0.7 does not handle large FREEOBJECTS records correctly,
 	 * leading to zfs recv never completing. to avoid this issue, don't
 	 * send FREEOBJECTS records for object IDs which cannot exist on the
 	 * receiving side.
 	 */
 	if (maxobj > 0) {
 		if (maxobj <= firstobj)
 			return (0);
 
 		if (maxobj < firstobj + numobjs)
 			numobjs = maxobj - firstobj;
 	}
 
 	/*
 	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
 	 * push it out, since free block aggregation can only be done for
 	 * blocks of the same type (i.e., DRR_FREE records can only be
 	 * aggregated with other DRR_FREE records.  DRR_FREEOBJECTS records
 	 * can only be aggregated with other DRR_FREEOBJECTS records).
 	 */
 	if (dscp->dsc_pending_op != PENDING_NONE &&
 	    dscp->dsc_pending_op != PENDING_FREEOBJECTS) {
 		if (dump_record(dscp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 		dscp->dsc_pending_op = PENDING_NONE;
 	}
 
 	if (dscp->dsc_pending_op == PENDING_FREEOBJECTS) {
 		/*
 		 * See whether this free object array can be aggregated
 		 * with pending one
 		 */
 		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
 			drrfo->drr_numobjs += numobjs;
 			return (0);
 		} else {
 			/* can't be aggregated.  Push out pending record */
 			if (dump_record(dscp, NULL, 0) != 0)
 				return (SET_ERROR(EINTR));
 			dscp->dsc_pending_op = PENDING_NONE;
 		}
 	}
 
 	/* write a FREEOBJECTS record */
 	memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
 	dscp->dsc_drr->drr_type = DRR_FREEOBJECTS;
 	drrfo->drr_firstobj = firstobj;
 	drrfo->drr_numobjs = numobjs;
 	drrfo->drr_toguid = dscp->dsc_toguid;
 
 	dscp->dsc_pending_op = PENDING_FREEOBJECTS;
 
 	return (0);
 }
 
 static int
 dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
     dnode_phys_t *dnp)
 {
 	struct drr_object *drro = &(dscp->dsc_drr->drr_u.drr_object);
 	int bonuslen;
 
 	if (object < dscp->dsc_resume_object) {
 		/*
 		 * Note: when resuming, we will visit all the dnodes in
 		 * the block of dnodes that we are resuming from.  In
 		 * this case it's unnecessary to send the dnodes prior to
 		 * the one we are resuming from.  We should be at most one
 		 * block's worth of dnodes behind the resume point.
 		 */
 		ASSERT3U(dscp->dsc_resume_object - object, <,
 		    1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT));
 		return (0);
 	}
 
 	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
 		return (dump_freeobjects(dscp, object, 1));
 
 	if (dscp->dsc_pending_op != PENDING_NONE) {
 		if (dump_record(dscp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 		dscp->dsc_pending_op = PENDING_NONE;
 	}
 
 	/* write an OBJECT record */
 	memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
 	dscp->dsc_drr->drr_type = DRR_OBJECT;
 	drro->drr_object = object;
 	drro->drr_type = dnp->dn_type;
 	drro->drr_bonustype = dnp->dn_bonustype;
 	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 	drro->drr_bonuslen = dnp->dn_bonuslen;
 	drro->drr_dn_slots = dnp->dn_extra_slots + 1;
 	drro->drr_checksumtype = dnp->dn_checksum;
 	drro->drr_compress = dnp->dn_compress;
 	drro->drr_toguid = dscp->dsc_toguid;
 
 	if (!(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
 	    drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
 		drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
 
 	bonuslen = P2ROUNDUP(dnp->dn_bonuslen, 8);
 
 	if ((dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)) {
 		ASSERT(BP_IS_ENCRYPTED(bp));
 
 		if (BP_SHOULD_BYTESWAP(bp))
 			drro->drr_flags |= DRR_RAW_BYTESWAP;
 
 		/* needed for reconstructing dnp on recv side */
 		drro->drr_maxblkid = dnp->dn_maxblkid;
 		drro->drr_indblkshift = dnp->dn_indblkshift;
 		drro->drr_nlevels = dnp->dn_nlevels;
 		drro->drr_nblkptr = dnp->dn_nblkptr;
 
 		/*
 		 * Since we encrypt the entire bonus area, the (raw) part
 		 * beyond the bonuslen is actually nonzero, so we need
 		 * to send it.
 		 */
 		if (bonuslen != 0) {
 			if (drro->drr_bonuslen > DN_MAX_BONUS_LEN(dnp))
 				return (SET_ERROR(EINVAL));
 			drro->drr_raw_bonuslen = DN_MAX_BONUS_LEN(dnp);
 			bonuslen = drro->drr_raw_bonuslen;
 		}
 	}
 
 	/*
 	 * DRR_OBJECT_SPILL is set for every dnode which references a
 	 * spill block.	 This allows the receiving pool to definitively
 	 * determine when a spill block should be kept or freed.
 	 */
 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
 		drro->drr_flags |= DRR_OBJECT_SPILL;
 
 	if (dump_record(dscp, DN_BONUS(dnp), bonuslen) != 0)
 		return (SET_ERROR(EINTR));
 
 	/* Free anything past the end of the file. */
 	if (dump_free(dscp, object, (dnp->dn_maxblkid + 1) *
 	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0)
 		return (SET_ERROR(EINTR));
 
 	/*
 	 * Send DRR_SPILL records for unmodified spill blocks.	This is useful
 	 * because changing certain attributes of the object (e.g. blocksize)
 	 * can cause old versions of ZFS to incorrectly remove a spill block.
 	 * Including these records in the stream forces an up to date version
 	 * to always be written ensuring they're never lost.  Current versions
 	 * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can
 	 * ignore these unmodified spill blocks.
 	 */
 	if (zfs_send_unmodified_spill_blocks &&
 	    (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
-	    (DN_SPILL_BLKPTR(dnp)->blk_birth <= dscp->dsc_fromtxg)) {
+	    (BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= dscp->dsc_fromtxg)) {
 		struct send_range record;
 		blkptr_t *bp = DN_SPILL_BLKPTR(dnp);
 
 		memset(&record, 0, sizeof (struct send_range));
 		record.type = DATA;
 		record.object = object;
 		record.eos_marker = B_FALSE;
 		record.start_blkid = DMU_SPILL_BLKID;
 		record.end_blkid = record.start_blkid + 1;
 		record.sru.data.bp = *bp;
 		record.sru.data.obj_type = dnp->dn_type;
 		record.sru.data.datablksz = BP_GET_LSIZE(bp);
 
 		if (do_dump(dscp, &record) != 0)
 			return (SET_ERROR(EINTR));
 	}
 
 	if (dscp->dsc_err != 0)
 		return (SET_ERROR(EINTR));
 
 	return (0);
 }
 
 static int
 dump_object_range(dmu_send_cookie_t *dscp, const blkptr_t *bp,
     uint64_t firstobj, uint64_t numslots)
 {
 	struct drr_object_range *drror =
 	    &(dscp->dsc_drr->drr_u.drr_object_range);
 
 	/* we only use this record type for raw sends */
 	ASSERT(BP_IS_PROTECTED(bp));
 	ASSERT(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW);
 	ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
 	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE);
 	ASSERT0(BP_GET_LEVEL(bp));
 
 	if (dscp->dsc_pending_op != PENDING_NONE) {
 		if (dump_record(dscp, NULL, 0) != 0)
 			return (SET_ERROR(EINTR));
 		dscp->dsc_pending_op = PENDING_NONE;
 	}
 
 	memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
 	dscp->dsc_drr->drr_type = DRR_OBJECT_RANGE;
 	drror->drr_firstobj = firstobj;
 	drror->drr_numslots = numslots;
 	drror->drr_toguid = dscp->dsc_toguid;
 	if (BP_SHOULD_BYTESWAP(bp))
 		drror->drr_flags |= DRR_RAW_BYTESWAP;
 	zio_crypt_decode_params_bp(bp, drror->drr_salt, drror->drr_iv);
 	zio_crypt_decode_mac_bp(bp, drror->drr_mac);
 
 	if (dump_record(dscp, NULL, 0) != 0)
 		return (SET_ERROR(EINTR));
 	return (0);
 }
 
 static boolean_t
 send_do_embed(const blkptr_t *bp, uint64_t featureflags)
 {
 	if (!BP_IS_EMBEDDED(bp))
 		return (B_FALSE);
 
 	/*
 	 * Compression function must be legacy, or explicitly enabled.
 	 */
 	if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
 	    !(featureflags & DMU_BACKUP_FEATURE_LZ4)))
 		return (B_FALSE);
 
 	/*
 	 * If we have not set the ZSTD feature flag, we can't send ZSTD
 	 * compressed embedded blocks, as the receiver may not support them.
 	 */
 	if ((BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD &&
 	    !(featureflags & DMU_BACKUP_FEATURE_ZSTD)))
 		return (B_FALSE);
 
 	/*
 	 * Embed type must be explicitly enabled.
 	 */
 	switch (BPE_GET_ETYPE(bp)) {
 	case BP_EMBEDDED_TYPE_DATA:
 		if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
 			return (B_TRUE);
 		break;
 	default:
 		return (B_FALSE);
 	}
 	return (B_FALSE);
 }
 
 /*
  * This function actually handles figuring out what kind of record needs to be
  * dumped, and calling the appropriate helper function.  In most cases,
  * the data has already been read by send_reader_thread().
  */
 static int
 do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
 {
 	int err = 0;
 	switch (range->type) {
 	case OBJECT:
 		err = dump_dnode(dscp, &range->sru.object.bp, range->object,
 		    range->sru.object.dnp);
 		return (err);
 	case OBJECT_RANGE: {
 		ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
 		if (!(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)) {
 			return (0);
 		}
 		uint64_t epb = BP_GET_LSIZE(&range->sru.object_range.bp) >>
 		    DNODE_SHIFT;
 		uint64_t firstobj = range->start_blkid * epb;
 		err = dump_object_range(dscp, &range->sru.object_range.bp,
 		    firstobj, epb);
 		break;
 	}
 	case REDACT: {
 		struct srr *srrp = &range->sru.redact;
 		err = dump_redact(dscp, range->object, range->start_blkid *
 		    srrp->datablksz, (range->end_blkid - range->start_blkid) *
 		    srrp->datablksz);
 		return (err);
 	}
 	case DATA: {
 		struct srd *srdp = &range->sru.data;
 		blkptr_t *bp = &srdp->bp;
 		spa_t *spa =
 		    dmu_objset_spa(dscp->dsc_os);
 
 		ASSERT3U(srdp->datablksz, ==, BP_GET_LSIZE(bp));
 		ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
 		if (BP_GET_TYPE(bp) == DMU_OT_SA) {
 			arc_flags_t aflags = ARC_FLAG_WAIT;
 			zio_flag_t zioflags = ZIO_FLAG_CANFAIL;
 
 			if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) {
 				ASSERT(BP_IS_PROTECTED(bp));
 				zioflags |= ZIO_FLAG_RAW;
 			}
 
 			zbookmark_phys_t zb;
 			ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID);
 			zb.zb_objset = dmu_objset_id(dscp->dsc_os);
 			zb.zb_object = range->object;
 			zb.zb_level = 0;
 			zb.zb_blkid = range->start_blkid;
 
 			arc_buf_t *abuf = NULL;
 			if (!dscp->dsc_dso->dso_dryrun && arc_read(NULL, spa,
 			    bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
 			    zioflags, &aflags, &zb) != 0)
 				return (SET_ERROR(EIO));
 
 			err = dump_spill(dscp, bp, zb.zb_object,
 			    (abuf == NULL ? NULL : abuf->b_data));
 			if (abuf != NULL)
 				arc_buf_destroy(abuf, &abuf);
 			return (err);
 		}
 		if (send_do_embed(bp, dscp->dsc_featureflags)) {
 			err = dump_write_embedded(dscp, range->object,
 			    range->start_blkid * srdp->datablksz,
 			    srdp->datablksz, bp);
 			return (err);
 		}
 		ASSERT(range->object > dscp->dsc_resume_object ||
 		    (range->object == dscp->dsc_resume_object &&
 		    range->start_blkid * srdp->datablksz >=
 		    dscp->dsc_resume_offset));
 		/* it's a level-0 block of a regular object */
 
 		mutex_enter(&srdp->lock);
 		while (srdp->io_outstanding)
 			cv_wait(&srdp->cv, &srdp->lock);
 		err = srdp->io_err;
 		mutex_exit(&srdp->lock);
 
 		if (err != 0) {
 			if (zfs_send_corrupt_data &&
 			    !dscp->dsc_dso->dso_dryrun) {
 				/*
 				 * Send a block filled with 0x"zfs badd bloc"
 				 */
 				srdp->abuf = arc_alloc_buf(spa, &srdp->abuf,
 				    ARC_BUFC_DATA, srdp->datablksz);
 				uint64_t *ptr;
 				for (ptr = srdp->abuf->b_data;
 				    (char *)ptr < (char *)srdp->abuf->b_data +
 				    srdp->datablksz; ptr++)
 					*ptr = 0x2f5baddb10cULL;
 			} else {
 				return (SET_ERROR(EIO));
 			}
 		}
 
 		ASSERT(dscp->dsc_dso->dso_dryrun ||
 		    srdp->abuf != NULL || srdp->abd != NULL);
 
 		uint64_t offset = range->start_blkid * srdp->datablksz;
 
 		char *data = NULL;
 		if (srdp->abd != NULL) {
 			data = abd_to_buf(srdp->abd);
 			ASSERT3P(srdp->abuf, ==, NULL);
 		} else if (srdp->abuf != NULL) {
 			data = srdp->abuf->b_data;
 		}
 
 		/*
 		 * If we have large blocks stored on disk but the send flags
 		 * don't allow us to send large blocks, we split the data from
 		 * the arc buf into chunks.
 		 */
 		if (srdp->datablksz > SPA_OLD_MAXBLOCKSIZE &&
 		    !(dscp->dsc_featureflags &
 		    DMU_BACKUP_FEATURE_LARGE_BLOCKS)) {
 			while (srdp->datablksz > 0 && err == 0) {
 				int n = MIN(srdp->datablksz,
 				    SPA_OLD_MAXBLOCKSIZE);
 				err = dmu_dump_write(dscp, srdp->obj_type,
 				    range->object, offset, n, n, NULL, B_FALSE,
 				    data);
 				offset += n;
 				/*
 				 * When doing dry run, data==NULL is used as a
 				 * sentinel value by
 				 * dmu_dump_write()->dump_record().
 				 */
 				if (data != NULL)
 					data += n;
 				srdp->datablksz -= n;
 			}
 		} else {
 			err = dmu_dump_write(dscp, srdp->obj_type,
 			    range->object, offset,
 			    srdp->datablksz, srdp->datasz, bp,
 			    srdp->io_compressed, data);
 		}
 		return (err);
 	}
 	case HOLE: {
 		struct srh *srhp = &range->sru.hole;
 		if (range->object == DMU_META_DNODE_OBJECT) {
 			uint32_t span = srhp->datablksz >> DNODE_SHIFT;
 			uint64_t first_obj = range->start_blkid * span;
 			uint64_t numobj = range->end_blkid * span - first_obj;
 			return (dump_freeobjects(dscp, first_obj, numobj));
 		}
 		uint64_t offset = 0;
 
 		/*
 		 * If this multiply overflows, we don't need to send this block.
 		 * Even if it has a birth time, it can never not be a hole, so
 		 * we don't need to send records for it.
 		 */
 		if (!overflow_multiply(range->start_blkid, srhp->datablksz,
 		    &offset)) {
 			return (0);
 		}
 		uint64_t len = 0;
 
 		if (!overflow_multiply(range->end_blkid, srhp->datablksz, &len))
 			len = UINT64_MAX;
 		len = len - offset;
 		return (dump_free(dscp, range->object, offset, len));
 	}
 	default:
 		panic("Invalid range type in do_dump: %d", range->type);
 	}
 	return (err);
 }
 
 static struct send_range *
 range_alloc(enum type type, uint64_t object, uint64_t start_blkid,
     uint64_t end_blkid, boolean_t eos)
 {
 	struct send_range *range = kmem_alloc(sizeof (*range), KM_SLEEP);
 	range->type = type;
 	range->object = object;
 	range->start_blkid = start_blkid;
 	range->end_blkid = end_blkid;
 	range->eos_marker = eos;
 	if (type == DATA) {
 		range->sru.data.abd = NULL;
 		range->sru.data.abuf = NULL;
 		mutex_init(&range->sru.data.lock, NULL, MUTEX_DEFAULT, NULL);
 		cv_init(&range->sru.data.cv, NULL, CV_DEFAULT, NULL);
 		range->sru.data.io_outstanding = 0;
 		range->sru.data.io_err = 0;
 		range->sru.data.io_compressed = B_FALSE;
 	}
 	return (range);
 }
 
 /*
  * This is the callback function to traverse_dataset that acts as a worker
  * thread for dmu_send_impl.
  */
 static int
 send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
 {
 	(void) zilog;
 	struct send_thread_arg *sta = arg;
 	struct send_range *record;
 
 	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
 	    zb->zb_object >= sta->resume.zb_object);
 
 	/*
 	 * All bps of an encrypted os should have the encryption bit set.
 	 * If this is not true it indicates tampering and we report an error.
 	 */
 	if (sta->os->os_encrypted &&
 	    !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
-		spa_log_error(spa, zb, &bp->blk_birth);
+		spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
 		return (SET_ERROR(EIO));
 	}
 
 	if (sta->cancel)
 		return (SET_ERROR(EINTR));
 	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
 	    DMU_OBJECT_IS_SPECIAL(zb->zb_object))
 		return (0);
 	atomic_inc_64(sta->num_blocks_visited);
 
 	if (zb->zb_level == ZB_DNODE_LEVEL) {
 		if (zb->zb_object == DMU_META_DNODE_OBJECT)
 			return (0);
 		record = range_alloc(OBJECT, zb->zb_object, 0, 0, B_FALSE);
 		record->sru.object.bp = *bp;
 		size_t size  = sizeof (*dnp) * (dnp->dn_extra_slots + 1);
 		record->sru.object.dnp = kmem_alloc(size, KM_SLEEP);
 		memcpy(record->sru.object.dnp, dnp, size);
 		bqueue_enqueue(&sta->q, record, sizeof (*record));
 		return (0);
 	}
 	if (zb->zb_level == 0 && zb->zb_object == DMU_META_DNODE_OBJECT &&
 	    !BP_IS_HOLE(bp)) {
 		record = range_alloc(OBJECT_RANGE, 0, zb->zb_blkid,
 		    zb->zb_blkid + 1, B_FALSE);
 		record->sru.object_range.bp = *bp;
 		bqueue_enqueue(&sta->q, record, sizeof (*record));
 		return (0);
 	}
 	if (zb->zb_level < 0 || (zb->zb_level > 0 && !BP_IS_HOLE(bp)))
 		return (0);
 	if (zb->zb_object == DMU_META_DNODE_OBJECT && !BP_IS_HOLE(bp))
 		return (0);
 
 	uint64_t span = bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level);
 	uint64_t start;
 
 	/*
 	 * If this multiply overflows, we don't need to send this block.
 	 * Even if it has a birth time, it can never not be a hole, so
 	 * we don't need to send records for it.
 	 */
 	if (!overflow_multiply(span, zb->zb_blkid, &start) || (!(zb->zb_blkid ==
 	    DMU_SPILL_BLKID || DMU_OT_IS_METADATA(dnp->dn_type)) &&
 	    span * zb->zb_blkid > dnp->dn_maxblkid)) {
 		ASSERT(BP_IS_HOLE(bp));
 		return (0);
 	}
 
 	if (zb->zb_blkid == DMU_SPILL_BLKID)
 		ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_SA);
 
 	enum type record_type = DATA;
 	if (BP_IS_HOLE(bp))
 		record_type = HOLE;
 	else if (BP_IS_REDACTED(bp))
 		record_type = REDACT;
 	else
 		record_type = DATA;
 
 	record = range_alloc(record_type, zb->zb_object, start,
 	    (start + span < start ? 0 : start + span), B_FALSE);
 
 	uint64_t datablksz = (zb->zb_blkid == DMU_SPILL_BLKID ?
 	    BP_GET_LSIZE(bp) : dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 
 	if (BP_IS_HOLE(bp)) {
 		record->sru.hole.datablksz = datablksz;
 	} else if (BP_IS_REDACTED(bp)) {
 		record->sru.redact.datablksz = datablksz;
 	} else {
 		record->sru.data.datablksz = datablksz;
 		record->sru.data.obj_type = dnp->dn_type;
 		record->sru.data.bp = *bp;
 	}
 
 	bqueue_enqueue(&sta->q, record, sizeof (*record));
 	return (0);
 }
 
 struct redact_list_cb_arg {
 	uint64_t *num_blocks_visited;
 	bqueue_t *q;
 	boolean_t *cancel;
 	boolean_t mark_redact;
 };
 
 static int
 redact_list_cb(redact_block_phys_t *rb, void *arg)
 {
 	struct redact_list_cb_arg *rlcap = arg;
 
 	atomic_inc_64(rlcap->num_blocks_visited);
 	if (*rlcap->cancel)
 		return (-1);
 
 	struct send_range *data = range_alloc(REDACT, rb->rbp_object,
 	    rb->rbp_blkid, rb->rbp_blkid + redact_block_get_count(rb), B_FALSE);
 	ASSERT3U(data->end_blkid, >, rb->rbp_blkid);
 	if (rlcap->mark_redact) {
 		data->type = REDACT;
 		data->sru.redact.datablksz = redact_block_get_size(rb);
 	} else {
 		data->type = PREVIOUSLY_REDACTED;
 	}
 	bqueue_enqueue(rlcap->q, data, sizeof (*data));
 
 	return (0);
 }
 
 /*
  * This function kicks off the traverse_dataset.  It also handles setting the
  * error code of the thread in case something goes wrong, and pushes the End of
  * Stream record when the traverse_dataset call has finished.
  */
 static __attribute__((noreturn)) void
 send_traverse_thread(void *arg)
 {
 	struct send_thread_arg *st_arg = arg;
 	int err = 0;
 	struct send_range *data;
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	err = traverse_dataset_resume(st_arg->os->os_dsl_dataset,
 	    st_arg->fromtxg, &st_arg->resume,
 	    st_arg->flags, send_cb, st_arg);
 
 	if (err != EINTR)
 		st_arg->error_code = err;
 	data = range_alloc(DATA, 0, 0, 0, B_TRUE);
 	bqueue_enqueue_flush(&st_arg->q, data, sizeof (*data));
 	spl_fstrans_unmark(cookie);
 	thread_exit();
 }
 
 /*
  * Utility function that causes End of Stream records to compare after of all
  * others, so that other threads' comparison logic can stay simple.
  */
 static int __attribute__((unused))
 send_range_after(const struct send_range *from, const struct send_range *to)
 {
 	if (from->eos_marker == B_TRUE)
 		return (1);
 	if (to->eos_marker == B_TRUE)
 		return (-1);
 
 	uint64_t from_obj = from->object;
 	uint64_t from_end_obj = from->object + 1;
 	uint64_t to_obj = to->object;
 	uint64_t to_end_obj = to->object + 1;
 	if (from_obj == 0) {
 		ASSERT(from->type == HOLE || from->type == OBJECT_RANGE);
 		from_obj = from->start_blkid << DNODES_PER_BLOCK_SHIFT;
 		from_end_obj = from->end_blkid << DNODES_PER_BLOCK_SHIFT;
 	}
 	if (to_obj == 0) {
 		ASSERT(to->type == HOLE || to->type == OBJECT_RANGE);
 		to_obj = to->start_blkid << DNODES_PER_BLOCK_SHIFT;
 		to_end_obj = to->end_blkid << DNODES_PER_BLOCK_SHIFT;
 	}
 
 	if (from_end_obj <= to_obj)
 		return (-1);
 	if (from_obj >= to_end_obj)
 		return (1);
 	int64_t cmp = TREE_CMP(to->type == OBJECT_RANGE, from->type ==
 	    OBJECT_RANGE);
 	if (unlikely(cmp))
 		return (cmp);
 	cmp = TREE_CMP(to->type == OBJECT, from->type == OBJECT);
 	if (unlikely(cmp))
 		return (cmp);
 	if (from->end_blkid <= to->start_blkid)
 		return (-1);
 	if (from->start_blkid >= to->end_blkid)
 		return (1);
 	return (0);
 }
 
 /*
  * Pop the new data off the queue, check that the records we receive are in
  * the right order, but do not free the old data.  This is used so that the
  * records can be sent on to the main thread without copying the data.
  */
 static struct send_range *
 get_next_range_nofree(bqueue_t *bq, struct send_range *prev)
 {
 	struct send_range *next = bqueue_dequeue(bq);
 	ASSERT3S(send_range_after(prev, next), ==, -1);
 	return (next);
 }
 
 /*
  * Pop the new data off the queue, check that the records we receive are in
  * the right order, and free the old data.
  */
 static struct send_range *
 get_next_range(bqueue_t *bq, struct send_range *prev)
 {
 	struct send_range *next = get_next_range_nofree(bq, prev);
 	range_free(prev);
 	return (next);
 }
 
 static __attribute__((noreturn)) void
 redact_list_thread(void *arg)
 {
 	struct redact_list_thread_arg *rlt_arg = arg;
 	struct send_range *record;
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 	if (rlt_arg->rl != NULL) {
 		struct redact_list_cb_arg rlcba = {0};
 		rlcba.cancel = &rlt_arg->cancel;
 		rlcba.q = &rlt_arg->q;
 		rlcba.num_blocks_visited = rlt_arg->num_blocks_visited;
 		rlcba.mark_redact = rlt_arg->mark_redact;
 		int err = dsl_redaction_list_traverse(rlt_arg->rl,
 		    &rlt_arg->resume, redact_list_cb, &rlcba);
 		if (err != EINTR)
 			rlt_arg->error_code = err;
 	}
 	record = range_alloc(DATA, 0, 0, 0, B_TRUE);
 	bqueue_enqueue_flush(&rlt_arg->q, record, sizeof (*record));
 	spl_fstrans_unmark(cookie);
 
 	thread_exit();
 }
 
 /*
  * Compare the start point of the two provided ranges. End of stream ranges
  * compare last, objects compare before any data or hole inside that object and
  * multi-object holes that start at the same object.
  */
 static int
 send_range_start_compare(struct send_range *r1, struct send_range *r2)
 {
 	uint64_t r1_objequiv = r1->object;
 	uint64_t r1_l0equiv = r1->start_blkid;
 	uint64_t r2_objequiv = r2->object;
 	uint64_t r2_l0equiv = r2->start_blkid;
 	int64_t cmp = TREE_CMP(r1->eos_marker, r2->eos_marker);
 	if (unlikely(cmp))
 		return (cmp);
 	if (r1->object == 0) {
 		r1_objequiv = r1->start_blkid * DNODES_PER_BLOCK;
 		r1_l0equiv = 0;
 	}
 	if (r2->object == 0) {
 		r2_objequiv = r2->start_blkid * DNODES_PER_BLOCK;
 		r2_l0equiv = 0;
 	}
 
 	cmp = TREE_CMP(r1_objequiv, r2_objequiv);
 	if (likely(cmp))
 		return (cmp);
 	cmp = TREE_CMP(r2->type == OBJECT_RANGE, r1->type == OBJECT_RANGE);
 	if (unlikely(cmp))
 		return (cmp);
 	cmp = TREE_CMP(r2->type == OBJECT, r1->type == OBJECT);
 	if (unlikely(cmp))
 		return (cmp);
 
 	return (TREE_CMP(r1_l0equiv, r2_l0equiv));
 }
 
 enum q_idx {
 	REDACT_IDX = 0,
 	TO_IDX,
 	FROM_IDX,
 	NUM_THREADS
 };
 
 /*
  * This function returns the next range the send_merge_thread should operate on.
  * The inputs are two arrays; the first one stores the range at the front of the
  * queues stored in the second one.  The ranges are sorted in descending
  * priority order; the metadata from earlier ranges overrules metadata from
  * later ranges.  out_mask is used to return which threads the ranges came from;
  * bit i is set if ranges[i] started at the same place as the returned range.
  *
  * This code is not hardcoded to compare a specific number of threads; it could
  * be used with any number, just by changing the q_idx enum.
  *
  * The "next range" is the one with the earliest start; if two starts are equal,
  * the highest-priority range is the next to operate on.  If a higher-priority
  * range starts in the middle of the first range, then the first range will be
  * truncated to end where the higher-priority range starts, and we will operate
  * on that one next time.   In this way, we make sure that each block covered by
  * some range gets covered by a returned range, and each block covered is
  * returned using the metadata of the highest-priority range it appears in.
  *
  * For example, if the three ranges at the front of the queues were [2,4),
  * [3,5), and [1,3), then the ranges returned would be [1,2) with the metadata
  * from the third range, [2,4) with the metadata from the first range, and then
  * [4,5) with the metadata from the second.
  */
 static struct send_range *
 find_next_range(struct send_range **ranges, bqueue_t **qs, uint64_t *out_mask)
 {
 	int idx = 0; // index of the range with the earliest start
 	int i;
 	uint64_t bmask = 0;
 	for (i = 1; i < NUM_THREADS; i++) {
 		if (send_range_start_compare(ranges[i], ranges[idx]) < 0)
 			idx = i;
 	}
 	if (ranges[idx]->eos_marker) {
 		struct send_range *ret = range_alloc(DATA, 0, 0, 0, B_TRUE);
 		*out_mask = 0;
 		return (ret);
 	}
 	/*
 	 * Find all the ranges that start at that same point.
 	 */
 	for (i = 0; i < NUM_THREADS; i++) {
 		if (send_range_start_compare(ranges[i], ranges[idx]) == 0)
 			bmask |= 1 << i;
 	}
 	*out_mask = bmask;
 	/*
 	 * OBJECT_RANGE records only come from the TO thread, and should always
 	 * be treated as overlapping with nothing and sent on immediately.  They
 	 * are only used in raw sends, and are never redacted.
 	 */
 	if (ranges[idx]->type == OBJECT_RANGE) {
 		ASSERT3U(idx, ==, TO_IDX);
 		ASSERT3U(*out_mask, ==, 1 << TO_IDX);
 		struct send_range *ret = ranges[idx];
 		ranges[idx] = get_next_range_nofree(qs[idx], ranges[idx]);
 		return (ret);
 	}
 	/*
 	 * Find the first start or end point after the start of the first range.
 	 */
 	uint64_t first_change = ranges[idx]->end_blkid;
 	for (i = 0; i < NUM_THREADS; i++) {
 		if (i == idx || ranges[i]->eos_marker ||
 		    ranges[i]->object > ranges[idx]->object ||
 		    ranges[i]->object == DMU_META_DNODE_OBJECT)
 			continue;
 		ASSERT3U(ranges[i]->object, ==, ranges[idx]->object);
 		if (first_change > ranges[i]->start_blkid &&
 		    (bmask & (1 << i)) == 0)
 			first_change = ranges[i]->start_blkid;
 		else if (first_change > ranges[i]->end_blkid)
 			first_change = ranges[i]->end_blkid;
 	}
 	/*
 	 * Update all ranges to no longer overlap with the range we're
 	 * returning. All such ranges must start at the same place as the range
 	 * being returned, and end at or after first_change. Thus we update
 	 * their start to first_change. If that makes them size 0, then free
 	 * them and pull a new range from that thread.
 	 */
 	for (i = 0; i < NUM_THREADS; i++) {
 		if (i == idx || (bmask & (1 << i)) == 0)
 			continue;
 		ASSERT3U(first_change, >, ranges[i]->start_blkid);
 		ranges[i]->start_blkid = first_change;
 		ASSERT3U(ranges[i]->start_blkid, <=, ranges[i]->end_blkid);
 		if (ranges[i]->start_blkid == ranges[i]->end_blkid)
 			ranges[i] = get_next_range(qs[i], ranges[i]);
 	}
 	/*
 	 * Short-circuit the simple case; if the range doesn't overlap with
 	 * anything else, or it only overlaps with things that start at the same
 	 * place and are longer, send it on.
 	 */
 	if (first_change == ranges[idx]->end_blkid) {
 		struct send_range *ret = ranges[idx];
 		ranges[idx] = get_next_range_nofree(qs[idx], ranges[idx]);
 		return (ret);
 	}
 
 	/*
 	 * Otherwise, return a truncated copy of ranges[idx] and move the start
 	 * of ranges[idx] back to first_change.
 	 */
 	struct send_range *ret = kmem_alloc(sizeof (*ret), KM_SLEEP);
 	*ret = *ranges[idx];
 	ret->end_blkid = first_change;
 	ranges[idx]->start_blkid = first_change;
 	return (ret);
 }
 
 #define	FROM_AND_REDACT_BITS ((1 << REDACT_IDX) | (1 << FROM_IDX))
 
 /*
  * Merge the results from the from thread and the to thread, and then hand the
  * records off to send_prefetch_thread to prefetch them.  If this is not a
  * send from a redaction bookmark, the from thread will push an end of stream
  * record and stop, and we'll just send everything that was changed in the
  * to_ds since the ancestor's creation txg. If it is, then since
  * traverse_dataset has a canonical order, we can compare each change as
  * they're pulled off the queues.  That will give us a stream that is
  * appropriately sorted, and covers all records.  In addition, we pull the
  * data from the redact_list_thread and use that to determine which blocks
  * should be redacted.
  */
 static __attribute__((noreturn)) void
 send_merge_thread(void *arg)
 {
 	struct send_merge_thread_arg *smt_arg = arg;
 	struct send_range *front_ranges[NUM_THREADS];
 	bqueue_t *queues[NUM_THREADS];
 	int err = 0;
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	if (smt_arg->redact_arg == NULL) {
 		front_ranges[REDACT_IDX] =
 		    kmem_zalloc(sizeof (struct send_range), KM_SLEEP);
 		front_ranges[REDACT_IDX]->eos_marker = B_TRUE;
 		front_ranges[REDACT_IDX]->type = REDACT;
 		queues[REDACT_IDX] = NULL;
 	} else {
 		front_ranges[REDACT_IDX] =
 		    bqueue_dequeue(&smt_arg->redact_arg->q);
 		queues[REDACT_IDX] = &smt_arg->redact_arg->q;
 	}
 	front_ranges[TO_IDX] = bqueue_dequeue(&smt_arg->to_arg->q);
 	queues[TO_IDX] = &smt_arg->to_arg->q;
 	front_ranges[FROM_IDX] = bqueue_dequeue(&smt_arg->from_arg->q);
 	queues[FROM_IDX] = &smt_arg->from_arg->q;
 	uint64_t mask = 0;
 	struct send_range *range;
 	for (range = find_next_range(front_ranges, queues, &mask);
 	    !range->eos_marker && err == 0 && !smt_arg->cancel;
 	    range = find_next_range(front_ranges, queues, &mask)) {
 		/*
 		 * If the range in question was in both the from redact bookmark
 		 * and the bookmark we're using to redact, then don't send it.
 		 * It's already redacted on the receiving system, so a redaction
 		 * record would be redundant.
 		 */
 		if ((mask & FROM_AND_REDACT_BITS) == FROM_AND_REDACT_BITS) {
 			ASSERT3U(range->type, ==, REDACT);
 			range_free(range);
 			continue;
 		}
 		bqueue_enqueue(&smt_arg->q, range, sizeof (*range));
 
 		if (smt_arg->to_arg->error_code != 0) {
 			err = smt_arg->to_arg->error_code;
 		} else if (smt_arg->from_arg->error_code != 0) {
 			err = smt_arg->from_arg->error_code;
 		} else if (smt_arg->redact_arg != NULL &&
 		    smt_arg->redact_arg->error_code != 0) {
 			err = smt_arg->redact_arg->error_code;
 		}
 	}
 	if (smt_arg->cancel && err == 0)
 		err = SET_ERROR(EINTR);
 	smt_arg->error = err;
 	if (smt_arg->error != 0) {
 		smt_arg->to_arg->cancel = B_TRUE;
 		smt_arg->from_arg->cancel = B_TRUE;
 		if (smt_arg->redact_arg != NULL)
 			smt_arg->redact_arg->cancel = B_TRUE;
 	}
 	for (int i = 0; i < NUM_THREADS; i++) {
 		while (!front_ranges[i]->eos_marker) {
 			front_ranges[i] = get_next_range(queues[i],
 			    front_ranges[i]);
 		}
 		range_free(front_ranges[i]);
 	}
 	range->eos_marker = B_TRUE;
 	bqueue_enqueue_flush(&smt_arg->q, range, 1);
 	spl_fstrans_unmark(cookie);
 	thread_exit();
 }
 
 struct send_reader_thread_arg {
 	struct send_merge_thread_arg *smta;
 	bqueue_t q;
 	boolean_t cancel;
 	boolean_t issue_reads;
 	uint64_t featureflags;
 	int error;
 };
 
 static void
 dmu_send_read_done(zio_t *zio)
 {
 	struct send_range *range = zio->io_private;
 
 	mutex_enter(&range->sru.data.lock);
 	if (zio->io_error != 0) {
 		abd_free(range->sru.data.abd);
 		range->sru.data.abd = NULL;
 		range->sru.data.io_err = zio->io_error;
 	}
 
 	ASSERT(range->sru.data.io_outstanding);
 	range->sru.data.io_outstanding = B_FALSE;
 	cv_broadcast(&range->sru.data.cv);
 	mutex_exit(&range->sru.data.lock);
 }
 
 static void
 issue_data_read(struct send_reader_thread_arg *srta, struct send_range *range)
 {
 	struct srd *srdp = &range->sru.data;
 	blkptr_t *bp = &srdp->bp;
 	objset_t *os = srta->smta->os;
 
 	ASSERT3U(range->type, ==, DATA);
 	ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
 	/*
 	 * If we have large blocks stored on disk but
 	 * the send flags don't allow us to send large
 	 * blocks, we split the data from the arc buf
 	 * into chunks.
 	 */
 	boolean_t split_large_blocks =
 	    srdp->datablksz > SPA_OLD_MAXBLOCKSIZE &&
 	    !(srta->featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS);
 	/*
 	 * We should only request compressed data from the ARC if all
 	 * the following are true:
 	 *  - stream compression was requested
 	 *  - we aren't splitting large blocks into smaller chunks
 	 *  - the data won't need to be byteswapped before sending
 	 *  - this isn't an embedded block
 	 *  - this isn't metadata (if receiving on a different endian
 	 *    system it can be byteswapped more easily)
 	 */
 	boolean_t request_compressed =
 	    (srta->featureflags & DMU_BACKUP_FEATURE_COMPRESSED) &&
 	    !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) &&
 	    !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp));
 
 	zio_flag_t zioflags = ZIO_FLAG_CANFAIL;
 
 	if (srta->featureflags & DMU_BACKUP_FEATURE_RAW) {
 		zioflags |= ZIO_FLAG_RAW;
 		srdp->io_compressed = B_TRUE;
 	} else if (request_compressed) {
 		zioflags |= ZIO_FLAG_RAW_COMPRESS;
 		srdp->io_compressed = B_TRUE;
 	}
 
 	srdp->datasz = (zioflags & ZIO_FLAG_RAW_COMPRESS) ?
 	    BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp);
 
 	if (!srta->issue_reads)
 		return;
 	if (BP_IS_REDACTED(bp))
 		return;
 	if (send_do_embed(bp, srta->featureflags))
 		return;
 
 	zbookmark_phys_t zb = {
 	    .zb_objset = dmu_objset_id(os),
 	    .zb_object = range->object,
 	    .zb_level = 0,
 	    .zb_blkid = range->start_blkid,
 	};
 
 	arc_flags_t aflags = ARC_FLAG_CACHED_ONLY;
 
 	int arc_err = arc_read(NULL, os->os_spa, bp,
 	    arc_getbuf_func, &srdp->abuf, ZIO_PRIORITY_ASYNC_READ,
 	    zioflags, &aflags, &zb);
 	/*
 	 * If the data is not already cached in the ARC, we read directly
 	 * from zio.  This avoids the performance overhead of adding a new
 	 * entry to the ARC, and we also avoid polluting the ARC cache with
 	 * data that is not likely to be used in the future.
 	 */
 	if (arc_err != 0) {
 		srdp->abd = abd_alloc_linear(srdp->datasz, B_FALSE);
 		srdp->io_outstanding = B_TRUE;
 		zio_nowait(zio_read(NULL, os->os_spa, bp, srdp->abd,
 		    srdp->datasz, dmu_send_read_done, range,
 		    ZIO_PRIORITY_ASYNC_READ, zioflags, &zb));
 	}
 }
 
 /*
  * Create a new record with the given values.
  */
 static void
 enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn,
     uint64_t blkid, uint64_t count, const blkptr_t *bp, uint32_t datablksz)
 {
 	enum type range_type = (bp == NULL || BP_IS_HOLE(bp) ? HOLE :
 	    (BP_IS_REDACTED(bp) ? REDACT : DATA));
 
 	struct send_range *range = range_alloc(range_type, dn->dn_object,
 	    blkid, blkid + count, B_FALSE);
 
 	if (blkid == DMU_SPILL_BLKID) {
 		ASSERT3P(bp, !=, NULL);
 		ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_SA);
 	}
 
 	switch (range_type) {
 	case HOLE:
 		range->sru.hole.datablksz = datablksz;
 		break;
 	case DATA:
 		ASSERT3U(count, ==, 1);
 		range->sru.data.datablksz = datablksz;
 		range->sru.data.obj_type = dn->dn_type;
 		range->sru.data.bp = *bp;
 		issue_data_read(srta, range);
 		break;
 	case REDACT:
 		range->sru.redact.datablksz = datablksz;
 		break;
 	default:
 		break;
 	}
 	bqueue_enqueue(q, range, datablksz);
 }
 
 /*
  * This thread is responsible for two things: First, it retrieves the correct
  * blkptr in the to ds if we need to send the data because of something from
  * the from thread.  As a result of this, we're the first ones to discover that
  * some indirect blocks can be discarded because they're not holes. Second,
  * it issues prefetches for the data we need to send.
  */
 static __attribute__((noreturn)) void
 send_reader_thread(void *arg)
 {
 	struct send_reader_thread_arg *srta = arg;
 	struct send_merge_thread_arg *smta = srta->smta;
 	bqueue_t *inq = &smta->q;
 	bqueue_t *outq = &srta->q;
 	objset_t *os = smta->os;
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 	struct send_range *range = bqueue_dequeue(inq);
 	int err = 0;
 
 	/*
 	 * If the record we're analyzing is from a redaction bookmark from the
 	 * fromds, then we need to know whether or not it exists in the tods so
 	 * we know whether to create records for it or not. If it does, we need
 	 * the datablksz so we can generate an appropriate record for it.
 	 * Finally, if it isn't redacted, we need the blkptr so that we can send
 	 * a WRITE record containing the actual data.
 	 */
 	uint64_t last_obj = UINT64_MAX;
 	uint64_t last_obj_exists = B_TRUE;
 	while (!range->eos_marker && !srta->cancel && smta->error == 0 &&
 	    err == 0) {
 		switch (range->type) {
 		case DATA:
 			issue_data_read(srta, range);
 			bqueue_enqueue(outq, range, range->sru.data.datablksz);
 			range = get_next_range_nofree(inq, range);
 			break;
 		case HOLE:
 		case OBJECT:
 		case OBJECT_RANGE:
 		case REDACT: // Redacted blocks must exist
 			bqueue_enqueue(outq, range, sizeof (*range));
 			range = get_next_range_nofree(inq, range);
 			break;
 		case PREVIOUSLY_REDACTED: {
 			/*
 			 * This entry came from the "from bookmark" when
 			 * sending from a bookmark that has a redaction
 			 * list.  We need to check if this object/blkid
 			 * exists in the target ("to") dataset, and if
 			 * not then we drop this entry.  We also need
 			 * to fill in the block pointer so that we know
 			 * what to prefetch.
 			 *
 			 * To accomplish the above, we first cache whether or
 			 * not the last object we examined exists.  If it
 			 * doesn't, we can drop this record. If it does, we hold
 			 * the dnode and use it to call dbuf_dnode_findbp. We do
 			 * this instead of dbuf_bookmark_findbp because we will
 			 * often operate on large ranges, and holding the dnode
 			 * once is more efficient.
 			 */
 			boolean_t object_exists = B_TRUE;
 			/*
 			 * If the data is redacted, we only care if it exists,
 			 * so that we don't send records for objects that have
 			 * been deleted.
 			 */
 			dnode_t *dn;
 			if (range->object == last_obj && !last_obj_exists) {
 				/*
 				 * If we're still examining the same object as
 				 * previously, and it doesn't exist, we don't
 				 * need to call dbuf_bookmark_findbp.
 				 */
 				object_exists = B_FALSE;
 			} else {
 				err = dnode_hold(os, range->object, FTAG, &dn);
 				if (err == ENOENT) {
 					object_exists = B_FALSE;
 					err = 0;
 				}
 				last_obj = range->object;
 				last_obj_exists = object_exists;
 			}
 
 			if (err != 0) {
 				break;
 			} else if (!object_exists) {
 				/*
 				 * The block was modified, but doesn't
 				 * exist in the to dataset; if it was
 				 * deleted in the to dataset, then we'll
 				 * visit the hole bp for it at some point.
 				 */
 				range = get_next_range(inq, range);
 				continue;
 			}
 			uint64_t file_max =
 			    MIN(dn->dn_maxblkid, range->end_blkid);
 			/*
 			 * The object exists, so we need to try to find the
 			 * blkptr for each block in the range we're processing.
 			 */
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
 			for (uint64_t blkid = range->start_blkid;
 			    blkid < file_max; blkid++) {
 				blkptr_t bp;
 				uint32_t datablksz =
 				    dn->dn_phys->dn_datablkszsec <<
 				    SPA_MINBLOCKSHIFT;
 				uint64_t offset = blkid * datablksz;
 				/*
 				 * This call finds the next non-hole block in
 				 * the object. This is to prevent a
 				 * performance problem where we're unredacting
 				 * a large hole. Using dnode_next_offset to
 				 * skip over the large hole avoids iterating
 				 * over every block in it.
 				 */
 				err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
 				    &offset, 1, 1, 0);
 				if (err == ESRCH) {
 					offset = UINT64_MAX;
 					err = 0;
 				} else if (err != 0) {
 					break;
 				}
 				if (offset != blkid * datablksz) {
 					/*
 					 * if there is a hole from here
 					 * (blkid) to offset
 					 */
 					offset = MIN(offset, file_max *
 					    datablksz);
 					uint64_t nblks = (offset / datablksz) -
 					    blkid;
 					enqueue_range(srta, outq, dn, blkid,
 					    nblks, NULL, datablksz);
 					blkid += nblks;
 				}
 				if (blkid >= file_max)
 					break;
 				err = dbuf_dnode_findbp(dn, 0, blkid, &bp,
 				    NULL, NULL);
 				if (err != 0)
 					break;
 				ASSERT(!BP_IS_HOLE(&bp));
 				enqueue_range(srta, outq, dn, blkid, 1, &bp,
 				    datablksz);
 			}
 			rw_exit(&dn->dn_struct_rwlock);
 			dnode_rele(dn, FTAG);
 			range = get_next_range(inq, range);
 		}
 		}
 	}
 	if (srta->cancel || err != 0) {
 		smta->cancel = B_TRUE;
 		srta->error = err;
 	} else if (smta->error != 0) {
 		srta->error = smta->error;
 	}
 	while (!range->eos_marker)
 		range = get_next_range(inq, range);
 
 	bqueue_enqueue_flush(outq, range, 1);
 	spl_fstrans_unmark(cookie);
 	thread_exit();
 }
 
 #define	NUM_SNAPS_NOT_REDACTED UINT64_MAX
 
 struct dmu_send_params {
 	/* Pool args */
 	const void *tag; // Tag dp was held with, will be used to release dp.
 	dsl_pool_t *dp;
 	/* To snapshot args */
 	const char *tosnap;
 	dsl_dataset_t *to_ds;
 	/* From snapshot args */
 	zfs_bookmark_phys_t ancestor_zb;
 	uint64_t *fromredactsnaps;
 	/* NUM_SNAPS_NOT_REDACTED if not sending from redaction bookmark */
 	uint64_t numfromredactsnaps;
 	/* Stream params */
 	boolean_t is_clone;
 	boolean_t embedok;
 	boolean_t large_block_ok;
 	boolean_t compressok;
 	boolean_t rawok;
 	boolean_t savedok;
 	uint64_t resumeobj;
 	uint64_t resumeoff;
 	uint64_t saved_guid;
 	zfs_bookmark_phys_t *redactbook;
 	/* Stream output params */
 	dmu_send_outparams_t *dso;
 
 	/* Stream progress params */
 	offset_t *off;
 	int outfd;
 	char saved_toname[MAXNAMELEN];
 };
 
 static int
 setup_featureflags(struct dmu_send_params *dspp, objset_t *os,
     uint64_t *featureflags)
 {
 	dsl_dataset_t *to_ds = dspp->to_ds;
 	dsl_pool_t *dp = dspp->dp;
 
 	if (dmu_objset_type(os) == DMU_OST_ZFS) {
 		uint64_t version;
 		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0)
 			return (SET_ERROR(EINVAL));
 
 		if (version >= ZPL_VERSION_SA)
 			*featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
 	}
 
 	/* raw sends imply large_block_ok */
 	if ((dspp->rawok || dspp->large_block_ok) &&
 	    dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_BLOCKS)) {
 		*featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
 	}
 
 	/* encrypted datasets will not have embedded blocks */
 	if ((dspp->embedok || dspp->rawok) && !os->os_encrypted &&
 	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
 		*featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
 	}
 
 	/* raw send implies compressok */
 	if (dspp->compressok || dspp->rawok)
 		*featureflags |= DMU_BACKUP_FEATURE_COMPRESSED;
 
 	if (dspp->rawok && os->os_encrypted)
 		*featureflags |= DMU_BACKUP_FEATURE_RAW;
 
 	if ((*featureflags &
 	    (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED |
 	    DMU_BACKUP_FEATURE_RAW)) != 0 &&
 	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) {
 		*featureflags |= DMU_BACKUP_FEATURE_LZ4;
 	}
 
 	/*
 	 * We specifically do not include DMU_BACKUP_FEATURE_EMBED_DATA here to
 	 * allow sending ZSTD compressed datasets to a receiver that does not
 	 * support ZSTD
 	 */
 	if ((*featureflags &
 	    (DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_RAW)) != 0 &&
 	    dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_ZSTD_COMPRESS)) {
 		*featureflags |= DMU_BACKUP_FEATURE_ZSTD;
 	}
 
 	if (dspp->resumeobj != 0 || dspp->resumeoff != 0) {
 		*featureflags |= DMU_BACKUP_FEATURE_RESUMING;
 	}
 
 	if (dspp->redactbook != NULL) {
 		*featureflags |= DMU_BACKUP_FEATURE_REDACTED;
 	}
 
 	if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_DNODE)) {
 		*featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE;
 	}
 	return (0);
 }
 
 static dmu_replay_record_t *
 create_begin_record(struct dmu_send_params *dspp, objset_t *os,
     uint64_t featureflags)
 {
 	dmu_replay_record_t *drr = kmem_zalloc(sizeof (dmu_replay_record_t),
 	    KM_SLEEP);
 	drr->drr_type = DRR_BEGIN;
 
 	struct drr_begin *drrb = &drr->drr_u.drr_begin;
 	dsl_dataset_t *to_ds = dspp->to_ds;
 
 	drrb->drr_magic = DMU_BACKUP_MAGIC;
 	drrb->drr_creation_time = dsl_dataset_phys(to_ds)->ds_creation_time;
 	drrb->drr_type = dmu_objset_type(os);
 	drrb->drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
 	drrb->drr_fromguid = dspp->ancestor_zb.zbm_guid;
 
 	DMU_SET_STREAM_HDRTYPE(drrb->drr_versioninfo, DMU_SUBSTREAM);
 	DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, featureflags);
 
 	if (dspp->is_clone)
 		drrb->drr_flags |= DRR_FLAG_CLONE;
 	if (dsl_dataset_phys(dspp->to_ds)->ds_flags & DS_FLAG_CI_DATASET)
 		drrb->drr_flags |= DRR_FLAG_CI_DATA;
 	if (zfs_send_set_freerecords_bit)
 		drrb->drr_flags |= DRR_FLAG_FREERECORDS;
 	drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_SPILL_BLOCK;
 
 	if (dspp->savedok) {
 		drrb->drr_toguid = dspp->saved_guid;
 		strlcpy(drrb->drr_toname, dspp->saved_toname,
 		    sizeof (drrb->drr_toname));
 	} else {
 		dsl_dataset_name(to_ds, drrb->drr_toname);
 		if (!to_ds->ds_is_snapshot) {
 			(void) strlcat(drrb->drr_toname, "@--head--",
 			    sizeof (drrb->drr_toname));
 		}
 	}
 	return (drr);
 }
 
 static void
 setup_to_thread(struct send_thread_arg *to_arg, objset_t *to_os,
     dmu_sendstatus_t *dssp, uint64_t fromtxg, boolean_t rawok)
 {
 	VERIFY0(bqueue_init(&to_arg->q, zfs_send_no_prefetch_queue_ff,
 	    MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize),
 	    offsetof(struct send_range, ln)));
 	to_arg->error_code = 0;
 	to_arg->cancel = B_FALSE;
 	to_arg->os = to_os;
 	to_arg->fromtxg = fromtxg;
 	to_arg->flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA;
 	if (rawok)
 		to_arg->flags |= TRAVERSE_NO_DECRYPT;
 	if (zfs_send_corrupt_data)
 		to_arg->flags |= TRAVERSE_HARD;
 	to_arg->num_blocks_visited = &dssp->dss_blocks;
 	(void) thread_create(NULL, 0, send_traverse_thread, to_arg, 0,
 	    curproc, TS_RUN, minclsyspri);
 }
 
 static void
 setup_from_thread(struct redact_list_thread_arg *from_arg,
     redaction_list_t *from_rl, dmu_sendstatus_t *dssp)
 {
 	VERIFY0(bqueue_init(&from_arg->q, zfs_send_no_prefetch_queue_ff,
 	    MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize),
 	    offsetof(struct send_range, ln)));
 	from_arg->error_code = 0;
 	from_arg->cancel = B_FALSE;
 	from_arg->rl = from_rl;
 	from_arg->mark_redact = B_FALSE;
 	from_arg->num_blocks_visited = &dssp->dss_blocks;
 	/*
 	 * If from_ds is null, send_traverse_thread just returns success and
 	 * enqueues an eos marker.
 	 */
 	(void) thread_create(NULL, 0, redact_list_thread, from_arg, 0,
 	    curproc, TS_RUN, minclsyspri);
 }
 
 static void
 setup_redact_list_thread(struct redact_list_thread_arg *rlt_arg,
     struct dmu_send_params *dspp, redaction_list_t *rl, dmu_sendstatus_t *dssp)
 {
 	if (dspp->redactbook == NULL)
 		return;
 
 	rlt_arg->cancel = B_FALSE;
 	VERIFY0(bqueue_init(&rlt_arg->q, zfs_send_no_prefetch_queue_ff,
 	    MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize),
 	    offsetof(struct send_range, ln)));
 	rlt_arg->error_code = 0;
 	rlt_arg->mark_redact = B_TRUE;
 	rlt_arg->rl = rl;
 	rlt_arg->num_blocks_visited = &dssp->dss_blocks;
 
 	(void) thread_create(NULL, 0, redact_list_thread, rlt_arg, 0,
 	    curproc, TS_RUN, minclsyspri);
 }
 
 static void
 setup_merge_thread(struct send_merge_thread_arg *smt_arg,
     struct dmu_send_params *dspp, struct redact_list_thread_arg *from_arg,
     struct send_thread_arg *to_arg, struct redact_list_thread_arg *rlt_arg,
     objset_t *os)
 {
 	VERIFY0(bqueue_init(&smt_arg->q, zfs_send_no_prefetch_queue_ff,
 	    MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize),
 	    offsetof(struct send_range, ln)));
 	smt_arg->cancel = B_FALSE;
 	smt_arg->error = 0;
 	smt_arg->from_arg = from_arg;
 	smt_arg->to_arg = to_arg;
 	if (dspp->redactbook != NULL)
 		smt_arg->redact_arg = rlt_arg;
 
 	smt_arg->os = os;
 	(void) thread_create(NULL, 0, send_merge_thread, smt_arg, 0, curproc,
 	    TS_RUN, minclsyspri);
 }
 
 static void
 setup_reader_thread(struct send_reader_thread_arg *srt_arg,
     struct dmu_send_params *dspp, struct send_merge_thread_arg *smt_arg,
     uint64_t featureflags)
 {
 	VERIFY0(bqueue_init(&srt_arg->q, zfs_send_queue_ff,
 	    MAX(zfs_send_queue_length, 2 * zfs_max_recordsize),
 	    offsetof(struct send_range, ln)));
 	srt_arg->smta = smt_arg;
 	srt_arg->issue_reads = !dspp->dso->dso_dryrun;
 	srt_arg->featureflags = featureflags;
 	(void) thread_create(NULL, 0, send_reader_thread, srt_arg, 0,
 	    curproc, TS_RUN, minclsyspri);
 }
 
 static int
 setup_resume_points(struct dmu_send_params *dspp,
     struct send_thread_arg *to_arg, struct redact_list_thread_arg *from_arg,
     struct redact_list_thread_arg *rlt_arg,
     struct send_merge_thread_arg *smt_arg, boolean_t resuming, objset_t *os,
     redaction_list_t *redact_rl, nvlist_t *nvl)
 {
 	(void) smt_arg;
 	dsl_dataset_t *to_ds = dspp->to_ds;
 	int err = 0;
 
 	uint64_t obj = 0;
 	uint64_t blkid = 0;
 	if (resuming) {
 		obj = dspp->resumeobj;
 		dmu_object_info_t to_doi;
 		err = dmu_object_info(os, obj, &to_doi);
 		if (err != 0)
 			return (err);
 
 		blkid = dspp->resumeoff / to_doi.doi_data_block_size;
 	}
 	/*
 	 * If we're resuming a redacted send, we can skip to the appropriate
 	 * point in the redaction bookmark by binary searching through it.
 	 */
 	if (redact_rl != NULL) {
 		SET_BOOKMARK(&rlt_arg->resume, to_ds->ds_object, obj, 0, blkid);
 	}
 
 	SET_BOOKMARK(&to_arg->resume, to_ds->ds_object, obj, 0, blkid);
 	if (nvlist_exists(nvl, BEGINNV_REDACT_FROM_SNAPS)) {
 		uint64_t objset = dspp->ancestor_zb.zbm_redaction_obj;
 		/*
 		 * Note: If the resume point is in an object whose
 		 * blocksize is different in the from vs to snapshots,
 		 * we will have divided by the "wrong" blocksize.
 		 * However, in this case fromsnap's send_cb() will
 		 * detect that the blocksize has changed and therefore
 		 * ignore this object.
 		 *
 		 * If we're resuming a send from a redaction bookmark,
 		 * we still cannot accidentally suggest blocks behind
 		 * the to_ds.  In addition, we know that any blocks in
 		 * the object in the to_ds will have to be sent, since
 		 * the size changed.  Therefore, we can't cause any harm
 		 * this way either.
 		 */
 		SET_BOOKMARK(&from_arg->resume, objset, obj, 0, blkid);
 	}
 	if (resuming) {
 		fnvlist_add_uint64(nvl, BEGINNV_RESUME_OBJECT, dspp->resumeobj);
 		fnvlist_add_uint64(nvl, BEGINNV_RESUME_OFFSET, dspp->resumeoff);
 	}
 	return (0);
 }
 
 static dmu_sendstatus_t *
 setup_send_progress(struct dmu_send_params *dspp)
 {
 	dmu_sendstatus_t *dssp = kmem_zalloc(sizeof (*dssp), KM_SLEEP);
 	dssp->dss_outfd = dspp->outfd;
 	dssp->dss_off = dspp->off;
 	dssp->dss_proc = curproc;
 	mutex_enter(&dspp->to_ds->ds_sendstream_lock);
 	list_insert_head(&dspp->to_ds->ds_sendstreams, dssp);
 	mutex_exit(&dspp->to_ds->ds_sendstream_lock);
 	return (dssp);
 }
 
 /*
  * Actually do the bulk of the work in a zfs send.
  *
  * The idea is that we want to do a send from ancestor_zb to to_ds.  We also
  * want to not send any data that has been modified by all the datasets in
  * redactsnaparr, and store the list of blocks that are redacted in this way in
  * a bookmark named redactbook, created on the to_ds.  We do this by creating
  * several worker threads, whose function is described below.
  *
  * There are three cases.
  * The first case is a redacted zfs send.  In this case there are 5 threads.
  * The first thread is the to_ds traversal thread: it calls dataset_traverse on
  * the to_ds and finds all the blocks that have changed since ancestor_zb (if
  * it's a full send, that's all blocks in the dataset).  It then sends those
  * blocks on to the send merge thread. The redact list thread takes the data
  * from the redaction bookmark and sends those blocks on to the send merge
  * thread.  The send merge thread takes the data from the to_ds traversal
  * thread, and combines it with the redaction records from the redact list
  * thread.  If a block appears in both the to_ds's data and the redaction data,
  * the send merge thread will mark it as redacted and send it on to the prefetch
  * thread.  Otherwise, the send merge thread will send the block on to the
  * prefetch thread unchanged. The prefetch thread will issue prefetch reads for
  * any data that isn't redacted, and then send the data on to the main thread.
  * The main thread behaves the same as in a normal send case, issuing demand
  * reads for data blocks and sending out records over the network
  *
  * The graphic below diagrams the flow of data in the case of a redacted zfs
  * send.  Each box represents a thread, and each line represents the flow of
  * data.
  *
  *             Records from the |
  *           redaction bookmark |
  * +--------------------+       |  +---------------------------+
  * |                    |       v  | Send Merge Thread         |
  * | Redact List Thread +----------> Apply redaction marks to  |
  * |                    |          | records as specified by   |
  * +--------------------+          | redaction ranges          |
  *                                 +----^---------------+------+
  *                                      |               | Merged data
  *                                      |               |
  *                                      |  +------------v--------+
  *                                      |  | Prefetch Thread     |
  * +--------------------+               |  | Issues prefetch     |
  * | to_ds Traversal    |               |  | reads of data blocks|
  * | Thread (finds      +---------------+  +------------+--------+
  * | candidate blocks)  |  Blocks modified              | Prefetched data
  * +--------------------+  by to_ds since               |
  *                         ancestor_zb     +------------v----+
  *                                         | Main Thread     |  File Descriptor
  *                                         | Sends data over +->(to zfs receive)
  *                                         | wire            |
  *                                         +-----------------+
  *
  * The second case is an incremental send from a redaction bookmark.  The to_ds
  * traversal thread and the main thread behave the same as in the redacted
  * send case.  The new thread is the from bookmark traversal thread.  It
  * iterates over the redaction list in the redaction bookmark, and enqueues
  * records for each block that was redacted in the original send.  The send
  * merge thread now has to merge the data from the two threads.  For details
  * about that process, see the header comment of send_merge_thread().  Any data
  * it decides to send on will be prefetched by the prefetch thread.  Note that
  * you can perform a redacted send from a redaction bookmark; in that case,
  * the data flow behaves very similarly to the flow in the redacted send case,
  * except with the addition of the bookmark traversal thread iterating over the
  * redaction bookmark.  The send_merge_thread also has to take on the
  * responsibility of merging the redact list thread's records, the bookmark
  * traversal thread's records, and the to_ds records.
  *
  * +---------------------+
  * |                     |
  * | Redact List Thread  +--------------+
  * |                     |              |
  * +---------------------+              |
  *        Blocks in redaction list      | Ranges modified by every secure snap
  *        of from bookmark              | (or EOS if not readcted)
  *                                      |
  * +---------------------+   |     +----v----------------------+
  * | bookmark Traversal  |   v     | Send Merge Thread         |
  * | Thread (finds       +---------> Merges bookmark, rlt, and |
  * | candidate blocks)   |         | to_ds send records        |
  * +---------------------+         +----^---------------+------+
  *                                      |               | Merged data
  *                                      |  +------------v--------+
  *                                      |  | Prefetch Thread     |
  * +--------------------+               |  | Issues prefetch     |
  * | to_ds Traversal    |               |  | reads of data blocks|
  * | Thread (finds      +---------------+  +------------+--------+
  * | candidate blocks)  |  Blocks modified              | Prefetched data
  * +--------------------+  by to_ds since  +------------v----+
  *                         ancestor_zb     | Main Thread     |  File Descriptor
  *                                         | Sends data over +->(to zfs receive)
  *                                         | wire            |
  *                                         +-----------------+
  *
  * The final case is a simple zfs full or incremental send.  The to_ds traversal
  * thread behaves the same as always. The redact list thread is never started.
  * The send merge thread takes all the blocks that the to_ds traversal thread
  * sends it, prefetches the data, and sends the blocks on to the main thread.
  * The main thread sends the data over the wire.
  *
  * To keep performance acceptable, we want to prefetch the data in the worker
  * threads.  While the to_ds thread could simply use the TRAVERSE_PREFETCH
  * feature built into traverse_dataset, the combining and deletion of records
  * due to redaction and sends from redaction bookmarks mean that we could
  * issue many unnecessary prefetches.  As a result, we only prefetch data
  * after we've determined that the record is not going to be redacted.  To
  * prevent the prefetching from getting too far ahead of the main thread, the
  * blocking queues that are used for communication are capped not by the
  * number of entries in the queue, but by the sum of the size of the
  * prefetches associated with them.  The limit on the amount of data that the
  * thread can prefetch beyond what the main thread has reached is controlled
  * by the global variable zfs_send_queue_length.  In addition, to prevent poor
  * performance in the beginning of a send, we also limit the distance ahead
  * that the traversal threads can be.  That distance is controlled by the
  * zfs_send_no_prefetch_queue_length tunable.
  *
  * Note: Releases dp using the specified tag.
  */
 static int
 dmu_send_impl(struct dmu_send_params *dspp)
 {
 	objset_t *os;
 	dmu_replay_record_t *drr;
 	dmu_sendstatus_t *dssp;
 	dmu_send_cookie_t dsc = {0};
 	int err;
 	uint64_t fromtxg = dspp->ancestor_zb.zbm_creation_txg;
 	uint64_t featureflags = 0;
 	struct redact_list_thread_arg *from_arg;
 	struct send_thread_arg *to_arg;
 	struct redact_list_thread_arg *rlt_arg;
 	struct send_merge_thread_arg *smt_arg;
 	struct send_reader_thread_arg *srt_arg;
 	struct send_range *range;
 	redaction_list_t *from_rl = NULL;
 	redaction_list_t *redact_rl = NULL;
 	boolean_t resuming = (dspp->resumeobj != 0 || dspp->resumeoff != 0);
 	boolean_t book_resuming = resuming;
 
 	dsl_dataset_t *to_ds = dspp->to_ds;
 	zfs_bookmark_phys_t *ancestor_zb = &dspp->ancestor_zb;
 	dsl_pool_t *dp = dspp->dp;
 	const void *tag = dspp->tag;
 
 	err = dmu_objset_from_ds(to_ds, &os);
 	if (err != 0) {
 		dsl_pool_rele(dp, tag);
 		return (err);
 	}
 
 	/*
 	 * If this is a non-raw send of an encrypted ds, we can ensure that
 	 * the objset_phys_t is authenticated. This is safe because this is
 	 * either a snapshot or we have owned the dataset, ensuring that
 	 * it can't be modified.
 	 */
 	if (!dspp->rawok && os->os_encrypted &&
 	    arc_is_unauthenticated(os->os_phys_buf)) {
 		zbookmark_phys_t zb;
 
 		SET_BOOKMARK(&zb, to_ds->ds_object, ZB_ROOT_OBJECT,
 		    ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 		err = arc_untransform(os->os_phys_buf, os->os_spa,
 		    &zb, B_FALSE);
 		if (err != 0) {
 			dsl_pool_rele(dp, tag);
 			return (err);
 		}
 
 		ASSERT0(arc_is_unauthenticated(os->os_phys_buf));
 	}
 
 	if ((err = setup_featureflags(dspp, os, &featureflags)) != 0) {
 		dsl_pool_rele(dp, tag);
 		return (err);
 	}
 
 	/*
 	 * If we're doing a redacted send, hold the bookmark's redaction list.
 	 */
 	if (dspp->redactbook != NULL) {
 		err = dsl_redaction_list_hold_obj(dp,
 		    dspp->redactbook->zbm_redaction_obj, FTAG,
 		    &redact_rl);
 		if (err != 0) {
 			dsl_pool_rele(dp, tag);
 			return (SET_ERROR(EINVAL));
 		}
 		dsl_redaction_list_long_hold(dp, redact_rl, FTAG);
 	}
 
 	/*
 	 * If we're sending from a redaction bookmark, hold the redaction list
 	 * so that we can consider sending the redacted blocks.
 	 */
 	if (ancestor_zb->zbm_redaction_obj != 0) {
 		err = dsl_redaction_list_hold_obj(dp,
 		    ancestor_zb->zbm_redaction_obj, FTAG, &from_rl);
 		if (err != 0) {
 			if (redact_rl != NULL) {
 				dsl_redaction_list_long_rele(redact_rl, FTAG);
 				dsl_redaction_list_rele(redact_rl, FTAG);
 			}
 			dsl_pool_rele(dp, tag);
 			return (SET_ERROR(EINVAL));
 		}
 		dsl_redaction_list_long_hold(dp, from_rl, FTAG);
 	}
 
 	dsl_dataset_long_hold(to_ds, FTAG);
 
 	from_arg = kmem_zalloc(sizeof (*from_arg), KM_SLEEP);
 	to_arg = kmem_zalloc(sizeof (*to_arg), KM_SLEEP);
 	rlt_arg = kmem_zalloc(sizeof (*rlt_arg), KM_SLEEP);
 	smt_arg = kmem_zalloc(sizeof (*smt_arg), KM_SLEEP);
 	srt_arg = kmem_zalloc(sizeof (*srt_arg), KM_SLEEP);
 
 	drr = create_begin_record(dspp, os, featureflags);
 	dssp = setup_send_progress(dspp);
 
 	dsc.dsc_drr = drr;
 	dsc.dsc_dso = dspp->dso;
 	dsc.dsc_os = os;
 	dsc.dsc_off = dspp->off;
 	dsc.dsc_toguid = dsl_dataset_phys(to_ds)->ds_guid;
 	dsc.dsc_fromtxg = fromtxg;
 	dsc.dsc_pending_op = PENDING_NONE;
 	dsc.dsc_featureflags = featureflags;
 	dsc.dsc_resume_object = dspp->resumeobj;
 	dsc.dsc_resume_offset = dspp->resumeoff;
 
 	dsl_pool_rele(dp, tag);
 
 	void *payload = NULL;
 	size_t payload_len = 0;
 	nvlist_t *nvl = fnvlist_alloc();
 
 	/*
 	 * If we're doing a redacted send, we include the snapshots we're
 	 * redacted with respect to so that the target system knows what send
 	 * streams can be correctly received on top of this dataset. If we're
 	 * instead sending a redacted dataset, we include the snapshots that the
 	 * dataset was created with respect to.
 	 */
 	if (dspp->redactbook != NULL) {
 		fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_SNAPS,
 		    redact_rl->rl_phys->rlp_snaps,
 		    redact_rl->rl_phys->rlp_num_snaps);
 	} else if (dsl_dataset_feature_is_active(to_ds,
 	    SPA_FEATURE_REDACTED_DATASETS)) {
 		uint64_t *tods_guids;
 		uint64_t length;
 		VERIFY(dsl_dataset_get_uint64_array_feature(to_ds,
 		    SPA_FEATURE_REDACTED_DATASETS, &length, &tods_guids));
 		fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_SNAPS, tods_guids,
 		    length);
 	}
 
 	/*
 	 * If we're sending from a redaction bookmark, then we should retrieve
 	 * the guids of that bookmark so we can send them over the wire.
 	 */
 	if (from_rl != NULL) {
 		fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_FROM_SNAPS,
 		    from_rl->rl_phys->rlp_snaps,
 		    from_rl->rl_phys->rlp_num_snaps);
 	}
 
 	/*
 	 * If the snapshot we're sending from is redacted, include the redaction
 	 * list in the stream.
 	 */
 	if (dspp->numfromredactsnaps != NUM_SNAPS_NOT_REDACTED) {
 		ASSERT3P(from_rl, ==, NULL);
 		fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_FROM_SNAPS,
 		    dspp->fromredactsnaps, (uint_t)dspp->numfromredactsnaps);
 		if (dspp->numfromredactsnaps > 0) {
 			kmem_free(dspp->fromredactsnaps,
 			    dspp->numfromredactsnaps * sizeof (uint64_t));
 			dspp->fromredactsnaps = NULL;
 		}
 	}
 
 	if (resuming || book_resuming) {
 		err = setup_resume_points(dspp, to_arg, from_arg,
 		    rlt_arg, smt_arg, resuming, os, redact_rl, nvl);
 		if (err != 0)
 			goto out;
 	}
 
 	if (featureflags & DMU_BACKUP_FEATURE_RAW) {
 		uint64_t ivset_guid = ancestor_zb->zbm_ivset_guid;
 		nvlist_t *keynvl = NULL;
 		ASSERT(os->os_encrypted);
 
 		err = dsl_crypto_populate_key_nvlist(os, ivset_guid,
 		    &keynvl);
 		if (err != 0) {
 			fnvlist_free(nvl);
 			goto out;
 		}
 
 		fnvlist_add_nvlist(nvl, "crypt_keydata", keynvl);
 		fnvlist_free(keynvl);
 	}
 
 	if (!nvlist_empty(nvl)) {
 		payload = fnvlist_pack(nvl, &payload_len);
 		drr->drr_payloadlen = payload_len;
 	}
 
 	fnvlist_free(nvl);
 	err = dump_record(&dsc, payload, payload_len);
 	fnvlist_pack_free(payload, payload_len);
 	if (err != 0) {
 		err = dsc.dsc_err;
 		goto out;
 	}
 
 	setup_to_thread(to_arg, os, dssp, fromtxg, dspp->rawok);
 	setup_from_thread(from_arg, from_rl, dssp);
 	setup_redact_list_thread(rlt_arg, dspp, redact_rl, dssp);
 	setup_merge_thread(smt_arg, dspp, from_arg, to_arg, rlt_arg, os);
 	setup_reader_thread(srt_arg, dspp, smt_arg, featureflags);
 
 	range = bqueue_dequeue(&srt_arg->q);
 	while (err == 0 && !range->eos_marker) {
 		err = do_dump(&dsc, range);
 		range = get_next_range(&srt_arg->q, range);
 		if (issig(JUSTLOOKING) && issig(FORREAL))
 			err = SET_ERROR(EINTR);
 	}
 
 	/*
 	 * If we hit an error or are interrupted, cancel our worker threads and
 	 * clear the queue of any pending records.  The threads will pass the
 	 * cancel up the tree of worker threads, and each one will clean up any
 	 * pending records before exiting.
 	 */
 	if (err != 0) {
 		srt_arg->cancel = B_TRUE;
 		while (!range->eos_marker) {
 			range = get_next_range(&srt_arg->q, range);
 		}
 	}
 	range_free(range);
 
 	bqueue_destroy(&srt_arg->q);
 	bqueue_destroy(&smt_arg->q);
 	if (dspp->redactbook != NULL)
 		bqueue_destroy(&rlt_arg->q);
 	bqueue_destroy(&to_arg->q);
 	bqueue_destroy(&from_arg->q);
 
 	if (err == 0 && srt_arg->error != 0)
 		err = srt_arg->error;
 
 	if (err != 0)
 		goto out;
 
 	if (dsc.dsc_pending_op != PENDING_NONE)
 		if (dump_record(&dsc, NULL, 0) != 0)
 			err = SET_ERROR(EINTR);
 
 	if (err != 0) {
 		if (err == EINTR && dsc.dsc_err != 0)
 			err = dsc.dsc_err;
 		goto out;
 	}
 
 	/*
 	 * Send the DRR_END record if this is not a saved stream.
 	 * Otherwise, the omitted DRR_END record will signal to
 	 * the receive side that the stream is incomplete.
 	 */
 	if (!dspp->savedok) {
 		memset(drr, 0, sizeof (dmu_replay_record_t));
 		drr->drr_type = DRR_END;
 		drr->drr_u.drr_end.drr_checksum = dsc.dsc_zc;
 		drr->drr_u.drr_end.drr_toguid = dsc.dsc_toguid;
 
 		if (dump_record(&dsc, NULL, 0) != 0)
 			err = dsc.dsc_err;
 	}
 out:
 	mutex_enter(&to_ds->ds_sendstream_lock);
 	list_remove(&to_ds->ds_sendstreams, dssp);
 	mutex_exit(&to_ds->ds_sendstream_lock);
 
 	VERIFY(err != 0 || (dsc.dsc_sent_begin &&
 	    (dsc.dsc_sent_end || dspp->savedok)));
 
 	kmem_free(drr, sizeof (dmu_replay_record_t));
 	kmem_free(dssp, sizeof (dmu_sendstatus_t));
 	kmem_free(from_arg, sizeof (*from_arg));
 	kmem_free(to_arg, sizeof (*to_arg));
 	kmem_free(rlt_arg, sizeof (*rlt_arg));
 	kmem_free(smt_arg, sizeof (*smt_arg));
 	kmem_free(srt_arg, sizeof (*srt_arg));
 
 	dsl_dataset_long_rele(to_ds, FTAG);
 	if (from_rl != NULL) {
 		dsl_redaction_list_long_rele(from_rl, FTAG);
 		dsl_redaction_list_rele(from_rl, FTAG);
 	}
 	if (redact_rl != NULL) {
 		dsl_redaction_list_long_rele(redact_rl, FTAG);
 		dsl_redaction_list_rele(redact_rl, FTAG);
 	}
 
 	return (err);
 }
 
 int
 dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
     boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
     boolean_t rawok, boolean_t savedok, int outfd, offset_t *off,
     dmu_send_outparams_t *dsop)
 {
 	int err;
 	dsl_dataset_t *fromds;
 	ds_hold_flags_t dsflags;
 	struct dmu_send_params dspp = {0};
 	dspp.embedok = embedok;
 	dspp.large_block_ok = large_block_ok;
 	dspp.compressok = compressok;
 	dspp.outfd = outfd;
 	dspp.off = off;
 	dspp.dso = dsop;
 	dspp.tag = FTAG;
 	dspp.rawok = rawok;
 	dspp.savedok = savedok;
 
 	dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT;
 	err = dsl_pool_hold(pool, FTAG, &dspp.dp);
 	if (err != 0)
 		return (err);
 
 	err = dsl_dataset_hold_obj_flags(dspp.dp, tosnap, dsflags, FTAG,
 	    &dspp.to_ds);
 	if (err != 0) {
 		dsl_pool_rele(dspp.dp, FTAG);
 		return (err);
 	}
 
 	if (fromsnap != 0) {
 		err = dsl_dataset_hold_obj_flags(dspp.dp, fromsnap, dsflags,
 		    FTAG, &fromds);
 		if (err != 0) {
 			dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG);
 			dsl_pool_rele(dspp.dp, FTAG);
 			return (err);
 		}
 		dspp.ancestor_zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
 		dspp.ancestor_zb.zbm_creation_txg =
 		    dsl_dataset_phys(fromds)->ds_creation_txg;
 		dspp.ancestor_zb.zbm_creation_time =
 		    dsl_dataset_phys(fromds)->ds_creation_time;
 
 		if (dsl_dataset_is_zapified(fromds)) {
 			(void) zap_lookup(dspp.dp->dp_meta_objset,
 			    fromds->ds_object, DS_FIELD_IVSET_GUID, 8, 1,
 			    &dspp.ancestor_zb.zbm_ivset_guid);
 		}
 
 		/* See dmu_send for the reasons behind this. */
 		uint64_t *fromredact;
 
 		if (!dsl_dataset_get_uint64_array_feature(fromds,
 		    SPA_FEATURE_REDACTED_DATASETS,
 		    &dspp.numfromredactsnaps,
 		    &fromredact)) {
 			dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED;
 		} else if (dspp.numfromredactsnaps > 0) {
 			uint64_t size = dspp.numfromredactsnaps *
 			    sizeof (uint64_t);
 			dspp.fromredactsnaps = kmem_zalloc(size, KM_SLEEP);
 			memcpy(dspp.fromredactsnaps, fromredact, size);
 		}
 
 		boolean_t is_before =
 		    dsl_dataset_is_before(dspp.to_ds, fromds, 0);
 		dspp.is_clone = (dspp.to_ds->ds_dir !=
 		    fromds->ds_dir);
 		dsl_dataset_rele(fromds, FTAG);
 		if (!is_before) {
 			dsl_pool_rele(dspp.dp, FTAG);
 			err = SET_ERROR(EXDEV);
 		} else {
 			err = dmu_send_impl(&dspp);
 		}
 	} else {
 		dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED;
 		err = dmu_send_impl(&dspp);
 	}
 	if (dspp.fromredactsnaps)
 		kmem_free(dspp.fromredactsnaps,
 		    dspp.numfromredactsnaps * sizeof (uint64_t));
 
 	dsl_dataset_rele(dspp.to_ds, FTAG);
 	return (err);
 }
 
 int
 dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
     boolean_t large_block_ok, boolean_t compressok, boolean_t rawok,
     boolean_t savedok, uint64_t resumeobj, uint64_t resumeoff,
     const char *redactbook, int outfd, offset_t *off,
     dmu_send_outparams_t *dsop)
 {
 	int err = 0;
 	ds_hold_flags_t dsflags;
 	boolean_t owned = B_FALSE;
 	dsl_dataset_t *fromds = NULL;
 	zfs_bookmark_phys_t book = {0};
 	struct dmu_send_params dspp = {0};
 
 	dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT;
 	dspp.tosnap = tosnap;
 	dspp.embedok = embedok;
 	dspp.large_block_ok = large_block_ok;
 	dspp.compressok = compressok;
 	dspp.outfd = outfd;
 	dspp.off = off;
 	dspp.dso = dsop;
 	dspp.tag = FTAG;
 	dspp.resumeobj = resumeobj;
 	dspp.resumeoff = resumeoff;
 	dspp.rawok = rawok;
 	dspp.savedok = savedok;
 
 	if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
 		return (SET_ERROR(EINVAL));
 
 	err = dsl_pool_hold(tosnap, FTAG, &dspp.dp);
 	if (err != 0)
 		return (err);
 
 	if (strchr(tosnap, '@') == NULL && spa_writeable(dspp.dp->dp_spa)) {
 		/*
 		 * We are sending a filesystem or volume.  Ensure
 		 * that it doesn't change by owning the dataset.
 		 */
 
 		if (savedok) {
 			/*
 			 * We are looking for the dataset that represents the
 			 * partially received send stream. If this stream was
 			 * received as a new snapshot of an existing dataset,
 			 * this will be saved in a hidden clone named
 			 * "<pool>/<dataset>/%recv". Otherwise, the stream
 			 * will be saved in the live dataset itself. In
 			 * either case we need to use dsl_dataset_own_force()
 			 * because the stream is marked as inconsistent,
 			 * which would normally make it unavailable to be
 			 * owned.
 			 */
 			char *name = kmem_asprintf("%s/%s", tosnap,
 			    recv_clone_name);
 			err = dsl_dataset_own_force(dspp.dp, name, dsflags,
 			    FTAG, &dspp.to_ds);
 			if (err == ENOENT) {
 				err = dsl_dataset_own_force(dspp.dp, tosnap,
 				    dsflags, FTAG, &dspp.to_ds);
 			}
 
 			if (err == 0) {
 				owned = B_TRUE;
 				err = zap_lookup(dspp.dp->dp_meta_objset,
 				    dspp.to_ds->ds_object,
 				    DS_FIELD_RESUME_TOGUID, 8, 1,
 				    &dspp.saved_guid);
 			}
 
 			if (err == 0) {
 				err = zap_lookup(dspp.dp->dp_meta_objset,
 				    dspp.to_ds->ds_object,
 				    DS_FIELD_RESUME_TONAME, 1,
 				    sizeof (dspp.saved_toname),
 				    dspp.saved_toname);
 			}
 			/* Only disown if there was an error in the lookups */
 			if (owned && (err != 0))
 				dsl_dataset_disown(dspp.to_ds, dsflags, FTAG);
 
 			kmem_strfree(name);
 		} else {
 			err = dsl_dataset_own(dspp.dp, tosnap, dsflags,
 			    FTAG, &dspp.to_ds);
 			if (err == 0)
 				owned = B_TRUE;
 		}
 	} else {
 		err = dsl_dataset_hold_flags(dspp.dp, tosnap, dsflags, FTAG,
 		    &dspp.to_ds);
 	}
 
 	if (err != 0) {
 		/* Note: dsl dataset is not owned at this point */
 		dsl_pool_rele(dspp.dp, FTAG);
 		return (err);
 	}
 
 	if (redactbook != NULL) {
 		char path[ZFS_MAX_DATASET_NAME_LEN];
 		(void) strlcpy(path, tosnap, sizeof (path));
 		char *at = strchr(path, '@');
 		if (at == NULL) {
 			err = EINVAL;
 		} else {
 			(void) snprintf(at, sizeof (path) - (at - path), "#%s",
 			    redactbook);
 			err = dsl_bookmark_lookup(dspp.dp, path,
 			    NULL, &book);
 			dspp.redactbook = &book;
 		}
 	}
 
 	if (err != 0) {
 		dsl_pool_rele(dspp.dp, FTAG);
 		if (owned)
 			dsl_dataset_disown(dspp.to_ds, dsflags, FTAG);
 		else
 			dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG);
 		return (err);
 	}
 
 	if (fromsnap != NULL) {
 		zfs_bookmark_phys_t *zb = &dspp.ancestor_zb;
 		int fsnamelen;
 		if (strpbrk(tosnap, "@#") != NULL)
 			fsnamelen = strpbrk(tosnap, "@#") - tosnap;
 		else
 			fsnamelen = strlen(tosnap);
 
 		/*
 		 * If the fromsnap is in a different filesystem, then
 		 * mark the send stream as a clone.
 		 */
 		if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
 		    (fromsnap[fsnamelen] != '@' &&
 		    fromsnap[fsnamelen] != '#')) {
 			dspp.is_clone = B_TRUE;
 		}
 
 		if (strchr(fromsnap, '@') != NULL) {
 			err = dsl_dataset_hold(dspp.dp, fromsnap, FTAG,
 			    &fromds);
 
 			if (err != 0) {
 				ASSERT3P(fromds, ==, NULL);
 			} else {
 				/*
 				 * We need to make a deep copy of the redact
 				 * snapshots of the from snapshot, because the
 				 * array will be freed when we evict from_ds.
 				 */
 				uint64_t *fromredact;
 				if (!dsl_dataset_get_uint64_array_feature(
 				    fromds, SPA_FEATURE_REDACTED_DATASETS,
 				    &dspp.numfromredactsnaps,
 				    &fromredact)) {
 					dspp.numfromredactsnaps =
 					    NUM_SNAPS_NOT_REDACTED;
 				} else if (dspp.numfromredactsnaps > 0) {
 					uint64_t size =
 					    dspp.numfromredactsnaps *
 					    sizeof (uint64_t);
 					dspp.fromredactsnaps = kmem_zalloc(size,
 					    KM_SLEEP);
 					memcpy(dspp.fromredactsnaps, fromredact,
 					    size);
 				}
 				if (!dsl_dataset_is_before(dspp.to_ds, fromds,
 				    0)) {
 					err = SET_ERROR(EXDEV);
 				} else {
 					zb->zbm_creation_txg =
 					    dsl_dataset_phys(fromds)->
 					    ds_creation_txg;
 					zb->zbm_creation_time =
 					    dsl_dataset_phys(fromds)->
 					    ds_creation_time;
 					zb->zbm_guid =
 					    dsl_dataset_phys(fromds)->ds_guid;
 					zb->zbm_redaction_obj = 0;
 
 					if (dsl_dataset_is_zapified(fromds)) {
 						(void) zap_lookup(
 						    dspp.dp->dp_meta_objset,
 						    fromds->ds_object,
 						    DS_FIELD_IVSET_GUID, 8, 1,
 						    &zb->zbm_ivset_guid);
 					}
 				}
 				dsl_dataset_rele(fromds, FTAG);
 			}
 		} else {
 			dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED;
 			err = dsl_bookmark_lookup(dspp.dp, fromsnap, dspp.to_ds,
 			    zb);
 			if (err == EXDEV && zb->zbm_redaction_obj != 0 &&
 			    zb->zbm_guid ==
 			    dsl_dataset_phys(dspp.to_ds)->ds_guid)
 				err = 0;
 		}
 
 		if (err == 0) {
 			/* dmu_send_impl will call dsl_pool_rele for us. */
 			err = dmu_send_impl(&dspp);
 		} else {
 			if (dspp.fromredactsnaps)
 				kmem_free(dspp.fromredactsnaps,
 				    dspp.numfromredactsnaps *
 				    sizeof (uint64_t));
 			dsl_pool_rele(dspp.dp, FTAG);
 		}
 	} else {
 		dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED;
 		err = dmu_send_impl(&dspp);
 	}
 	if (owned)
 		dsl_dataset_disown(dspp.to_ds, dsflags, FTAG);
 	else
 		dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG);
 	return (err);
 }
 
 static int
 dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed,
     uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep)
 {
 	int err = 0;
 	uint64_t size;
 	/*
 	 * Assume that space (both on-disk and in-stream) is dominated by
 	 * data.  We will adjust for indirect blocks and the copies property,
 	 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
 	 */
 
 	uint64_t recordsize;
 	uint64_t record_count;
 	objset_t *os;
 	VERIFY0(dmu_objset_from_ds(ds, &os));
 
 	/* Assume all (uncompressed) blocks are recordsize. */
 	if (zfs_override_estimate_recordsize != 0) {
 		recordsize = zfs_override_estimate_recordsize;
 	} else if (os->os_phys->os_type == DMU_OST_ZVOL) {
 		err = dsl_prop_get_int_ds(ds,
 		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize);
 	} else {
 		err = dsl_prop_get_int_ds(ds,
 		    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize);
 	}
 	if (err != 0)
 		return (err);
 	record_count = uncompressed / recordsize;
 
 	/*
 	 * If we're estimating a send size for a compressed stream, use the
 	 * compressed data size to estimate the stream size. Otherwise, use the
 	 * uncompressed data size.
 	 */
 	size = stream_compressed ? compressed : uncompressed;
 
 	/*
 	 * Subtract out approximate space used by indirect blocks.
 	 * Assume most space is used by data blocks (non-indirect, non-dnode).
 	 * Assume no ditto blocks or internal fragmentation.
 	 *
 	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
 	 * block.
 	 */
 	size -= record_count * sizeof (blkptr_t);
 
 	/* Add in the space for the record associated with each block. */
 	size += record_count * sizeof (dmu_replay_record_t);
 
 	*sizep = size;
 
 	return (0);
 }
 
 int
 dmu_send_estimate_fast(dsl_dataset_t *origds, dsl_dataset_t *fromds,
     zfs_bookmark_phys_t *frombook, boolean_t stream_compressed,
     boolean_t saved, uint64_t *sizep)
 {
 	int err;
 	dsl_dataset_t *ds = origds;
 	uint64_t uncomp, comp;
 
 	ASSERT(dsl_pool_config_held(origds->ds_dir->dd_pool));
 	ASSERT(fromds == NULL || frombook == NULL);
 
 	/*
 	 * If this is a saved send we may actually be sending
 	 * from the %recv clone used for resuming.
 	 */
 	if (saved) {
 		objset_t *mos = origds->ds_dir->dd_pool->dp_meta_objset;
 		uint64_t guid;
 		char dsname[ZFS_MAX_DATASET_NAME_LEN + 6];
 
 		dsl_dataset_name(origds, dsname);
 		(void) strcat(dsname, "/");
 		(void) strlcat(dsname, recv_clone_name, sizeof (dsname));
 
 		err = dsl_dataset_hold(origds->ds_dir->dd_pool,
 		    dsname, FTAG, &ds);
 		if (err != ENOENT && err != 0) {
 			return (err);
 		} else if (err == ENOENT) {
 			ds = origds;
 		}
 
 		/* check that this dataset has partially received data */
 		err = zap_lookup(mos, ds->ds_object,
 		    DS_FIELD_RESUME_TOGUID, 8, 1, &guid);
 		if (err != 0) {
 			err = SET_ERROR(err == ENOENT ? EINVAL : err);
 			goto out;
 		}
 
 		err = zap_lookup(mos, ds->ds_object,
 		    DS_FIELD_RESUME_TONAME, 1, sizeof (dsname), dsname);
 		if (err != 0) {
 			err = SET_ERROR(err == ENOENT ? EINVAL : err);
 			goto out;
 		}
 	}
 
 	/* tosnap must be a snapshot or the target of a saved send */
 	if (!ds->ds_is_snapshot && ds == origds)
 		return (SET_ERROR(EINVAL));
 
 	if (fromds != NULL) {
 		uint64_t used;
 		if (!fromds->ds_is_snapshot) {
 			err = SET_ERROR(EINVAL);
 			goto out;
 		}
 
 		if (!dsl_dataset_is_before(ds, fromds, 0)) {
 			err = SET_ERROR(EXDEV);
 			goto out;
 		}
 
 		err = dsl_dataset_space_written(fromds, ds, &used, &comp,
 		    &uncomp);
 		if (err != 0)
 			goto out;
 	} else if (frombook != NULL) {
 		uint64_t used;
 		err = dsl_dataset_space_written_bookmark(frombook, ds, &used,
 		    &comp, &uncomp);
 		if (err != 0)
 			goto out;
 	} else {
 		uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
 		comp = dsl_dataset_phys(ds)->ds_compressed_bytes;
 	}
 
 	err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp,
 	    stream_compressed, sizep);
 	/*
 	 * Add the size of the BEGIN and END records to the estimate.
 	 */
 	*sizep += 2 * sizeof (dmu_replay_record_t);
 
 out:
 	if (ds != origds)
 		dsl_dataset_rele(ds, FTAG);
 	return (err);
 }
 
 ZFS_MODULE_PARAM(zfs_send, zfs_send_, corrupt_data, INT, ZMOD_RW,
 	"Allow sending corrupt data");
 
 ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_length, UINT, ZMOD_RW,
 	"Maximum send queue length");
 
 ZFS_MODULE_PARAM(zfs_send, zfs_send_, unmodified_spill_blocks, INT, ZMOD_RW,
 	"Send unmodified spill blocks");
 
 ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_length, UINT, ZMOD_RW,
 	"Maximum send queue length for non-prefetch queues");
 
 ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_ff, UINT, ZMOD_RW,
 	"Send queue fill fraction");
 
 ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_ff, UINT, ZMOD_RW,
 	"Send queue fill fraction for non-prefetch queues");
 
 ZFS_MODULE_PARAM(zfs_send, zfs_, override_estimate_recordsize, UINT, ZMOD_RW,
 	"Override block size estimate with fixed size");
diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c
index 809f7f6165f9..15cc2885e805 100644
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -1,822 +1,823 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_pool.h>
 #include <sys/dnode.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/dmu_impl.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/callb.h>
 #include <sys/zfeature.h>
 
 static int32_t zfs_pd_bytes_max = 50 * 1024 * 1024;	/* 50MB */
 static int32_t send_holes_without_birth_time = 1;
 static uint_t zfs_traverse_indirect_prefetch_limit = 32;
 
 typedef struct prefetch_data {
 	kmutex_t pd_mtx;
 	kcondvar_t pd_cv;
 	int32_t pd_bytes_fetched;
 	int pd_flags;
 	boolean_t pd_cancel;
 	boolean_t pd_exited;
 	zbookmark_phys_t pd_resume;
 } prefetch_data_t;
 
 typedef struct traverse_data {
 	spa_t *td_spa;
 	uint64_t td_objset;
 	blkptr_t *td_rootbp;
 	uint64_t td_min_txg;
 	zbookmark_phys_t *td_resume;
 	int td_flags;
 	prefetch_data_t *td_pfd;
 	boolean_t td_paused;
 	uint64_t td_hole_birth_enabled_txg;
 	blkptr_cb_t *td_func;
 	void *td_arg;
 	boolean_t td_realloc_possible;
 } traverse_data_t;
 
 static int traverse_dnode(traverse_data_t *td, const blkptr_t *bp,
     const dnode_phys_t *dnp, uint64_t objset, uint64_t object);
 static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
     uint64_t objset, uint64_t object);
 
 static int
 traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
     uint64_t claim_txg)
 {
 	traverse_data_t *td = arg;
 	zbookmark_phys_t zb;
 
 	if (BP_IS_HOLE(bp))
 		return (0);
 
-	if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa))
+	if (claim_txg == 0 &&
+	    BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(td->td_spa))
 		return (-1);
 
 	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
 	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
 	(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);
 
 	return (0);
 }
 
 static int
 traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
     uint64_t claim_txg)
 {
 	traverse_data_t *td = arg;
 
 	if (lrc->lrc_txtype == TX_WRITE) {
 		lr_write_t *lr = (lr_write_t *)lrc;
 		blkptr_t *bp = &lr->lr_blkptr;
 		zbookmark_phys_t zb;
 
 		if (BP_IS_HOLE(bp))
 			return (0);
 
-		if (claim_txg == 0 || bp->blk_birth < claim_txg)
+		if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg)
 			return (0);
 
 		ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
 		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
 		    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
 
 		(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
 		    td->td_arg);
 	}
 	return (0);
 }
 
 static void
 traverse_zil(traverse_data_t *td, zil_header_t *zh)
 {
 	uint64_t claim_txg = zh->zh_claim_txg;
 
 	/*
 	 * We only want to visit blocks that have been claimed but not yet
 	 * replayed; plus blocks that are already stable in read-only mode.
 	 */
 	if (claim_txg == 0 && spa_writeable(td->td_spa))
 		return;
 
 	zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
 	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
 	    claim_txg, !(td->td_flags & TRAVERSE_NO_DECRYPT));
 	zil_free(zilog);
 }
 
 typedef enum resume_skip {
 	RESUME_SKIP_ALL,
 	RESUME_SKIP_NONE,
 	RESUME_SKIP_CHILDREN
 } resume_skip_t;
 
 /*
  * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
  * the block indicated by zb does not need to be visited at all. Returns
  * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
  * resume point. This indicates that this block should be visited but not its
  * children (since they must have been visited in a previous traversal).
  * Otherwise returns RESUME_SKIP_NONE.
  */
 static resume_skip_t
 resume_skip_check(const traverse_data_t *td, const dnode_phys_t *dnp,
     const zbookmark_phys_t *zb)
 {
 	if (td->td_resume != NULL) {
 		/*
 		 * If we already visited this bp & everything below,
 		 * don't bother doing it again.
 		 */
 		if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
 			return (RESUME_SKIP_ALL);
 
 		if (memcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
 			if (td->td_flags & TRAVERSE_POST)
 				return (RESUME_SKIP_CHILDREN);
 		}
 	}
 	return (RESUME_SKIP_NONE);
 }
 
 /*
  * Returns B_TRUE, if prefetch read is issued, otherwise B_FALSE.
  */
 static boolean_t
 traverse_prefetch_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
     const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
 	    ARC_FLAG_PRESCIENT_PREFETCH;
 	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
 
 	if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
 		return (B_FALSE);
 	/*
 	 * If this bp is before the resume point, it may have already been
 	 * freed.
 	 */
 	if (resume_skip_check(td, dnp, zb) != RESUME_SKIP_NONE)
 		return (B_FALSE);
-	if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
+	if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg)
 		return (B_FALSE);
 	if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
 		return (B_FALSE);
 	ASSERT(!BP_IS_REDACTED(bp));
 
 	if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
 		zio_flags |= ZIO_FLAG_RAW;
 
 	(void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
 	    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 	return (B_TRUE);
 }
 
 static boolean_t
 prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
 {
 	ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
 	    BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG || BP_IS_REDACTED(bp))
 		return (B_FALSE);
 	return (B_TRUE);
 }
 
 static int
 traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
     const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	int err = 0;
 	arc_buf_t *buf = NULL;
 	prefetch_data_t *pd = td->td_pfd;
 
 	switch (resume_skip_check(td, dnp, zb)) {
 	case RESUME_SKIP_ALL:
 		return (0);
 	case RESUME_SKIP_CHILDREN:
 		goto post;
 	case RESUME_SKIP_NONE:
 		break;
 	default:
 		ASSERT(0);
 	}
 
-	if (bp->blk_birth == 0) {
+	if (BP_GET_LOGICAL_BIRTH(bp) == 0) {
 		/*
 		 * Since this block has a birth time of 0 it must be one of
 		 * two things: a hole created before the
 		 * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole
 		 * which has always been a hole in an object.
 		 *
 		 * If a file is written sparsely, then the unwritten parts of
 		 * the file were "always holes" -- that is, they have been
 		 * holes since this object was allocated.  However, we (and
 		 * our callers) can not necessarily tell when an object was
 		 * allocated.  Therefore, if it's possible that this object
 		 * was freed and then its object number reused, we need to
 		 * visit all the holes with birth==0.
 		 *
 		 * If it isn't possible that the object number was reused,
 		 * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote
 		 * all the blocks we will visit as part of this traversal,
 		 * then this hole must have always existed, so we can skip
 		 * it.  We visit blocks born after (exclusive) td_min_txg.
 		 *
 		 * Note that the meta-dnode cannot be reallocated.
 		 */
 		if (!send_holes_without_birth_time &&
 		    (!td->td_realloc_possible ||
 		    zb->zb_object == DMU_META_DNODE_OBJECT) &&
 		    td->td_hole_birth_enabled_txg <= td->td_min_txg)
 			return (0);
-	} else if (bp->blk_birth <= td->td_min_txg) {
+	} else if (BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg) {
 		return (0);
 	}
 
 	if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
 		uint64_t size = BP_GET_LSIZE(bp);
 		mutex_enter(&pd->pd_mtx);
 		ASSERT(pd->pd_bytes_fetched >= 0);
 		while (pd->pd_bytes_fetched < size && !pd->pd_exited)
 			cv_wait_sig(&pd->pd_cv, &pd->pd_mtx);
 		pd->pd_bytes_fetched -= size;
 		cv_broadcast(&pd->pd_cv);
 		mutex_exit(&pd->pd_mtx);
 	}
 
 	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
 		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
 		if (err != 0)
 			goto post;
 		return (0);
 	}
 
 	if (td->td_flags & TRAVERSE_PRE) {
 		err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
 		    td->td_arg);
 		if (err == TRAVERSE_VISIT_NO_CHILDREN)
 			return (0);
 		if (err != 0)
 			goto post;
 	}
 
 	if (BP_GET_LEVEL(bp) > 0) {
 		uint32_t flags = ARC_FLAG_WAIT;
 		int32_t i, ptidx, pidx;
 		uint32_t prefetchlimit;
 		int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 		zbookmark_phys_t *czb;
 
 		ASSERT(!BP_IS_PROTECTED(bp));
 
 		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err != 0)
 			goto post;
 
 		czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);
 
 		/*
 		 * When performing a traversal it is beneficial to
 		 * asynchronously read-ahead the upcoming indirect
 		 * blocks since they will be needed shortly. However,
 		 * since a 128k indirect (non-L0) block may contain up
 		 * to 1024 128-byte block pointers, its preferable to not
 		 * prefetch them all at once. Issuing a large number of
 		 * async reads may effect performance, and the earlier
 		 * the indirect blocks are prefetched the less likely
 		 * they are to still be resident in the ARC when needed.
 		 * Therefore, prefetching indirect blocks is limited to
 		 * zfs_traverse_indirect_prefetch_limit=32 blocks by
 		 * default.
 		 *
 		 * pidx: Index for which next prefetch to be issued.
 		 * ptidx: Index at which next prefetch to be triggered.
 		 */
 		ptidx = 0;
 		pidx = 1;
 		prefetchlimit = zfs_traverse_indirect_prefetch_limit;
 		for (i = 0; i < epb; i++) {
 			if (prefetchlimit && i == ptidx) {
 				ASSERT3S(ptidx, <=, pidx);
 				for (uint32_t  prefetched = 0; pidx < epb &&
 				    prefetched < prefetchlimit; pidx++) {
 					SET_BOOKMARK(czb, zb->zb_objset,
 					    zb->zb_object, zb->zb_level - 1,
 					    zb->zb_blkid * epb + pidx);
 					if (traverse_prefetch_metadata(td, dnp,
 					    &((blkptr_t *)buf->b_data)[pidx],
 					    czb) == B_TRUE) {
 						prefetched++;
 						if (prefetched ==
 						    MAX(prefetchlimit / 2, 1))
 							ptidx = pidx;
 					}
 				}
 			}
 
 			/* recursively visitbp() blocks below this */
 			SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object,
 			    zb->zb_level - 1,
 			    zb->zb_blkid * epb + i);
 			err = traverse_visitbp(td, dnp,
 			    &((blkptr_t *)buf->b_data)[i], czb);
 			if (err != 0)
 				break;
 		}
 
 		kmem_free(czb, sizeof (zbookmark_phys_t));
 
 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
 		uint32_t flags = ARC_FLAG_WAIT;
 		uint32_t zio_flags = ZIO_FLAG_CANFAIL;
 		int32_t i;
 		int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 		dnode_phys_t *child_dnp;
 
 		/*
 		 * dnode blocks might have their bonus buffers encrypted, so
 		 * we must be careful to honor TRAVERSE_NO_DECRYPT
 		 */
 		if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
 			zio_flags |= ZIO_FLAG_RAW;
 
 		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 		if (err != 0)
 			goto post;
 
 		child_dnp = buf->b_data;
 
 		for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
 			prefetch_dnode_metadata(td, &child_dnp[i],
 			    zb->zb_objset, zb->zb_blkid * epb + i);
 		}
 
 		/* recursively visitbp() blocks below this */
 		for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
 			err = traverse_dnode(td, bp, &child_dnp[i],
 			    zb->zb_objset, zb->zb_blkid * epb + i);
 			if (err != 0)
 				break;
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 		uint32_t zio_flags = ZIO_FLAG_CANFAIL;
 		arc_flags_t flags = ARC_FLAG_WAIT;
 		objset_phys_t *osp;
 
 		if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
 			zio_flags |= ZIO_FLAG_RAW;
 
 		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 		if (err != 0)
 			goto post;
 
 		osp = buf->b_data;
 		prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset,
 		    DMU_META_DNODE_OBJECT);
 		/*
 		 * See the block comment above for the goal of this variable.
 		 * If the maxblkid of the meta-dnode is 0, then we know that
 		 * we've never had more than DNODES_PER_BLOCK objects in the
 		 * dataset, which means we can't have reused any object ids.
 		 */
 		if (osp->os_meta_dnode.dn_maxblkid == 0)
 			td->td_realloc_possible = B_FALSE;
 
 		if (OBJSET_BUF_HAS_USERUSED(buf)) {
 			if (OBJSET_BUF_HAS_PROJECTUSED(buf))
 				prefetch_dnode_metadata(td,
 				    &osp->os_projectused_dnode,
 				    zb->zb_objset, DMU_PROJECTUSED_OBJECT);
 			prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
 			    zb->zb_objset, DMU_GROUPUSED_OBJECT);
 			prefetch_dnode_metadata(td, &osp->os_userused_dnode,
 			    zb->zb_objset, DMU_USERUSED_OBJECT);
 		}
 
 		err = traverse_dnode(td, bp, &osp->os_meta_dnode, zb->zb_objset,
 		    DMU_META_DNODE_OBJECT);
 		if (err == 0 && OBJSET_BUF_HAS_USERUSED(buf)) {
 			if (OBJSET_BUF_HAS_PROJECTUSED(buf))
 				err = traverse_dnode(td, bp,
 				    &osp->os_projectused_dnode, zb->zb_objset,
 				    DMU_PROJECTUSED_OBJECT);
 			if (err == 0)
 				err = traverse_dnode(td, bp,
 				    &osp->os_groupused_dnode, zb->zb_objset,
 				    DMU_GROUPUSED_OBJECT);
 			if (err == 0)
 				err = traverse_dnode(td, bp,
 				    &osp->os_userused_dnode, zb->zb_objset,
 				    DMU_USERUSED_OBJECT);
 		}
 	}
 
 	if (buf)
 		arc_buf_destroy(buf, &buf);
 
 post:
 	if (err == 0 && (td->td_flags & TRAVERSE_POST))
 		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
 
 	if ((td->td_flags & TRAVERSE_HARD) && (err == EIO || err == ECKSUM)) {
 		/*
 		 * Ignore this disk error as requested by the HARD flag,
 		 * and continue traversal.
 		 */
 		err = 0;
 	}
 
 	/*
 	 * If we are stopping here, set td_resume.
 	 */
 	if (td->td_resume != NULL && err != 0 && !td->td_paused) {
 		td->td_resume->zb_objset = zb->zb_objset;
 		td->td_resume->zb_object = zb->zb_object;
 		td->td_resume->zb_level = 0;
 		/*
 		 * If we have stopped on an indirect block (e.g. due to
 		 * i/o error), we have not visited anything below it.
 		 * Set the bookmark to the first level-0 block that we need
 		 * to visit.  This way, the resuming code does not need to
 		 * deal with resuming from indirect blocks.
 		 *
 		 * Note, if zb_level <= 0, dnp may be NULL, so we don't want
 		 * to dereference it.
 		 */
 		td->td_resume->zb_blkid = zb->zb_blkid;
 		if (zb->zb_level > 0) {
 			td->td_resume->zb_blkid <<= zb->zb_level *
 			    (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
 		}
 		td->td_paused = B_TRUE;
 	}
 
 	return (err);
 }
 
 static void
 prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
     uint64_t objset, uint64_t object)
 {
 	int j;
 	zbookmark_phys_t czb;
 
 	for (j = 0; j < dnp->dn_nblkptr; j++) {
 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
 		traverse_prefetch_metadata(td, dnp, &dnp->dn_blkptr[j], &czb);
 	}
 
 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
 		traverse_prefetch_metadata(td, dnp, DN_SPILL_BLKPTR(dnp), &czb);
 	}
 }
 
 static int
 traverse_dnode(traverse_data_t *td, const blkptr_t *bp, const dnode_phys_t *dnp,
     uint64_t objset, uint64_t object)
 {
 	int j, err = 0;
 	zbookmark_phys_t czb;
 
 	if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL &&
 	    object < td->td_resume->zb_object)
 		return (0);
 
 	if (td->td_flags & TRAVERSE_PRE) {
 		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
 		    ZB_DNODE_BLKID);
 		err = td->td_func(td->td_spa, NULL, bp, &czb, dnp,
 		    td->td_arg);
 		if (err == TRAVERSE_VISIT_NO_CHILDREN)
 			return (0);
 		if (err != 0)
 			return (err);
 	}
 
 	for (j = 0; j < dnp->dn_nblkptr; j++) {
 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
 		err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
 		if (err != 0)
 			break;
 	}
 
 	if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
 		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
 		err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &czb);
 	}
 
 	if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
 		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
 		    ZB_DNODE_BLKID);
 		err = td->td_func(td->td_spa, NULL, bp, &czb, dnp,
 		    td->td_arg);
 		if (err == TRAVERSE_VISIT_NO_CHILDREN)
 			return (0);
 		if (err != 0)
 			return (err);
 	}
 	return (err);
 }
 
 static int
 traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	(void) zilog, (void) dnp;
 	prefetch_data_t *pfd = arg;
 	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
 	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
 	    ARC_FLAG_PRESCIENT_PREFETCH;
 
 	ASSERT(pfd->pd_bytes_fetched >= 0);
 	if (zb->zb_level == ZB_DNODE_LEVEL)
 		return (0);
 	if (pfd->pd_cancel)
 		return (SET_ERROR(EINTR));
 
 	if (!prefetch_needed(pfd, bp))
 		return (0);
 
 	mutex_enter(&pfd->pd_mtx);
 	while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max)
 		cv_wait_sig(&pfd->pd_cv, &pfd->pd_mtx);
 	pfd->pd_bytes_fetched += BP_GET_LSIZE(bp);
 	cv_broadcast(&pfd->pd_cv);
 	mutex_exit(&pfd->pd_mtx);
 
 	if ((pfd->pd_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
 		zio_flags |= ZIO_FLAG_RAW;
 
 	(void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
 	    zio_flags, &aflags, zb);
 
 	return (0);
 }
 
 static void
 traverse_prefetch_thread(void *arg)
 {
 	traverse_data_t *td_main = arg;
 	traverse_data_t td = *td_main;
 	zbookmark_phys_t czb;
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	td.td_func = traverse_prefetcher;
 	td.td_arg = td_main->td_pfd;
 	td.td_pfd = NULL;
 	td.td_resume = &td_main->td_pfd->pd_resume;
 
 	SET_BOOKMARK(&czb, td.td_objset,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 	(void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);
 
 	mutex_enter(&td_main->td_pfd->pd_mtx);
 	td_main->td_pfd->pd_exited = B_TRUE;
 	cv_broadcast(&td_main->td_pfd->pd_cv);
 	mutex_exit(&td_main->td_pfd->pd_mtx);
 	spl_fstrans_unmark(cookie);
 }
 
 /*
  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
  * in syncing context).
  */
 static int
 traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
     uint64_t txg_start, zbookmark_phys_t *resume, int flags,
     blkptr_cb_t func, void *arg)
 {
 	traverse_data_t *td;
 	prefetch_data_t *pd;
 	zbookmark_phys_t *czb;
 	int err;
 
 	ASSERT(ds == NULL || objset == ds->ds_object);
 	ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
 
 	td = kmem_alloc(sizeof (traverse_data_t), KM_SLEEP);
 	pd = kmem_zalloc(sizeof (prefetch_data_t), KM_SLEEP);
 	czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);
 
 	td->td_spa = spa;
 	td->td_objset = objset;
 	td->td_rootbp = rootbp;
 	td->td_min_txg = txg_start;
 	td->td_resume = resume;
 	td->td_func = func;
 	td->td_arg = arg;
 	td->td_pfd = pd;
 	td->td_flags = flags;
 	td->td_paused = B_FALSE;
 	td->td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE);
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
 		VERIFY(spa_feature_enabled_txg(spa,
 		    SPA_FEATURE_HOLE_BIRTH, &td->td_hole_birth_enabled_txg));
 	} else {
 		td->td_hole_birth_enabled_txg = UINT64_MAX;
 	}
 
 	pd->pd_flags = flags;
 	if (resume != NULL)
 		pd->pd_resume = *resume;
 	mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL);
 
 	SET_BOOKMARK(czb, td->td_objset,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 
 	/* See comment on ZIL traversal in dsl_scan_visitds. */
 	if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
 		zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
 		uint32_t flags = ARC_FLAG_WAIT;
 		objset_phys_t *osp;
 		arc_buf_t *buf;
 		ASSERT(!BP_IS_REDACTED(rootbp));
 
 		if ((td->td_flags & TRAVERSE_NO_DECRYPT) &&
 		    BP_IS_PROTECTED(rootbp))
 			zio_flags |= ZIO_FLAG_RAW;
 
 		err = arc_read(NULL, td->td_spa, rootbp, arc_getbuf_func,
 		    &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, czb);
 		if (err != 0) {
 			/*
 			 * If both TRAVERSE_HARD and TRAVERSE_PRE are set,
 			 * continue to visitbp so that td_func can be called
 			 * in pre stage, and err will reset to zero.
 			 */
 			if (!(td->td_flags & TRAVERSE_HARD) ||
 			    !(td->td_flags & TRAVERSE_PRE))
 				goto out;
 		} else {
 			osp = buf->b_data;
 			traverse_zil(td, &osp->os_zil_header);
 			arc_buf_destroy(buf, &buf);
 		}
 	}
 
 	if (!(flags & TRAVERSE_PREFETCH_DATA) ||
 	    taskq_dispatch(spa->spa_prefetch_taskq, traverse_prefetch_thread,
 	    td, TQ_NOQUEUE) == TASKQID_INVALID)
 		pd->pd_exited = B_TRUE;
 
 	err = traverse_visitbp(td, NULL, rootbp, czb);
 
 	mutex_enter(&pd->pd_mtx);
 	pd->pd_cancel = B_TRUE;
 	cv_broadcast(&pd->pd_cv);
 	while (!pd->pd_exited)
 		cv_wait_sig(&pd->pd_cv, &pd->pd_mtx);
 	mutex_exit(&pd->pd_mtx);
 out:
 	mutex_destroy(&pd->pd_mtx);
 	cv_destroy(&pd->pd_cv);
 
 	kmem_free(czb, sizeof (zbookmark_phys_t));
 	kmem_free(pd, sizeof (struct prefetch_data));
 	kmem_free(td, sizeof (struct traverse_data));
 
 	return (err);
 }
 
 /*
  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
  * in syncing context).
  */
 int
 traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
     zbookmark_phys_t *resume,
     int flags, blkptr_cb_t func, void *arg)
 {
 	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
 	    &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg));
 }
 
 int
 traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
     int flags, blkptr_cb_t func, void *arg)
 {
 	return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg));
 }
 
 int
 traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
     uint64_t txg_start, zbookmark_phys_t *resume, int flags,
     blkptr_cb_t func, void *arg)
 {
 	return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
 	    blkptr, txg_start, resume, flags, func, arg));
 }
 
 /*
  * NB: pool must not be changing on-disk (eg, from zdb or sync context).
  */
 int
 traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
     blkptr_cb_t func, void *arg)
 {
 	int err;
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	objset_t *mos = dp->dp_meta_objset;
 	boolean_t hard = (flags & TRAVERSE_HARD);
 
 	/* visit the MOS */
 	err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
 	    txg_start, NULL, flags, func, arg);
 	if (err != 0)
 		return (err);
 
 	/* visit each dataset */
 	for (uint64_t obj = 1; err == 0;
 	    err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) {
 		dmu_object_info_t doi;
 
 		err = dmu_object_info(mos, obj, &doi);
 		if (err != 0) {
 			if (hard)
 				continue;
 			break;
 		}
 
 		if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
 			dsl_dataset_t *ds;
 			uint64_t txg = txg_start;
 
 			dsl_pool_config_enter(dp, FTAG);
 			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
 			dsl_pool_config_exit(dp, FTAG);
 			if (err != 0) {
 				if (hard)
 					continue;
 				break;
 			}
 			if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
 				txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 			err = traverse_dataset(ds, txg, flags, func, arg);
 			dsl_dataset_rele(ds, FTAG);
 			if (err != 0)
 				break;
 		}
 	}
 	if (err == ESRCH)
 		err = 0;
 	return (err);
 }
 
 EXPORT_SYMBOL(traverse_dataset);
 EXPORT_SYMBOL(traverse_pool);
 
 ZFS_MODULE_PARAM(zfs, zfs_, pd_bytes_max, INT, ZMOD_RW,
 	"Max number of bytes to prefetch");
 
 ZFS_MODULE_PARAM(zfs, zfs_, traverse_indirect_prefetch_limit, UINT, ZMOD_RW,
 	"Traverse prefetch number of blocks pointed by indirect block");
 
 #if defined(_KERNEL)
 module_param_named(ignore_hole_birth, send_holes_without_birth_time, int, 0644);
 MODULE_PARM_DESC(ignore_hole_birth,
 	"Alias for send_holes_without_birth_time");
 #endif
 
 /* CSTYLED */
 ZFS_MODULE_PARAM(zfs, , send_holes_without_birth_time, INT, ZMOD_RW,
 	"Ignore hole_birth txg for zfs send");
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index ba28aa06a91f..a703fd414f87 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1,2735 +1,2735 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_zfetch.h>
 #include <sys/range_tree.h>
 #include <sys/trace_zfs.h>
 #include <sys/zfs_project.h>
 
 dnode_stats_t dnode_stats = {
 	{ "dnode_hold_dbuf_hold",		KSTAT_DATA_UINT64 },
 	{ "dnode_hold_dbuf_read",		KSTAT_DATA_UINT64 },
 	{ "dnode_hold_alloc_hits",		KSTAT_DATA_UINT64 },
 	{ "dnode_hold_alloc_misses",		KSTAT_DATA_UINT64 },
 	{ "dnode_hold_alloc_interior",		KSTAT_DATA_UINT64 },
 	{ "dnode_hold_alloc_lock_retry",	KSTAT_DATA_UINT64 },
 	{ "dnode_hold_alloc_lock_misses",	KSTAT_DATA_UINT64 },
 	{ "dnode_hold_alloc_type_none",		KSTAT_DATA_UINT64 },
 	{ "dnode_hold_free_hits",		KSTAT_DATA_UINT64 },
 	{ "dnode_hold_free_misses",		KSTAT_DATA_UINT64 },
 	{ "dnode_hold_free_lock_misses",	KSTAT_DATA_UINT64 },
 	{ "dnode_hold_free_lock_retry",		KSTAT_DATA_UINT64 },
 	{ "dnode_hold_free_overflow",		KSTAT_DATA_UINT64 },
 	{ "dnode_hold_free_refcount",		KSTAT_DATA_UINT64 },
 	{ "dnode_free_interior_lock_retry",	KSTAT_DATA_UINT64 },
 	{ "dnode_allocate",			KSTAT_DATA_UINT64 },
 	{ "dnode_reallocate",			KSTAT_DATA_UINT64 },
 	{ "dnode_buf_evict",			KSTAT_DATA_UINT64 },
 	{ "dnode_alloc_next_chunk",		KSTAT_DATA_UINT64 },
 	{ "dnode_alloc_race",			KSTAT_DATA_UINT64 },
 	{ "dnode_alloc_next_block",		KSTAT_DATA_UINT64 },
 	{ "dnode_move_invalid",			KSTAT_DATA_UINT64 },
 	{ "dnode_move_recheck1",		KSTAT_DATA_UINT64 },
 	{ "dnode_move_recheck2",		KSTAT_DATA_UINT64 },
 	{ "dnode_move_special",			KSTAT_DATA_UINT64 },
 	{ "dnode_move_handle",			KSTAT_DATA_UINT64 },
 	{ "dnode_move_rwlock",			KSTAT_DATA_UINT64 },
 	{ "dnode_move_active",			KSTAT_DATA_UINT64 },
 };
 
 dnode_sums_t dnode_sums;
 
 static kstat_t *dnode_ksp;
 static kmem_cache_t *dnode_cache;
 
 static dnode_phys_t dnode_phys_zero __maybe_unused;
 
 int zfs_default_bs = SPA_MINBLOCKSHIFT;
 int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
 
 #ifdef	_KERNEL
 static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
 #endif /* _KERNEL */
 
 static int
 dbuf_compare(const void *x1, const void *x2)
 {
 	const dmu_buf_impl_t *d1 = x1;
 	const dmu_buf_impl_t *d2 = x2;
 
 	int cmp = TREE_CMP(d1->db_level, d2->db_level);
 	if (likely(cmp))
 		return (cmp);
 
 	cmp = TREE_CMP(d1->db_blkid, d2->db_blkid);
 	if (likely(cmp))
 		return (cmp);
 
 	if (d1->db_state == DB_MARKER) {
 		ASSERT3S(d2->db_state, !=, DB_MARKER);
 		return (TREE_PCMP(d1->db_parent, d2));
 	} else if (d2->db_state == DB_MARKER) {
 		ASSERT3S(d1->db_state, !=, DB_MARKER);
 		return (TREE_PCMP(d1, d2->db_parent));
 	}
 
 	if (d1->db_state == DB_SEARCH) {
 		ASSERT3S(d2->db_state, !=, DB_SEARCH);
 		return (-1);
 	} else if (d2->db_state == DB_SEARCH) {
 		ASSERT3S(d1->db_state, !=, DB_SEARCH);
 		return (1);
 	}
 
 	return (TREE_PCMP(d1, d2));
 }
 
 static int
 dnode_cons(void *arg, void *unused, int kmflag)
 {
 	(void) unused, (void) kmflag;
 	dnode_t *dn = arg;
 
 	rw_init(&dn->dn_struct_rwlock, NULL, RW_NOLOCKDEP, NULL);
 	mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
 	cv_init(&dn->dn_nodnholds, NULL, CV_DEFAULT, NULL);
 
 	/*
 	 * Every dbuf has a reference, and dropping a tracked reference is
 	 * O(number of references), so don't track dn_holds.
 	 */
 	zfs_refcount_create_untracked(&dn->dn_holds);
 	zfs_refcount_create(&dn->dn_tx_holds);
 	list_link_init(&dn->dn_link);
 
 	memset(dn->dn_next_type, 0, sizeof (dn->dn_next_type));
 	memset(dn->dn_next_nblkptr, 0, sizeof (dn->dn_next_nblkptr));
 	memset(dn->dn_next_nlevels, 0, sizeof (dn->dn_next_nlevels));
 	memset(dn->dn_next_indblkshift, 0, sizeof (dn->dn_next_indblkshift));
 	memset(dn->dn_next_bonustype, 0, sizeof (dn->dn_next_bonustype));
 	memset(dn->dn_rm_spillblk, 0, sizeof (dn->dn_rm_spillblk));
 	memset(dn->dn_next_bonuslen, 0, sizeof (dn->dn_next_bonuslen));
 	memset(dn->dn_next_blksz, 0, sizeof (dn->dn_next_blksz));
 	memset(dn->dn_next_maxblkid, 0, sizeof (dn->dn_next_maxblkid));
 
 	for (int i = 0; i < TXG_SIZE; i++) {
 		multilist_link_init(&dn->dn_dirty_link[i]);
 		dn->dn_free_ranges[i] = NULL;
 		list_create(&dn->dn_dirty_records[i],
 		    sizeof (dbuf_dirty_record_t),
 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
 	}
 
 	dn->dn_allocated_txg = 0;
 	dn->dn_free_txg = 0;
 	dn->dn_assigned_txg = 0;
 	dn->dn_dirty_txg = 0;
 	dn->dn_dirtyctx = 0;
 	dn->dn_dirtyctx_firstset = NULL;
 	dn->dn_bonus = NULL;
 	dn->dn_have_spill = B_FALSE;
 	dn->dn_zio = NULL;
 	dn->dn_oldused = 0;
 	dn->dn_oldflags = 0;
 	dn->dn_olduid = 0;
 	dn->dn_oldgid = 0;
 	dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
 	dn->dn_newuid = 0;
 	dn->dn_newgid = 0;
 	dn->dn_newprojid = ZFS_DEFAULT_PROJID;
 	dn->dn_id_flags = 0;
 
 	dn->dn_dbufs_count = 0;
 	avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
 	    offsetof(dmu_buf_impl_t, db_link));
 
 	dn->dn_moved = 0;
 	return (0);
 }
 
 static void
 dnode_dest(void *arg, void *unused)
 {
 	(void) unused;
 	dnode_t *dn = arg;
 
 	rw_destroy(&dn->dn_struct_rwlock);
 	mutex_destroy(&dn->dn_mtx);
 	mutex_destroy(&dn->dn_dbufs_mtx);
 	cv_destroy(&dn->dn_notxholds);
 	cv_destroy(&dn->dn_nodnholds);
 	zfs_refcount_destroy(&dn->dn_holds);
 	zfs_refcount_destroy(&dn->dn_tx_holds);
 	ASSERT(!list_link_active(&dn->dn_link));
 
 	for (int i = 0; i < TXG_SIZE; i++) {
 		ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
 		ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
 		list_destroy(&dn->dn_dirty_records[i]);
 		ASSERT0(dn->dn_next_nblkptr[i]);
 		ASSERT0(dn->dn_next_nlevels[i]);
 		ASSERT0(dn->dn_next_indblkshift[i]);
 		ASSERT0(dn->dn_next_bonustype[i]);
 		ASSERT0(dn->dn_rm_spillblk[i]);
 		ASSERT0(dn->dn_next_bonuslen[i]);
 		ASSERT0(dn->dn_next_blksz[i]);
 		ASSERT0(dn->dn_next_maxblkid[i]);
 	}
 
 	ASSERT0(dn->dn_allocated_txg);
 	ASSERT0(dn->dn_free_txg);
 	ASSERT0(dn->dn_assigned_txg);
 	ASSERT0(dn->dn_dirty_txg);
 	ASSERT0(dn->dn_dirtyctx);
 	ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
 	ASSERT3P(dn->dn_bonus, ==, NULL);
 	ASSERT(!dn->dn_have_spill);
 	ASSERT3P(dn->dn_zio, ==, NULL);
 	ASSERT0(dn->dn_oldused);
 	ASSERT0(dn->dn_oldflags);
 	ASSERT0(dn->dn_olduid);
 	ASSERT0(dn->dn_oldgid);
 	ASSERT0(dn->dn_oldprojid);
 	ASSERT0(dn->dn_newuid);
 	ASSERT0(dn->dn_newgid);
 	ASSERT0(dn->dn_newprojid);
 	ASSERT0(dn->dn_id_flags);
 
 	ASSERT0(dn->dn_dbufs_count);
 	avl_destroy(&dn->dn_dbufs);
 }
 
 static int
 dnode_kstats_update(kstat_t *ksp, int rw)
 {
 	dnode_stats_t *ds = ksp->ks_data;
 
 	if (rw == KSTAT_WRITE)
 		return (EACCES);
 	ds->dnode_hold_dbuf_hold.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_hold_dbuf_hold);
 	ds->dnode_hold_dbuf_read.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_hold_dbuf_read);
 	ds->dnode_hold_alloc_hits.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_hold_alloc_hits);
 	ds->dnode_hold_alloc_misses.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_hold_alloc_misses);
 	ds->dnode_hold_alloc_interior.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_hold_alloc_interior);
 	ds->dnode_hold_alloc_lock_retry.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_hold_alloc_lock_retry);
 	ds->dnode_hold_alloc_lock_misses.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_hold_alloc_lock_misses);
 	ds->dnode_hold_alloc_type_none.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_hold_alloc_type_none);
 	ds->dnode_hold_free_hits.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_hold_free_hits);
 	ds->dnode_hold_free_misses.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_hold_free_misses);
 	ds->dnode_hold_free_lock_misses.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_hold_free_lock_misses);
 	ds->dnode_hold_free_lock_retry.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_hold_free_lock_retry);
 	ds->dnode_hold_free_refcount.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_hold_free_refcount);
 	ds->dnode_hold_free_overflow.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_hold_free_overflow);
 	ds->dnode_free_interior_lock_retry.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_free_interior_lock_retry);
 	ds->dnode_allocate.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_allocate);
 	ds->dnode_reallocate.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_reallocate);
 	ds->dnode_buf_evict.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_buf_evict);
 	ds->dnode_alloc_next_chunk.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_alloc_next_chunk);
 	ds->dnode_alloc_race.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_alloc_race);
 	ds->dnode_alloc_next_block.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_alloc_next_block);
 	ds->dnode_move_invalid.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_move_invalid);
 	ds->dnode_move_recheck1.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_move_recheck1);
 	ds->dnode_move_recheck2.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_move_recheck2);
 	ds->dnode_move_special.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_move_special);
 	ds->dnode_move_handle.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_move_handle);
 	ds->dnode_move_rwlock.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_move_rwlock);
 	ds->dnode_move_active.value.ui64 =
 	    wmsum_value(&dnode_sums.dnode_move_active);
 	return (0);
 }
 
 void
 dnode_init(void)
 {
 	ASSERT(dnode_cache == NULL);
 	dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t),
 	    0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
 	kmem_cache_set_move(dnode_cache, dnode_move);
 
 	wmsum_init(&dnode_sums.dnode_hold_dbuf_hold, 0);
 	wmsum_init(&dnode_sums.dnode_hold_dbuf_read, 0);
 	wmsum_init(&dnode_sums.dnode_hold_alloc_hits, 0);
 	wmsum_init(&dnode_sums.dnode_hold_alloc_misses, 0);
 	wmsum_init(&dnode_sums.dnode_hold_alloc_interior, 0);
 	wmsum_init(&dnode_sums.dnode_hold_alloc_lock_retry, 0);
 	wmsum_init(&dnode_sums.dnode_hold_alloc_lock_misses, 0);
 	wmsum_init(&dnode_sums.dnode_hold_alloc_type_none, 0);
 	wmsum_init(&dnode_sums.dnode_hold_free_hits, 0);
 	wmsum_init(&dnode_sums.dnode_hold_free_misses, 0);
 	wmsum_init(&dnode_sums.dnode_hold_free_lock_misses, 0);
 	wmsum_init(&dnode_sums.dnode_hold_free_lock_retry, 0);
 	wmsum_init(&dnode_sums.dnode_hold_free_refcount, 0);
 	wmsum_init(&dnode_sums.dnode_hold_free_overflow, 0);
 	wmsum_init(&dnode_sums.dnode_free_interior_lock_retry, 0);
 	wmsum_init(&dnode_sums.dnode_allocate, 0);
 	wmsum_init(&dnode_sums.dnode_reallocate, 0);
 	wmsum_init(&dnode_sums.dnode_buf_evict, 0);
 	wmsum_init(&dnode_sums.dnode_alloc_next_chunk, 0);
 	wmsum_init(&dnode_sums.dnode_alloc_race, 0);
 	wmsum_init(&dnode_sums.dnode_alloc_next_block, 0);
 	wmsum_init(&dnode_sums.dnode_move_invalid, 0);
 	wmsum_init(&dnode_sums.dnode_move_recheck1, 0);
 	wmsum_init(&dnode_sums.dnode_move_recheck2, 0);
 	wmsum_init(&dnode_sums.dnode_move_special, 0);
 	wmsum_init(&dnode_sums.dnode_move_handle, 0);
 	wmsum_init(&dnode_sums.dnode_move_rwlock, 0);
 	wmsum_init(&dnode_sums.dnode_move_active, 0);
 
 	dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
 	    KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
 	if (dnode_ksp != NULL) {
 		dnode_ksp->ks_data = &dnode_stats;
 		dnode_ksp->ks_update = dnode_kstats_update;
 		kstat_install(dnode_ksp);
 	}
 }
 
 void
 dnode_fini(void)
 {
 	if (dnode_ksp != NULL) {
 		kstat_delete(dnode_ksp);
 		dnode_ksp = NULL;
 	}
 
 	wmsum_fini(&dnode_sums.dnode_hold_dbuf_hold);
 	wmsum_fini(&dnode_sums.dnode_hold_dbuf_read);
 	wmsum_fini(&dnode_sums.dnode_hold_alloc_hits);
 	wmsum_fini(&dnode_sums.dnode_hold_alloc_misses);
 	wmsum_fini(&dnode_sums.dnode_hold_alloc_interior);
 	wmsum_fini(&dnode_sums.dnode_hold_alloc_lock_retry);
 	wmsum_fini(&dnode_sums.dnode_hold_alloc_lock_misses);
 	wmsum_fini(&dnode_sums.dnode_hold_alloc_type_none);
 	wmsum_fini(&dnode_sums.dnode_hold_free_hits);
 	wmsum_fini(&dnode_sums.dnode_hold_free_misses);
 	wmsum_fini(&dnode_sums.dnode_hold_free_lock_misses);
 	wmsum_fini(&dnode_sums.dnode_hold_free_lock_retry);
 	wmsum_fini(&dnode_sums.dnode_hold_free_refcount);
 	wmsum_fini(&dnode_sums.dnode_hold_free_overflow);
 	wmsum_fini(&dnode_sums.dnode_free_interior_lock_retry);
 	wmsum_fini(&dnode_sums.dnode_allocate);
 	wmsum_fini(&dnode_sums.dnode_reallocate);
 	wmsum_fini(&dnode_sums.dnode_buf_evict);
 	wmsum_fini(&dnode_sums.dnode_alloc_next_chunk);
 	wmsum_fini(&dnode_sums.dnode_alloc_race);
 	wmsum_fini(&dnode_sums.dnode_alloc_next_block);
 	wmsum_fini(&dnode_sums.dnode_move_invalid);
 	wmsum_fini(&dnode_sums.dnode_move_recheck1);
 	wmsum_fini(&dnode_sums.dnode_move_recheck2);
 	wmsum_fini(&dnode_sums.dnode_move_special);
 	wmsum_fini(&dnode_sums.dnode_move_handle);
 	wmsum_fini(&dnode_sums.dnode_move_rwlock);
 	wmsum_fini(&dnode_sums.dnode_move_active);
 
 	kmem_cache_destroy(dnode_cache);
 	dnode_cache = NULL;
 }
 
 
 #ifdef ZFS_DEBUG
 void
 dnode_verify(dnode_t *dn)
 {
 	int drop_struct_lock = FALSE;
 
 	ASSERT(dn->dn_phys);
 	ASSERT(dn->dn_objset);
 	ASSERT(dn->dn_handle->dnh_dnode == dn);
 
 	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 
 	if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
 		return;
 
 	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		drop_struct_lock = TRUE;
 	}
 	if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
 		int i;
 		int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
 		ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
 		if (dn->dn_datablkshift) {
 			ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
 			ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
 			ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
 		}
 		ASSERT3U(dn->dn_nlevels, <=, 30);
 		ASSERT(DMU_OT_IS_VALID(dn->dn_type));
 		ASSERT3U(dn->dn_nblkptr, >=, 1);
 		ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
 		ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
 		ASSERT3U(dn->dn_datablksz, ==,
 		    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 		ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
 		ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
 		    dn->dn_bonuslen, <=, max_bonuslen);
 		for (i = 0; i < TXG_SIZE; i++) {
 			ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
 		}
 	}
 	if (dn->dn_phys->dn_type != DMU_OT_NONE)
 		ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
 	ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
 	if (dn->dn_dbuf != NULL) {
 		ASSERT3P(dn->dn_phys, ==,
 		    (dnode_phys_t *)dn->dn_dbuf->db.db_data +
 		    (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
 	}
 	if (drop_struct_lock)
 		rw_exit(&dn->dn_struct_rwlock);
 }
 #endif
 
 void
 dnode_byteswap(dnode_phys_t *dnp)
 {
 	uint64_t *buf64 = (void*)&dnp->dn_blkptr;
 	int i;
 
 	if (dnp->dn_type == DMU_OT_NONE) {
 		memset(dnp, 0, sizeof (dnode_phys_t));
 		return;
 	}
 
 	dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
 	dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
 	dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
 	dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
 	dnp->dn_used = BSWAP_64(dnp->dn_used);
 
 	/*
 	 * dn_nblkptr is only one byte, so it's OK to read it in either
 	 * byte order.  We can't read dn_bouslen.
 	 */
 	ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
 	ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
 	for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
 		buf64[i] = BSWAP_64(buf64[i]);
 
 	/*
 	 * OK to check dn_bonuslen for zero, because it won't matter if
 	 * we have the wrong byte order.  This is necessary because the
 	 * dnode dnode is smaller than a regular dnode.
 	 */
 	if (dnp->dn_bonuslen != 0) {
 		dmu_object_byteswap_t byteswap;
 		ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
 		byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
 		dmu_ot_byteswap[byteswap].ob_func(DN_BONUS(dnp),
 		    DN_MAX_BONUS_LEN(dnp));
 	}
 
 	/* Swap SPILL block if we have one */
 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
 		byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
 }
 
 void
 dnode_buf_byteswap(void *vbuf, size_t size)
 {
 	int i = 0;
 
 	ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
 	ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
 
 	while (i < size) {
 		dnode_phys_t *dnp = (void *)(((char *)vbuf) + i);
 		dnode_byteswap(dnp);
 
 		i += DNODE_MIN_SIZE;
 		if (dnp->dn_type != DMU_OT_NONE)
 			i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
 	}
 }
 
 void
 dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
 {
 	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
 
 	dnode_setdirty(dn, tx);
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
 	    (dn->dn_nblkptr-1) * sizeof (blkptr_t));
 
 	if (newsize < dn->dn_bonuslen) {
 		/* clear any data after the end of the new size */
 		size_t diff = dn->dn_bonuslen - newsize;
 		char *data_end = ((char *)dn->dn_bonus->db.db_data) + newsize;
 		memset(data_end, 0, diff);
 	}
 
 	dn->dn_bonuslen = newsize;
 	if (newsize == 0)
 		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
 	else
 		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
 void
 dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
 {
 	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
 	dnode_setdirty(dn, tx);
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	dn->dn_bonustype = newtype;
 	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
 void
 dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
 {
 	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 	dnode_setdirty(dn, tx);
 	dn->dn_rm_spillblk[tx->tx_txg & TXG_MASK] = DN_KILL_SPILLBLK;
 	dn->dn_have_spill = B_FALSE;
 }
 
 static void
 dnode_setdblksz(dnode_t *dn, int size)
 {
 	ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
 	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 	ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
 	ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
 	    1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
 	dn->dn_datablksz = size;
 	dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
 	dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
 }
 
 static dnode_t *
 dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
     uint64_t object, dnode_handle_t *dnh)
 {
 	dnode_t *dn;
 
 	dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
 	dn->dn_moved = 0;
 
 	/*
 	 * Defer setting dn_objset until the dnode is ready to be a candidate
 	 * for the dnode_move() callback.
 	 */
 	dn->dn_object = object;
 	dn->dn_dbuf = db;
 	dn->dn_handle = dnh;
 	dn->dn_phys = dnp;
 
 	if (dnp->dn_datablkszsec) {
 		dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 	} else {
 		dn->dn_datablksz = 0;
 		dn->dn_datablkszsec = 0;
 		dn->dn_datablkshift = 0;
 	}
 	dn->dn_indblkshift = dnp->dn_indblkshift;
 	dn->dn_nlevels = dnp->dn_nlevels;
 	dn->dn_type = dnp->dn_type;
 	dn->dn_nblkptr = dnp->dn_nblkptr;
 	dn->dn_checksum = dnp->dn_checksum;
 	dn->dn_compress = dnp->dn_compress;
 	dn->dn_bonustype = dnp->dn_bonustype;
 	dn->dn_bonuslen = dnp->dn_bonuslen;
 	dn->dn_num_slots = dnp->dn_extra_slots + 1;
 	dn->dn_maxblkid = dnp->dn_maxblkid;
 	dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
 	dn->dn_id_flags = 0;
 
 	dmu_zfetch_init(&dn->dn_zfetch, dn);
 
 	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 	ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
 	ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode));
 
 	mutex_enter(&os->os_lock);
 
 	/*
 	 * Exclude special dnodes from os_dnodes so an empty os_dnodes
 	 * signifies that the special dnodes have no references from
 	 * their children (the entries in os_dnodes).  This allows
 	 * dnode_destroy() to easily determine if the last child has
 	 * been removed and then complete eviction of the objset.
 	 */
 	if (!DMU_OBJECT_IS_SPECIAL(object))
 		list_insert_head(&os->os_dnodes, dn);
 	membar_producer();
 
 	/*
 	 * Everything else must be valid before assigning dn_objset
 	 * makes the dnode eligible for dnode_move().
 	 */
 	dn->dn_objset = os;
 
 	dnh->dnh_dnode = dn;
 	mutex_exit(&os->os_lock);
 
 	arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE);
 
 	return (dn);
 }
 
 /*
  * Caller must be holding the dnode handle, which is released upon return.
  */
 static void
 dnode_destroy(dnode_t *dn)
 {
 	objset_t *os = dn->dn_objset;
 	boolean_t complete_os_eviction = B_FALSE;
 
 	ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
 
 	mutex_enter(&os->os_lock);
 	POINTER_INVALIDATE(&dn->dn_objset);
 	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
 		list_remove(&os->os_dnodes, dn);
 		complete_os_eviction =
 		    list_is_empty(&os->os_dnodes) &&
 		    list_link_active(&os->os_evicting_node);
 	}
 	mutex_exit(&os->os_lock);
 
 	/* the dnode can no longer move, so we can release the handle */
 	if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
 		zrl_remove(&dn->dn_handle->dnh_zrlock);
 
 	dn->dn_allocated_txg = 0;
 	dn->dn_free_txg = 0;
 	dn->dn_assigned_txg = 0;
 	dn->dn_dirty_txg = 0;
 
 	dn->dn_dirtyctx = 0;
 	dn->dn_dirtyctx_firstset = NULL;
 	if (dn->dn_bonus != NULL) {
 		mutex_enter(&dn->dn_bonus->db_mtx);
 		dbuf_destroy(dn->dn_bonus);
 		dn->dn_bonus = NULL;
 	}
 	dn->dn_zio = NULL;
 
 	dn->dn_have_spill = B_FALSE;
 	dn->dn_oldused = 0;
 	dn->dn_oldflags = 0;
 	dn->dn_olduid = 0;
 	dn->dn_oldgid = 0;
 	dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
 	dn->dn_newuid = 0;
 	dn->dn_newgid = 0;
 	dn->dn_newprojid = ZFS_DEFAULT_PROJID;
 	dn->dn_id_flags = 0;
 
 	dmu_zfetch_fini(&dn->dn_zfetch);
 	kmem_cache_free(dnode_cache, dn);
 	arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE);
 
 	if (complete_os_eviction)
 		dmu_objset_evict_done(os);
 }
 
 void
 dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
     dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
 {
 	int i;
 
 	ASSERT3U(dn_slots, >, 0);
 	ASSERT3U(dn_slots << DNODE_SHIFT, <=,
 	    spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
 	ASSERT3U(blocksize, <=,
 	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 	if (blocksize == 0)
 		blocksize = 1 << zfs_default_bs;
 	else
 		blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
 
 	if (ibs == 0)
 		ibs = zfs_default_ibs;
 
 	ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
 
 	dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n",
 	    dn->dn_objset, (u_longlong_t)dn->dn_object,
 	    (u_longlong_t)tx->tx_txg, blocksize, ibs, dn_slots);
 	DNODE_STAT_BUMP(dnode_allocate);
 
 	ASSERT(dn->dn_type == DMU_OT_NONE);
 	ASSERT0(memcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)));
 	ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
 	ASSERT(ot != DMU_OT_NONE);
 	ASSERT(DMU_OT_IS_VALID(ot));
 	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
 	    (bonustype == DMU_OT_SA && bonuslen == 0) ||
 	    (bonustype == DMU_OTN_UINT64_METADATA && bonuslen == 0) ||
 	    (bonustype != DMU_OT_NONE && bonuslen != 0));
 	ASSERT(DMU_OT_IS_VALID(bonustype));
 	ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
 	ASSERT(dn->dn_type == DMU_OT_NONE);
 	ASSERT0(dn->dn_maxblkid);
 	ASSERT0(dn->dn_allocated_txg);
 	ASSERT0(dn->dn_assigned_txg);
 	ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
 	ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1);
 	ASSERT(avl_is_empty(&dn->dn_dbufs));
 
 	for (i = 0; i < TXG_SIZE; i++) {
 		ASSERT0(dn->dn_next_nblkptr[i]);
 		ASSERT0(dn->dn_next_nlevels[i]);
 		ASSERT0(dn->dn_next_indblkshift[i]);
 		ASSERT0(dn->dn_next_bonuslen[i]);
 		ASSERT0(dn->dn_next_bonustype[i]);
 		ASSERT0(dn->dn_rm_spillblk[i]);
 		ASSERT0(dn->dn_next_blksz[i]);
 		ASSERT0(dn->dn_next_maxblkid[i]);
 		ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
 		ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
 		ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
 	}
 
 	dn->dn_type = ot;
 	dnode_setdblksz(dn, blocksize);
 	dn->dn_indblkshift = ibs;
 	dn->dn_nlevels = 1;
 	dn->dn_num_slots = dn_slots;
 	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
 		dn->dn_nblkptr = 1;
 	else {
 		dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
 		    1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
 		    SPA_BLKPTRSHIFT));
 	}
 
 	dn->dn_bonustype = bonustype;
 	dn->dn_bonuslen = bonuslen;
 	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
 	dn->dn_compress = ZIO_COMPRESS_INHERIT;
 	dn->dn_dirtyctx = 0;
 
 	dn->dn_free_txg = 0;
 	dn->dn_dirtyctx_firstset = NULL;
 	dn->dn_dirty_txg = 0;
 
 	dn->dn_allocated_txg = tx->tx_txg;
 	dn->dn_id_flags = 0;
 
 	dnode_setdirty(dn, tx);
 	dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
 	dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
 	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
 	dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
 }
 
 void
 dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
     dmu_object_type_t bonustype, int bonuslen, int dn_slots,
     boolean_t keep_spill, dmu_tx_t *tx)
 {
 	int nblkptr;
 
 	ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
 	ASSERT3U(blocksize, <=,
 	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 	ASSERT0(blocksize % SPA_MINBLOCKSIZE);
 	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
 	ASSERT(tx->tx_txg != 0);
 	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
 	    (bonustype != DMU_OT_NONE && bonuslen != 0) ||
 	    (bonustype == DMU_OT_SA && bonuslen == 0));
 	ASSERT(DMU_OT_IS_VALID(bonustype));
 	ASSERT3U(bonuslen, <=,
 	    DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
 	ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT));
 
 	dnode_free_interior_slots(dn);
 	DNODE_STAT_BUMP(dnode_reallocate);
 
 	/* clean up any unreferenced dbufs */
 	dnode_evict_dbufs(dn);
 
 	dn->dn_id_flags = 0;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	dnode_setdirty(dn, tx);
 	if (dn->dn_datablksz != blocksize) {
 		/* change blocksize */
 		ASSERT0(dn->dn_maxblkid);
 		ASSERT(BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
 		    dnode_block_freed(dn, 0));
 
 		dnode_setdblksz(dn, blocksize);
 		dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = blocksize;
 	}
 	if (dn->dn_bonuslen != bonuslen)
 		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = bonuslen;
 
 	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
 		nblkptr = 1;
 	else
 		nblkptr = MIN(DN_MAX_NBLKPTR,
 		    1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
 		    SPA_BLKPTRSHIFT));
 	if (dn->dn_bonustype != bonustype)
 		dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = bonustype;
 	if (dn->dn_nblkptr != nblkptr)
 		dn->dn_next_nblkptr[tx->tx_txg & TXG_MASK] = nblkptr;
 	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR && !keep_spill) {
 		dbuf_rm_spill(dn, tx);
 		dnode_rm_spill(dn, tx);
 	}
 
 	rw_exit(&dn->dn_struct_rwlock);
 
 	/* change type */
 	dn->dn_type = ot;
 
 	/* change bonus size and type */
 	mutex_enter(&dn->dn_mtx);
 	dn->dn_bonustype = bonustype;
 	dn->dn_bonuslen = bonuslen;
 	dn->dn_num_slots = dn_slots;
 	dn->dn_nblkptr = nblkptr;
 	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
 	dn->dn_compress = ZIO_COMPRESS_INHERIT;
 	ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
 
 	/* fix up the bonus db_size */
 	if (dn->dn_bonus) {
 		dn->dn_bonus->db.db_size =
 		    DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
 		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
 		ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
 	}
 
 	dn->dn_allocated_txg = tx->tx_txg;
 	mutex_exit(&dn->dn_mtx);
 }
 
 #ifdef	_KERNEL
 static void
 dnode_move_impl(dnode_t *odn, dnode_t *ndn)
 {
 	ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
 	ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
 	ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
 
 	/* Copy fields. */
 	ndn->dn_objset = odn->dn_objset;
 	ndn->dn_object = odn->dn_object;
 	ndn->dn_dbuf = odn->dn_dbuf;
 	ndn->dn_handle = odn->dn_handle;
 	ndn->dn_phys = odn->dn_phys;
 	ndn->dn_type = odn->dn_type;
 	ndn->dn_bonuslen = odn->dn_bonuslen;
 	ndn->dn_bonustype = odn->dn_bonustype;
 	ndn->dn_nblkptr = odn->dn_nblkptr;
 	ndn->dn_checksum = odn->dn_checksum;
 	ndn->dn_compress = odn->dn_compress;
 	ndn->dn_nlevels = odn->dn_nlevels;
 	ndn->dn_indblkshift = odn->dn_indblkshift;
 	ndn->dn_datablkshift = odn->dn_datablkshift;
 	ndn->dn_datablkszsec = odn->dn_datablkszsec;
 	ndn->dn_datablksz = odn->dn_datablksz;
 	ndn->dn_maxblkid = odn->dn_maxblkid;
 	ndn->dn_num_slots = odn->dn_num_slots;
 	memcpy(ndn->dn_next_type, odn->dn_next_type,
 	    sizeof (odn->dn_next_type));
 	memcpy(ndn->dn_next_nblkptr, odn->dn_next_nblkptr,
 	    sizeof (odn->dn_next_nblkptr));
 	memcpy(ndn->dn_next_nlevels, odn->dn_next_nlevels,
 	    sizeof (odn->dn_next_nlevels));
 	memcpy(ndn->dn_next_indblkshift, odn->dn_next_indblkshift,
 	    sizeof (odn->dn_next_indblkshift));
 	memcpy(ndn->dn_next_bonustype, odn->dn_next_bonustype,
 	    sizeof (odn->dn_next_bonustype));
 	memcpy(ndn->dn_rm_spillblk, odn->dn_rm_spillblk,
 	    sizeof (odn->dn_rm_spillblk));
 	memcpy(ndn->dn_next_bonuslen, odn->dn_next_bonuslen,
 	    sizeof (odn->dn_next_bonuslen));
 	memcpy(ndn->dn_next_blksz, odn->dn_next_blksz,
 	    sizeof (odn->dn_next_blksz));
 	memcpy(ndn->dn_next_maxblkid, odn->dn_next_maxblkid,
 	    sizeof (odn->dn_next_maxblkid));
 	for (int i = 0; i < TXG_SIZE; i++) {
 		list_move_tail(&ndn->dn_dirty_records[i],
 		    &odn->dn_dirty_records[i]);
 	}
 	memcpy(ndn->dn_free_ranges, odn->dn_free_ranges,
 	    sizeof (odn->dn_free_ranges));
 	ndn->dn_allocated_txg = odn->dn_allocated_txg;
 	ndn->dn_free_txg = odn->dn_free_txg;
 	ndn->dn_assigned_txg = odn->dn_assigned_txg;
 	ndn->dn_dirty_txg = odn->dn_dirty_txg;
 	ndn->dn_dirtyctx = odn->dn_dirtyctx;
 	ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
 	ASSERT(zfs_refcount_count(&odn->dn_tx_holds) == 0);
 	zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
 	ASSERT(avl_is_empty(&ndn->dn_dbufs));
 	avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
 	ndn->dn_dbufs_count = odn->dn_dbufs_count;
 	ndn->dn_bonus = odn->dn_bonus;
 	ndn->dn_have_spill = odn->dn_have_spill;
 	ndn->dn_zio = odn->dn_zio;
 	ndn->dn_oldused = odn->dn_oldused;
 	ndn->dn_oldflags = odn->dn_oldflags;
 	ndn->dn_olduid = odn->dn_olduid;
 	ndn->dn_oldgid = odn->dn_oldgid;
 	ndn->dn_oldprojid = odn->dn_oldprojid;
 	ndn->dn_newuid = odn->dn_newuid;
 	ndn->dn_newgid = odn->dn_newgid;
 	ndn->dn_newprojid = odn->dn_newprojid;
 	ndn->dn_id_flags = odn->dn_id_flags;
 	dmu_zfetch_init(&ndn->dn_zfetch, ndn);
 
 	/*
 	 * Update back pointers. Updating the handle fixes the back pointer of
 	 * every descendant dbuf as well as the bonus dbuf.
 	 */
 	ASSERT(ndn->dn_handle->dnh_dnode == odn);
 	ndn->dn_handle->dnh_dnode = ndn;
 
 	/*
 	 * Invalidate the original dnode by clearing all of its back pointers.
 	 */
 	odn->dn_dbuf = NULL;
 	odn->dn_handle = NULL;
 	avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
 	    offsetof(dmu_buf_impl_t, db_link));
 	odn->dn_dbufs_count = 0;
 	odn->dn_bonus = NULL;
 	dmu_zfetch_fini(&odn->dn_zfetch);
 
 	/*
 	 * Set the low bit of the objset pointer to ensure that dnode_move()
 	 * recognizes the dnode as invalid in any subsequent callback.
 	 */
 	POINTER_INVALIDATE(&odn->dn_objset);
 
 	/*
 	 * Satisfy the destructor.
 	 */
 	for (int i = 0; i < TXG_SIZE; i++) {
 		list_create(&odn->dn_dirty_records[i],
 		    sizeof (dbuf_dirty_record_t),
 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
 		odn->dn_free_ranges[i] = NULL;
 		odn->dn_next_nlevels[i] = 0;
 		odn->dn_next_indblkshift[i] = 0;
 		odn->dn_next_bonustype[i] = 0;
 		odn->dn_rm_spillblk[i] = 0;
 		odn->dn_next_bonuslen[i] = 0;
 		odn->dn_next_blksz[i] = 0;
 	}
 	odn->dn_allocated_txg = 0;
 	odn->dn_free_txg = 0;
 	odn->dn_assigned_txg = 0;
 	odn->dn_dirty_txg = 0;
 	odn->dn_dirtyctx = 0;
 	odn->dn_dirtyctx_firstset = NULL;
 	odn->dn_have_spill = B_FALSE;
 	odn->dn_zio = NULL;
 	odn->dn_oldused = 0;
 	odn->dn_oldflags = 0;
 	odn->dn_olduid = 0;
 	odn->dn_oldgid = 0;
 	odn->dn_oldprojid = ZFS_DEFAULT_PROJID;
 	odn->dn_newuid = 0;
 	odn->dn_newgid = 0;
 	odn->dn_newprojid = ZFS_DEFAULT_PROJID;
 	odn->dn_id_flags = 0;
 
 	/*
 	 * Mark the dnode.
 	 */
 	ndn->dn_moved = 1;
 	odn->dn_moved = (uint8_t)-1;
 }
 
 static kmem_cbrc_t
 dnode_move(void *buf, void *newbuf, size_t size, void *arg)
 {
 	dnode_t *odn = buf, *ndn = newbuf;
 	objset_t *os;
 	int64_t refcount;
 	uint32_t dbufs;
 
 	/*
 	 * The dnode is on the objset's list of known dnodes if the objset
 	 * pointer is valid. We set the low bit of the objset pointer when
 	 * freeing the dnode to invalidate it, and the memory patterns written
 	 * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
 	 * A newly created dnode sets the objset pointer last of all to indicate
 	 * that the dnode is known and in a valid state to be moved by this
 	 * function.
 	 */
 	os = odn->dn_objset;
 	if (!POINTER_IS_VALID(os)) {
 		DNODE_STAT_BUMP(dnode_move_invalid);
 		return (KMEM_CBRC_DONT_KNOW);
 	}
 
 	/*
 	 * Ensure that the objset does not go away during the move.
 	 */
 	rw_enter(&os_lock, RW_WRITER);
 	if (os != odn->dn_objset) {
 		rw_exit(&os_lock);
 		DNODE_STAT_BUMP(dnode_move_recheck1);
 		return (KMEM_CBRC_DONT_KNOW);
 	}
 
 	/*
 	 * If the dnode is still valid, then so is the objset. We know that no
 	 * valid objset can be freed while we hold os_lock, so we can safely
 	 * ensure that the objset remains in use.
 	 */
 	mutex_enter(&os->os_lock);
 
 	/*
 	 * Recheck the objset pointer in case the dnode was removed just before
 	 * acquiring the lock.
 	 */
 	if (os != odn->dn_objset) {
 		mutex_exit(&os->os_lock);
 		rw_exit(&os_lock);
 		DNODE_STAT_BUMP(dnode_move_recheck2);
 		return (KMEM_CBRC_DONT_KNOW);
 	}
 
 	/*
 	 * At this point we know that as long as we hold os->os_lock, the dnode
 	 * cannot be freed and fields within the dnode can be safely accessed.
 	 * The objset listing this dnode cannot go away as long as this dnode is
 	 * on its list.
 	 */
 	rw_exit(&os_lock);
 	if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
 		mutex_exit(&os->os_lock);
 		DNODE_STAT_BUMP(dnode_move_special);
 		return (KMEM_CBRC_NO);
 	}
 	ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
 
 	/*
 	 * Lock the dnode handle to prevent the dnode from obtaining any new
 	 * holds. This also prevents the descendant dbufs and the bonus dbuf
 	 * from accessing the dnode, so that we can discount their holds. The
 	 * handle is safe to access because we know that while the dnode cannot
 	 * go away, neither can its handle. Once we hold dnh_zrlock, we can
 	 * safely move any dnode referenced only by dbufs.
 	 */
 	if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
 		mutex_exit(&os->os_lock);
 		DNODE_STAT_BUMP(dnode_move_handle);
 		return (KMEM_CBRC_LATER);
 	}
 
 	/*
 	 * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
 	 * We need to guarantee that there is a hold for every dbuf in order to
 	 * determine whether the dnode is actively referenced. Falsely matching
 	 * a dbuf to an active hold would lead to an unsafe move. It's possible
 	 * that a thread already having an active dnode hold is about to add a
 	 * dbuf, and we can't compare hold and dbuf counts while the add is in
 	 * progress.
 	 */
 	if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
 		zrl_exit(&odn->dn_handle->dnh_zrlock);
 		mutex_exit(&os->os_lock);
 		DNODE_STAT_BUMP(dnode_move_rwlock);
 		return (KMEM_CBRC_LATER);
 	}
 
 	/*
 	 * A dbuf may be removed (evicted) without an active dnode hold. In that
 	 * case, the dbuf count is decremented under the handle lock before the
 	 * dbuf's hold is released. This order ensures that if we count the hold
 	 * after the dbuf is removed but before its hold is released, we will
 	 * treat the unmatched hold as active and exit safely. If we count the
 	 * hold before the dbuf is removed, the hold is discounted, and the
 	 * removal is blocked until the move completes.
 	 */
 	refcount = zfs_refcount_count(&odn->dn_holds);
 	ASSERT(refcount >= 0);
 	dbufs = DN_DBUFS_COUNT(odn);
 
 	/* We can't have more dbufs than dnode holds. */
 	ASSERT3U(dbufs, <=, refcount);
 	DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
 	    uint32_t, dbufs);
 
 	if (refcount > dbufs) {
 		rw_exit(&odn->dn_struct_rwlock);
 		zrl_exit(&odn->dn_handle->dnh_zrlock);
 		mutex_exit(&os->os_lock);
 		DNODE_STAT_BUMP(dnode_move_active);
 		return (KMEM_CBRC_LATER);
 	}
 
 	rw_exit(&odn->dn_struct_rwlock);
 
 	/*
 	 * At this point we know that anyone with a hold on the dnode is not
 	 * actively referencing it. The dnode is known and in a valid state to
 	 * move. We're holding the locks needed to execute the critical section.
 	 */
 	dnode_move_impl(odn, ndn);
 
 	list_link_replace(&odn->dn_link, &ndn->dn_link);
 	/* If the dnode was safe to move, the refcount cannot have changed. */
 	ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds));
 	ASSERT(dbufs == DN_DBUFS_COUNT(ndn));
 	zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
 	mutex_exit(&os->os_lock);
 
 	return (KMEM_CBRC_YES);
 }
 #endif	/* _KERNEL */
 
 static void
 dnode_slots_hold(dnode_children_t *children, int idx, int slots)
 {
 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
 
 	for (int i = idx; i < idx + slots; i++) {
 		dnode_handle_t *dnh = &children->dnc_children[i];
 		zrl_add(&dnh->dnh_zrlock);
 	}
 }
 
 static void
 dnode_slots_rele(dnode_children_t *children, int idx, int slots)
 {
 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
 
 	for (int i = idx; i < idx + slots; i++) {
 		dnode_handle_t *dnh = &children->dnc_children[i];
 
 		if (zrl_is_locked(&dnh->dnh_zrlock))
 			zrl_exit(&dnh->dnh_zrlock);
 		else
 			zrl_remove(&dnh->dnh_zrlock);
 	}
 }
 
 static int
 dnode_slots_tryenter(dnode_children_t *children, int idx, int slots)
 {
 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
 
 	for (int i = idx; i < idx + slots; i++) {
 		dnode_handle_t *dnh = &children->dnc_children[i];
 
 		if (!zrl_tryenter(&dnh->dnh_zrlock)) {
 			for (int j = idx; j < i; j++) {
 				dnh = &children->dnc_children[j];
 				zrl_exit(&dnh->dnh_zrlock);
 			}
 
 			return (0);
 		}
 	}
 
 	return (1);
 }
 
 static void
 dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
 {
 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
 
 	for (int i = idx; i < idx + slots; i++) {
 		dnode_handle_t *dnh = &children->dnc_children[i];
 		dnh->dnh_dnode = ptr;
 	}
 }
 
 static boolean_t
 dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
 {
 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
 
 	/*
 	 * If all dnode slots are either already free or
 	 * evictable return B_TRUE.
 	 */
 	for (int i = idx; i < idx + slots; i++) {
 		dnode_handle_t *dnh = &children->dnc_children[i];
 		dnode_t *dn = dnh->dnh_dnode;
 
 		if (dn == DN_SLOT_FREE) {
 			continue;
 		} else if (DN_SLOT_IS_PTR(dn)) {
 			mutex_enter(&dn->dn_mtx);
 			boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
 			    zfs_refcount_is_zero(&dn->dn_holds) &&
 			    !DNODE_IS_DIRTY(dn));
 			mutex_exit(&dn->dn_mtx);
 
 			if (!can_free)
 				return (B_FALSE);
 			else
 				continue;
 		} else {
 			return (B_FALSE);
 		}
 	}
 
 	return (B_TRUE);
 }
 
 static uint_t
 dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
 {
 	uint_t reclaimed = 0;
 
 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
 
 	for (int i = idx; i < idx + slots; i++) {
 		dnode_handle_t *dnh = &children->dnc_children[i];
 
 		ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
 
 		if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
 			ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
 			dnode_destroy(dnh->dnh_dnode);
 			dnh->dnh_dnode = DN_SLOT_FREE;
 			reclaimed++;
 		}
 	}
 
 	return (reclaimed);
 }
 
 void
 dnode_free_interior_slots(dnode_t *dn)
 {
 	dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
 	int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
 	int idx = (dn->dn_object & (epb - 1)) + 1;
 	int slots = dn->dn_num_slots - 1;
 
 	if (slots == 0)
 		return;
 
 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
 
 	while (!dnode_slots_tryenter(children, idx, slots)) {
 		DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
 		kpreempt(KPREEMPT_SYNC);
 	}
 
 	dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
 	dnode_slots_rele(children, idx, slots);
 }
 
 void
 dnode_special_close(dnode_handle_t *dnh)
 {
 	dnode_t *dn = dnh->dnh_dnode;
 
 	/*
 	 * Ensure dnode_rele_and_unlock() has released dn_mtx, after final
 	 * zfs_refcount_remove()
 	 */
 	mutex_enter(&dn->dn_mtx);
 	if (zfs_refcount_count(&dn->dn_holds) > 0)
 		cv_wait(&dn->dn_nodnholds, &dn->dn_mtx);
 	mutex_exit(&dn->dn_mtx);
 	ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 0);
 
 	ASSERT(dn->dn_dbuf == NULL ||
 	    dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
 	zrl_add(&dnh->dnh_zrlock);
 	dnode_destroy(dn); /* implicit zrl_remove() */
 	zrl_destroy(&dnh->dnh_zrlock);
 	dnh->dnh_dnode = NULL;
 }
 
 void
 dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
     dnode_handle_t *dnh)
 {
 	dnode_t *dn;
 
 	zrl_init(&dnh->dnh_zrlock);
 	VERIFY3U(1, ==, zrl_tryenter(&dnh->dnh_zrlock));
 
 	dn = dnode_create(os, dnp, NULL, object, dnh);
 	DNODE_VERIFY(dn);
 
 	zrl_exit(&dnh->dnh_zrlock);
 }
 
 static void
 dnode_buf_evict_async(void *dbu)
 {
 	dnode_children_t *dnc = dbu;
 
 	DNODE_STAT_BUMP(dnode_buf_evict);
 
 	for (int i = 0; i < dnc->dnc_count; i++) {
 		dnode_handle_t *dnh = &dnc->dnc_children[i];
 		dnode_t *dn;
 
 		/*
 		 * The dnode handle lock guards against the dnode moving to
 		 * another valid address, so there is no need here to guard
 		 * against changes to or from NULL.
 		 */
 		if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
 			zrl_destroy(&dnh->dnh_zrlock);
 			dnh->dnh_dnode = DN_SLOT_UNINIT;
 			continue;
 		}
 
 		zrl_add(&dnh->dnh_zrlock);
 		dn = dnh->dnh_dnode;
 		/*
 		 * If there are holds on this dnode, then there should
 		 * be holds on the dnode's containing dbuf as well; thus
 		 * it wouldn't be eligible for eviction and this function
 		 * would not have been called.
 		 */
 		ASSERT(zfs_refcount_is_zero(&dn->dn_holds));
 		ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
 
 		dnode_destroy(dn); /* implicit zrl_remove() for first slot */
 		zrl_destroy(&dnh->dnh_zrlock);
 		dnh->dnh_dnode = DN_SLOT_UNINIT;
 	}
 	kmem_free(dnc, sizeof (dnode_children_t) +
 	    dnc->dnc_count * sizeof (dnode_handle_t));
 }
 
 /*
  * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used
  * to ensure the hole at the specified object offset is large enough to
  * hold the dnode being created. The slots parameter is also used to ensure
  * a dnode does not span multiple dnode blocks. In both of these cases, if
  * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases
  * are only possible when using DNODE_MUST_BE_FREE.
  *
  * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
  * dnode_hold_impl() will check if the requested dnode is already consumed
  * as an extra dnode slot by an large dnode, in which case it returns
  * ENOENT.
  *
  * If the DNODE_DRY_RUN flag is set, we don't actually hold the dnode, just
  * return whether the hold would succeed or not. tag and dnp should set to
  * NULL in this case.
  *
  * errors:
  * EINVAL - Invalid object number or flags.
  * ENOSPC - Hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
  * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
  *        - Refers to a freeing dnode (DNODE_MUST_BE_FREE)
  *        - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
  * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
  *        - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED)
  * EIO    - I/O error when reading the meta dnode dbuf.
  *
  * succeeds even for free dnodes.
  */
 int
 dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
     const void *tag, dnode_t **dnp)
 {
 	int epb, idx, err;
 	int drop_struct_lock = FALSE;
 	int type;
 	uint64_t blk;
 	dnode_t *mdn, *dn;
 	dmu_buf_impl_t *db;
 	dnode_children_t *dnc;
 	dnode_phys_t *dn_block;
 	dnode_handle_t *dnh;
 
 	ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
 	ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
 	IMPLY(flag & DNODE_DRY_RUN, (tag == NULL) && (dnp == NULL));
 
 	/*
 	 * If you are holding the spa config lock as writer, you shouldn't
 	 * be asking the DMU to do *anything* unless it's the root pool
 	 * which may require us to read from the root filesystem while
 	 * holding some (not all) of the locks as writer.
 	 */
 	ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
 	    (spa_is_root(os->os_spa) &&
 	    spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
 
 	ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE));
 
 	if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT ||
 	    object == DMU_PROJECTUSED_OBJECT) {
 		if (object == DMU_USERUSED_OBJECT)
 			dn = DMU_USERUSED_DNODE(os);
 		else if (object == DMU_GROUPUSED_OBJECT)
 			dn = DMU_GROUPUSED_DNODE(os);
 		else
 			dn = DMU_PROJECTUSED_DNODE(os);
 		if (dn == NULL)
 			return (SET_ERROR(ENOENT));
 		type = dn->dn_type;
 		if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
 			return (SET_ERROR(ENOENT));
 		if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
 			return (SET_ERROR(EEXIST));
 		DNODE_VERIFY(dn);
 		/* Don't actually hold if dry run, just return 0 */
 		if (!(flag & DNODE_DRY_RUN)) {
 			(void) zfs_refcount_add(&dn->dn_holds, tag);
 			*dnp = dn;
 		}
 		return (0);
 	}
 
 	if (object == 0 || object >= DN_MAX_OBJECT)
 		return (SET_ERROR(EINVAL));
 
 	mdn = DMU_META_DNODE(os);
 	ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
 
 	DNODE_VERIFY(mdn);
 
 	if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
 		rw_enter(&mdn->dn_struct_rwlock, RW_READER);
 		drop_struct_lock = TRUE;
 	}
 
 	blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
 	db = dbuf_hold(mdn, blk, FTAG);
 	if (drop_struct_lock)
 		rw_exit(&mdn->dn_struct_rwlock);
 	if (db == NULL) {
 		DNODE_STAT_BUMP(dnode_hold_dbuf_hold);
 		return (SET_ERROR(EIO));
 	}
 
 	/*
 	 * We do not need to decrypt to read the dnode so it doesn't matter
 	 * if we get the encrypted or decrypted version.
 	 */
 	err = dbuf_read(db, NULL, DB_RF_CANFAIL |
 	    DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH);
 	if (err) {
 		DNODE_STAT_BUMP(dnode_hold_dbuf_read);
 		dbuf_rele(db, FTAG);
 		return (err);
 	}
 
 	ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
 	epb = db->db.db_size >> DNODE_SHIFT;
 
 	idx = object & (epb - 1);
 	dn_block = (dnode_phys_t *)db->db.db_data;
 
 	ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
 	dnc = dmu_buf_get_user(&db->db);
 	dnh = NULL;
 	if (dnc == NULL) {
 		dnode_children_t *winner;
 		int skip = 0;
 
 		dnc = kmem_zalloc(sizeof (dnode_children_t) +
 		    epb * sizeof (dnode_handle_t), KM_SLEEP);
 		dnc->dnc_count = epb;
 		dnh = &dnc->dnc_children[0];
 
 		/* Initialize dnode slot status from dnode_phys_t */
 		for (int i = 0; i < epb; i++) {
 			zrl_init(&dnh[i].dnh_zrlock);
 
 			if (skip) {
 				skip--;
 				continue;
 			}
 
 			if (dn_block[i].dn_type != DMU_OT_NONE) {
 				int interior = dn_block[i].dn_extra_slots;
 
 				dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED);
 				dnode_set_slots(dnc, i + 1, interior,
 				    DN_SLOT_INTERIOR);
 				skip = interior;
 			} else {
 				dnh[i].dnh_dnode = DN_SLOT_FREE;
 				skip = 0;
 			}
 		}
 
 		dmu_buf_init_user(&dnc->dnc_dbu, NULL,
 		    dnode_buf_evict_async, NULL);
 		winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu);
 		if (winner != NULL) {
 
 			for (int i = 0; i < epb; i++)
 				zrl_destroy(&dnh[i].dnh_zrlock);
 
 			kmem_free(dnc, sizeof (dnode_children_t) +
 			    epb * sizeof (dnode_handle_t));
 			dnc = winner;
 		}
 	}
 
 	ASSERT(dnc->dnc_count == epb);
 
 	if (flag & DNODE_MUST_BE_ALLOCATED) {
 		slots = 1;
 
 		dnode_slots_hold(dnc, idx, slots);
 		dnh = &dnc->dnc_children[idx];
 
 		if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
 			dn = dnh->dnh_dnode;
 		} else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
 			DNODE_STAT_BUMP(dnode_hold_alloc_interior);
 			dnode_slots_rele(dnc, idx, slots);
 			dbuf_rele(db, FTAG);
 			return (SET_ERROR(EEXIST));
 		} else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
 			DNODE_STAT_BUMP(dnode_hold_alloc_misses);
 			dnode_slots_rele(dnc, idx, slots);
 			dbuf_rele(db, FTAG);
 			return (SET_ERROR(ENOENT));
 		} else {
 			dnode_slots_rele(dnc, idx, slots);
 			while (!dnode_slots_tryenter(dnc, idx, slots)) {
 				DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
 				kpreempt(KPREEMPT_SYNC);
 			}
 
 			/*
 			 * Someone else won the race and called dnode_create()
 			 * after we checked DN_SLOT_IS_PTR() above but before
 			 * we acquired the lock.
 			 */
 			if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
 				DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
 				dn = dnh->dnh_dnode;
 			} else {
 				dn = dnode_create(os, dn_block + idx, db,
 				    object, dnh);
 				dmu_buf_add_user_size(&db->db,
 				    sizeof (dnode_t));
 			}
 		}
 
 		mutex_enter(&dn->dn_mtx);
 		if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) {
 			DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
 			mutex_exit(&dn->dn_mtx);
 			dnode_slots_rele(dnc, idx, slots);
 			dbuf_rele(db, FTAG);
 			return (SET_ERROR(ENOENT));
 		}
 
 		/* Don't actually hold if dry run, just return 0 */
 		if (flag & DNODE_DRY_RUN) {
 			mutex_exit(&dn->dn_mtx);
 			dnode_slots_rele(dnc, idx, slots);
 			dbuf_rele(db, FTAG);
 			return (0);
 		}
 
 		DNODE_STAT_BUMP(dnode_hold_alloc_hits);
 	} else if (flag & DNODE_MUST_BE_FREE) {
 
 		if (idx + slots - 1 >= DNODES_PER_BLOCK) {
 			DNODE_STAT_BUMP(dnode_hold_free_overflow);
 			dbuf_rele(db, FTAG);
 			return (SET_ERROR(ENOSPC));
 		}
 
 		dnode_slots_hold(dnc, idx, slots);
 
 		if (!dnode_check_slots_free(dnc, idx, slots)) {
 			DNODE_STAT_BUMP(dnode_hold_free_misses);
 			dnode_slots_rele(dnc, idx, slots);
 			dbuf_rele(db, FTAG);
 			return (SET_ERROR(ENOSPC));
 		}
 
 		dnode_slots_rele(dnc, idx, slots);
 		while (!dnode_slots_tryenter(dnc, idx, slots)) {
 			DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
 			kpreempt(KPREEMPT_SYNC);
 		}
 
 		if (!dnode_check_slots_free(dnc, idx, slots)) {
 			DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
 			dnode_slots_rele(dnc, idx, slots);
 			dbuf_rele(db, FTAG);
 			return (SET_ERROR(ENOSPC));
 		}
 
 		/*
 		 * Allocated but otherwise free dnodes which would
 		 * be in the interior of a multi-slot dnodes need
 		 * to be freed.  Single slot dnodes can be safely
 		 * re-purposed as a performance optimization.
 		 */
 		if (slots > 1) {
 			uint_t reclaimed =
 			    dnode_reclaim_slots(dnc, idx + 1, slots - 1);
 			if (reclaimed > 0)
 				dmu_buf_sub_user_size(&db->db,
 				    reclaimed * sizeof (dnode_t));
 		}
 
 		dnh = &dnc->dnc_children[idx];
 		if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
 			dn = dnh->dnh_dnode;
 		} else {
 			dn = dnode_create(os, dn_block + idx, db,
 			    object, dnh);
 			dmu_buf_add_user_size(&db->db, sizeof (dnode_t));
 		}
 
 		mutex_enter(&dn->dn_mtx);
 		if (!zfs_refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) {
 			DNODE_STAT_BUMP(dnode_hold_free_refcount);
 			mutex_exit(&dn->dn_mtx);
 			dnode_slots_rele(dnc, idx, slots);
 			dbuf_rele(db, FTAG);
 			return (SET_ERROR(EEXIST));
 		}
 
 		/* Don't actually hold if dry run, just return 0 */
 		if (flag & DNODE_DRY_RUN) {
 			mutex_exit(&dn->dn_mtx);
 			dnode_slots_rele(dnc, idx, slots);
 			dbuf_rele(db, FTAG);
 			return (0);
 		}
 
 		dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
 		DNODE_STAT_BUMP(dnode_hold_free_hits);
 	} else {
 		dbuf_rele(db, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	ASSERT0(dn->dn_free_txg);
 
 	if (zfs_refcount_add(&dn->dn_holds, tag) == 1)
 		dbuf_add_ref(db, dnh);
 
 	mutex_exit(&dn->dn_mtx);
 
 	/* Now we can rely on the hold to prevent the dnode from moving. */
 	dnode_slots_rele(dnc, idx, slots);
 
 	DNODE_VERIFY(dn);
 	ASSERT3P(dnp, !=, NULL);
 	ASSERT3P(dn->dn_dbuf, ==, db);
 	ASSERT3U(dn->dn_object, ==, object);
 	dbuf_rele(db, FTAG);
 
 	*dnp = dn;
 	return (0);
 }
 
 /*
  * Return held dnode if the object is allocated, NULL if not.
  */
 int
 dnode_hold(objset_t *os, uint64_t object, const void *tag, dnode_t **dnp)
 {
 	return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
 	    dnp));
 }
 
 /*
  * Can only add a reference if there is already at least one
  * reference on the dnode.  Returns FALSE if unable to add a
  * new reference.
  */
 boolean_t
 dnode_add_ref(dnode_t *dn, const void *tag)
 {
 	mutex_enter(&dn->dn_mtx);
 	if (zfs_refcount_is_zero(&dn->dn_holds)) {
 		mutex_exit(&dn->dn_mtx);
 		return (FALSE);
 	}
 	VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag));
 	mutex_exit(&dn->dn_mtx);
 	return (TRUE);
 }
 
 void
 dnode_rele(dnode_t *dn, const void *tag)
 {
 	mutex_enter(&dn->dn_mtx);
 	dnode_rele_and_unlock(dn, tag, B_FALSE);
 }
 
 void
 dnode_rele_and_unlock(dnode_t *dn, const void *tag, boolean_t evicting)
 {
 	uint64_t refs;
 	/* Get while the hold prevents the dnode from moving. */
 	dmu_buf_impl_t *db = dn->dn_dbuf;
 	dnode_handle_t *dnh = dn->dn_handle;
 
 	refs = zfs_refcount_remove(&dn->dn_holds, tag);
 	if (refs == 0)
 		cv_broadcast(&dn->dn_nodnholds);
 	mutex_exit(&dn->dn_mtx);
 	/* dnode could get destroyed at this point, so don't use it anymore */
 
 	/*
 	 * It's unsafe to release the last hold on a dnode by dnode_rele() or
 	 * indirectly by dbuf_rele() while relying on the dnode handle to
 	 * prevent the dnode from moving, since releasing the last hold could
 	 * result in the dnode's parent dbuf evicting its dnode handles. For
 	 * that reason anyone calling dnode_rele() or dbuf_rele() without some
 	 * other direct or indirect hold on the dnode must first drop the dnode
 	 * handle.
 	 */
 #ifdef ZFS_DEBUG
 	ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
 #endif
 
 	/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
 	if (refs == 0 && db != NULL) {
 		/*
 		 * Another thread could add a hold to the dnode handle in
 		 * dnode_hold_impl() while holding the parent dbuf. Since the
 		 * hold on the parent dbuf prevents the handle from being
 		 * destroyed, the hold on the handle is OK. We can't yet assert
 		 * that the handle has zero references, but that will be
 		 * asserted anyway when the handle gets destroyed.
 		 */
 		mutex_enter(&db->db_mtx);
 		dbuf_rele_and_unlock(db, dnh, evicting);
 	}
 }
 
 /*
  * Test whether we can create a dnode at the specified location.
  */
 int
 dnode_try_claim(objset_t *os, uint64_t object, int slots)
 {
 	return (dnode_hold_impl(os, object, DNODE_MUST_BE_FREE | DNODE_DRY_RUN,
 	    slots, NULL, NULL));
 }
 
 /*
  * Checks if the dnode itself is dirty, or is carrying any uncommitted records.
  * It is important to check both conditions, as some operations (eg appending
  * to a file) can dirty both as a single logical unit, but they are not synced
  * out atomically, so checking one and not the other can result in an object
  * appearing to be clean mid-way through a commit.
  *
  * Do not change this lightly! If you get it wrong, dmu_offset_next() can
  * detect a hole where there is really data, leading to silent corruption.
  */
 boolean_t
 dnode_is_dirty(dnode_t *dn)
 {
 	mutex_enter(&dn->dn_mtx);
 
 	for (int i = 0; i < TXG_SIZE; i++) {
 		if (multilist_link_active(&dn->dn_dirty_link[i]) ||
 		    !list_is_empty(&dn->dn_dirty_records[i])) {
 			mutex_exit(&dn->dn_mtx);
 			return (B_TRUE);
 		}
 	}
 
 	mutex_exit(&dn->dn_mtx);
 
 	return (B_FALSE);
 }
 
 void
 dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
 {
 	objset_t *os = dn->dn_objset;
 	uint64_t txg = tx->tx_txg;
 
 	if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
 		dsl_dataset_dirty(os->os_dsl_dataset, tx);
 		return;
 	}
 
 	DNODE_VERIFY(dn);
 
 #ifdef ZFS_DEBUG
 	mutex_enter(&dn->dn_mtx);
 	ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
 	ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
 	mutex_exit(&dn->dn_mtx);
 #endif
 
 	/*
 	 * Determine old uid/gid when necessary
 	 */
 	dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
 
 	multilist_t *dirtylist = &os->os_dirty_dnodes[txg & TXG_MASK];
 	multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn);
 
 	/*
 	 * If we are already marked dirty, we're done.
 	 */
 	if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
 		multilist_sublist_unlock(mls);
 		return;
 	}
 
 	ASSERT(!zfs_refcount_is_zero(&dn->dn_holds) ||
 	    !avl_is_empty(&dn->dn_dbufs));
 	ASSERT(dn->dn_datablksz != 0);
 	ASSERT0(dn->dn_next_bonuslen[txg & TXG_MASK]);
 	ASSERT0(dn->dn_next_blksz[txg & TXG_MASK]);
 	ASSERT0(dn->dn_next_bonustype[txg & TXG_MASK]);
 
 	dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
 	    (u_longlong_t)dn->dn_object, (u_longlong_t)txg);
 
 	multilist_sublist_insert_head(mls, dn);
 
 	multilist_sublist_unlock(mls);
 
 	/*
 	 * The dnode maintains a hold on its containing dbuf as
 	 * long as there are holds on it.  Each instantiated child
 	 * dbuf maintains a hold on the dnode.  When the last child
 	 * drops its hold, the dnode will drop its hold on the
 	 * containing dbuf. We add a "dirty hold" here so that the
 	 * dnode will hang around after we finish processing its
 	 * children.
 	 */
 	VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
 
 	(void) dbuf_dirty(dn->dn_dbuf, tx);
 
 	dsl_dataset_dirty(os->os_dsl_dataset, tx);
 }
 
 void
 dnode_free(dnode_t *dn, dmu_tx_t *tx)
 {
 	mutex_enter(&dn->dn_mtx);
 	if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
 		mutex_exit(&dn->dn_mtx);
 		return;
 	}
 	dn->dn_free_txg = tx->tx_txg;
 	mutex_exit(&dn->dn_mtx);
 
 	dnode_setdirty(dn, tx);
 }
 
 /*
  * Try to change the block size for the indicated dnode.  This can only
  * succeed if there are no blocks allocated or dirty beyond first block
  */
 int
 dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db;
 	int err;
 
 	ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 	if (size == 0)
 		size = SPA_MINBLOCKSIZE;
 	else
 		size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
 
 	if (ibs == dn->dn_indblkshift)
 		ibs = 0;
 
 	if (size == dn->dn_datablksz && ibs == 0)
 		return (0);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 
 	/* Check for any allocated blocks beyond the first */
 	if (dn->dn_maxblkid != 0)
 		goto fail;
 
 	mutex_enter(&dn->dn_dbufs_mtx);
 	for (db = avl_first(&dn->dn_dbufs); db != NULL;
 	    db = AVL_NEXT(&dn->dn_dbufs, db)) {
 		if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
 		    db->db_blkid != DMU_SPILL_BLKID) {
 			mutex_exit(&dn->dn_dbufs_mtx);
 			goto fail;
 		}
 	}
 	mutex_exit(&dn->dn_dbufs_mtx);
 
 	if (ibs && dn->dn_nlevels != 1)
 		goto fail;
 
 	dnode_setdirty(dn, tx);
 	if (size != dn->dn_datablksz) {
 		/* resize the old block */
 		err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
 		if (err == 0) {
 			dbuf_new_size(db, size, tx);
 		} else if (err != ENOENT) {
 			goto fail;
 		}
 
 		dnode_setdblksz(dn, size);
 		dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = size;
 		if (db)
 			dbuf_rele(db, FTAG);
 	}
 	if (ibs) {
 		dn->dn_indblkshift = ibs;
 		dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
 	}
 
 	rw_exit(&dn->dn_struct_rwlock);
 	return (0);
 
 fail:
 	rw_exit(&dn->dn_struct_rwlock);
 	return (SET_ERROR(ENOTSUP));
 }
 
 static void
 dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx)
 {
 	uint64_t txgoff = tx->tx_txg & TXG_MASK;
 	int old_nlevels = dn->dn_nlevels;
 	dmu_buf_impl_t *db;
 	list_t *list;
 	dbuf_dirty_record_t *new, *dr, *dr_next;
 
 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 
 	ASSERT3U(new_nlevels, >, dn->dn_nlevels);
 	dn->dn_nlevels = new_nlevels;
 
 	ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
 	dn->dn_next_nlevels[txgoff] = new_nlevels;
 
 	/* dirty the left indirects */
 	db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
 	ASSERT(db != NULL);
 	new = dbuf_dirty(db, tx);
 	dbuf_rele(db, FTAG);
 
 	/* transfer the dirty records to the new indirect */
 	mutex_enter(&dn->dn_mtx);
 	mutex_enter(&new->dt.di.dr_mtx);
 	list = &dn->dn_dirty_records[txgoff];
 	for (dr = list_head(list); dr; dr = dr_next) {
 		dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
 
 		IMPLY(dr->dr_dbuf == NULL, old_nlevels == 1);
 		if (dr->dr_dbuf == NULL ||
 		    (dr->dr_dbuf->db_level == old_nlevels - 1 &&
 		    dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
 		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID)) {
 			list_remove(&dn->dn_dirty_records[txgoff], dr);
 			list_insert_tail(&new->dt.di.dr_children, dr);
 			dr->dr_parent = new;
 		}
 	}
 	mutex_exit(&new->dt.di.dr_mtx);
 	mutex_exit(&dn->dn_mtx);
 }
 
 int
 dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx)
 {
 	int ret = 0;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 
 	if (dn->dn_nlevels == nlevels) {
 		ret = 0;
 		goto out;
 	} else if (nlevels < dn->dn_nlevels) {
 		ret = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	dnode_set_nlevels_impl(dn, nlevels, tx);
 
 out:
 	rw_exit(&dn->dn_struct_rwlock);
 	return (ret);
 }
 
 /* read-holding callers must not rely on the lock being continuously held */
 void
 dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read,
     boolean_t force)
 {
 	int epbs, new_nlevels;
 	uint64_t sz;
 
 	ASSERT(blkid != DMU_BONUS_BLKID);
 
 	ASSERT(have_read ?
 	    RW_READ_HELD(&dn->dn_struct_rwlock) :
 	    RW_WRITE_HELD(&dn->dn_struct_rwlock));
 
 	/*
 	 * if we have a read-lock, check to see if we need to do any work
 	 * before upgrading to a write-lock.
 	 */
 	if (have_read) {
 		if (blkid <= dn->dn_maxblkid)
 			return;
 
 		if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
 			rw_exit(&dn->dn_struct_rwlock);
 			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 		}
 	}
 
 	/*
 	 * Raw sends (indicated by the force flag) require that we take the
 	 * given blkid even if the value is lower than the current value.
 	 */
 	if (!force && blkid <= dn->dn_maxblkid)
 		goto out;
 
 	/*
 	 * We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff]
 	 * to indicate that this field is set. This allows us to set the
 	 * maxblkid to 0 on an existing object in dnode_sync().
 	 */
 	dn->dn_maxblkid = blkid;
 	dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] =
 	    blkid | DMU_NEXT_MAXBLKID_SET;
 
 	/*
 	 * Compute the number of levels necessary to support the new maxblkid.
 	 * Raw sends will ensure nlevels is set correctly for us.
 	 */
 	new_nlevels = 1;
 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 	for (sz = dn->dn_nblkptr;
 	    sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
 		new_nlevels++;
 
 	ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS);
 
 	if (!force) {
 		if (new_nlevels > dn->dn_nlevels)
 			dnode_set_nlevels_impl(dn, new_nlevels, tx);
 	} else {
 		ASSERT3U(dn->dn_nlevels, >=, new_nlevels);
 	}
 
 out:
 	if (have_read)
 		rw_downgrade(&dn->dn_struct_rwlock);
 }
 
 static void
 dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
 	if (db != NULL) {
 		dmu_buf_will_dirty(&db->db, tx);
 		dbuf_rele(db, FTAG);
 	}
 }
 
 /*
  * Dirty all the in-core level-1 dbufs in the range specified by start_blkid
  * and end_blkid.
  */
 static void
 dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db_search;
 	dmu_buf_impl_t *db;
 	avl_index_t where;
 
 	db_search = kmem_zalloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
 
 	mutex_enter(&dn->dn_dbufs_mtx);
 
 	db_search->db_level = 1;
 	db_search->db_blkid = start_blkid + 1;
 	db_search->db_state = DB_SEARCH;
 	for (;;) {
 
 		db = avl_find(&dn->dn_dbufs, db_search, &where);
 		if (db == NULL)
 			db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
 
 		if (db == NULL || db->db_level != 1 ||
 		    db->db_blkid >= end_blkid) {
 			break;
 		}
 
 		/*
 		 * Setup the next blkid we want to search for.
 		 */
 		db_search->db_blkid = db->db_blkid + 1;
 		ASSERT3U(db->db_blkid, >=, start_blkid);
 
 		/*
 		 * If the dbuf transitions to DB_EVICTING while we're trying
 		 * to dirty it, then we will be unable to discover it in
 		 * the dbuf hash table. This will result in a call to
 		 * dbuf_create() which needs to acquire the dn_dbufs_mtx
 		 * lock. To avoid a deadlock, we drop the lock before
 		 * dirtying the level-1 dbuf.
 		 */
 		mutex_exit(&dn->dn_dbufs_mtx);
 		dnode_dirty_l1(dn, db->db_blkid, tx);
 		mutex_enter(&dn->dn_dbufs_mtx);
 	}
 
 #ifdef ZFS_DEBUG
 	/*
 	 * Walk all the in-core level-1 dbufs and verify they have been dirtied.
 	 */
 	db_search->db_level = 1;
 	db_search->db_blkid = start_blkid + 1;
 	db_search->db_state = DB_SEARCH;
 	db = avl_find(&dn->dn_dbufs, db_search, &where);
 	if (db == NULL)
 		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
 	for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
 		if (db->db_level != 1 || db->db_blkid >= end_blkid)
 			break;
 		if (db->db_state != DB_EVICTING)
 			ASSERT(db->db_dirtycnt > 0);
 	}
 #endif
 	kmem_free(db_search, sizeof (dmu_buf_impl_t));
 	mutex_exit(&dn->dn_dbufs_mtx);
 }
 
 void
 dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, const void *tag)
 {
 	/*
 	 * Don't set dirtyctx to SYNC if we're just modifying this as we
 	 * initialize the objset.
 	 */
 	if (dn->dn_dirtyctx == DN_UNDIRTIED) {
 		dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 
 		if (ds != NULL) {
 			rrw_enter(&ds->ds_bp_rwlock, RW_READER, tag);
 		}
 		if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
 			if (dmu_tx_is_syncing(tx))
 				dn->dn_dirtyctx = DN_DIRTY_SYNC;
 			else
 				dn->dn_dirtyctx = DN_DIRTY_OPEN;
 			dn->dn_dirtyctx_firstset = tag;
 		}
 		if (ds != NULL) {
 			rrw_exit(&ds->ds_bp_rwlock, tag);
 		}
 	}
 }
 
 static void
 dnode_partial_zero(dnode_t *dn, uint64_t off, uint64_t blkoff, uint64_t len,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db;
 	int res;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), TRUE, FALSE,
 	    FTAG, &db);
 	rw_exit(&dn->dn_struct_rwlock);
 	if (res == 0) {
 		db_lock_type_t dblt;
 		boolean_t dirty;
 
 		dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
 		/* don't dirty if not on disk and not dirty */
 		dirty = !list_is_empty(&db->db_dirty_records) ||
 		    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
 		dmu_buf_unlock_parent(db, dblt, FTAG);
 		if (dirty) {
 			caddr_t data;
 
 			dmu_buf_will_dirty(&db->db, tx);
 			data = db->db.db_data;
 			memset(data + blkoff, 0, len);
 		}
 		dbuf_rele(db, FTAG);
 	}
 }
 
 void
 dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 {
 	uint64_t blkoff, blkid, nblks;
 	int blksz, blkshift, head, tail;
 	int trunc = FALSE;
 	int epbs;
 
 	blksz = dn->dn_datablksz;
 	blkshift = dn->dn_datablkshift;
 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 
 	if (len == DMU_OBJECT_END) {
 		len = UINT64_MAX - off;
 		trunc = TRUE;
 	}
 
 	/*
 	 * First, block align the region to free:
 	 */
 	if (ISP2(blksz)) {
 		head = P2NPHASE(off, blksz);
 		blkoff = P2PHASE(off, blksz);
 		if ((off >> blkshift) > dn->dn_maxblkid)
 			return;
 	} else {
 		ASSERT(dn->dn_maxblkid == 0);
 		if (off == 0 && len >= blksz) {
 			/*
 			 * Freeing the whole block; fast-track this request.
 			 */
 			blkid = 0;
 			nblks = 1;
 			if (dn->dn_nlevels > 1) {
 				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 				dnode_dirty_l1(dn, 0, tx);
 				rw_exit(&dn->dn_struct_rwlock);
 			}
 			goto done;
 		} else if (off >= blksz) {
 			/* Freeing past end-of-data */
 			return;
 		} else {
 			/* Freeing part of the block. */
 			head = blksz - off;
 			ASSERT3U(head, >, 0);
 		}
 		blkoff = off;
 	}
 	/* zero out any partial block data at the start of the range */
 	if (head) {
 		ASSERT3U(blkoff + head, ==, blksz);
 		if (len < head)
 			head = len;
 		dnode_partial_zero(dn, off, blkoff, head, tx);
 		off += head;
 		len -= head;
 	}
 
 	/* If the range was less than one block, we're done */
 	if (len == 0)
 		return;
 
 	/* If the remaining range is past end of file, we're done */
 	if ((off >> blkshift) > dn->dn_maxblkid)
 		return;
 
 	ASSERT(ISP2(blksz));
 	if (trunc)
 		tail = 0;
 	else
 		tail = P2PHASE(len, blksz);
 
 	ASSERT0(P2PHASE(off, blksz));
 	/* zero out any partial block data at the end of the range */
 	if (tail) {
 		if (len < tail)
 			tail = len;
 		dnode_partial_zero(dn, off + len, 0, tail, tx);
 		len -= tail;
 	}
 
 	/* If the range did not include a full block, we are done */
 	if (len == 0)
 		return;
 
 	ASSERT(IS_P2ALIGNED(off, blksz));
 	ASSERT(trunc || IS_P2ALIGNED(len, blksz));
 	blkid = off >> blkshift;
 	nblks = len >> blkshift;
 	if (trunc)
 		nblks += 1;
 
 	/*
 	 * Dirty all the indirect blocks in this range.  Note that only
 	 * the first and last indirect blocks can actually be written
 	 * (if they were partially freed) -- they must be dirtied, even if
 	 * they do not exist on disk yet.  The interior blocks will
 	 * be freed by free_children(), so they will not actually be written.
 	 * Even though these interior blocks will not be written, we
 	 * dirty them for two reasons:
 	 *
 	 *  - It ensures that the indirect blocks remain in memory until
 	 *    syncing context.  (They have already been prefetched by
 	 *    dmu_tx_hold_free(), so we don't have to worry about reading
 	 *    them serially here.)
 	 *
 	 *  - The dirty space accounting will put pressure on the txg sync
 	 *    mechanism to begin syncing, and to delay transactions if there
 	 *    is a large amount of freeing.  Even though these indirect
 	 *    blocks will not be written, we could need to write the same
 	 *    amount of space if we copy the freed BPs into deadlists.
 	 */
 	if (dn->dn_nlevels > 1) {
 		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 		uint64_t first, last;
 
 		first = blkid >> epbs;
 		dnode_dirty_l1(dn, first, tx);
 		if (trunc)
 			last = dn->dn_maxblkid >> epbs;
 		else
 			last = (blkid + nblks - 1) >> epbs;
 		if (last != first)
 			dnode_dirty_l1(dn, last, tx);
 
 		dnode_dirty_l1range(dn, first, last, tx);
 
 		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
 		    SPA_BLKPTRSHIFT;
 		for (uint64_t i = first + 1; i < last; i++) {
 			/*
 			 * Set i to the blockid of the next non-hole
 			 * level-1 indirect block at or after i.  Note
 			 * that dnode_next_offset() operates in terms of
 			 * level-0-equivalent bytes.
 			 */
 			uint64_t ibyte = i << shift;
 			int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
 			    &ibyte, 2, 1, 0);
 			i = ibyte >> shift;
 			if (i >= last)
 				break;
 
 			/*
 			 * Normally we should not see an error, either
 			 * from dnode_next_offset() or dbuf_hold_level()
 			 * (except for ESRCH from dnode_next_offset).
 			 * If there is an i/o error, then when we read
 			 * this block in syncing context, it will use
 			 * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
 			 * to the "failmode" property.  dnode_next_offset()
 			 * doesn't have a flag to indicate MUSTSUCCEED.
 			 */
 			if (err != 0)
 				break;
 
 			dnode_dirty_l1(dn, i, tx);
 		}
 		rw_exit(&dn->dn_struct_rwlock);
 	}
 
 done:
 	/*
 	 * Add this range to the dnode range list.
 	 * We will finish up this free operation in the syncing phase.
 	 */
 	mutex_enter(&dn->dn_mtx);
 	{
 		int txgoff = tx->tx_txg & TXG_MASK;
 		if (dn->dn_free_ranges[txgoff] == NULL) {
 			dn->dn_free_ranges[txgoff] = range_tree_create(NULL,
 			    RANGE_SEG64, NULL, 0, 0);
 		}
 		range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
 		range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
 	}
 	dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
 	    (u_longlong_t)blkid, (u_longlong_t)nblks,
 	    (u_longlong_t)tx->tx_txg);
 	mutex_exit(&dn->dn_mtx);
 
 	dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
 	dnode_setdirty(dn, tx);
 }
 
 static boolean_t
 dnode_spill_freed(dnode_t *dn)
 {
 	int i;
 
 	mutex_enter(&dn->dn_mtx);
 	for (i = 0; i < TXG_SIZE; i++) {
 		if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
 			break;
 	}
 	mutex_exit(&dn->dn_mtx);
 	return (i < TXG_SIZE);
 }
 
 /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
 uint64_t
 dnode_block_freed(dnode_t *dn, uint64_t blkid)
 {
 	int i;
 
 	if (blkid == DMU_BONUS_BLKID)
 		return (FALSE);
 
 	if (dn->dn_free_txg)
 		return (TRUE);
 
 	if (blkid == DMU_SPILL_BLKID)
 		return (dnode_spill_freed(dn));
 
 	mutex_enter(&dn->dn_mtx);
 	for (i = 0; i < TXG_SIZE; i++) {
 		if (dn->dn_free_ranges[i] != NULL &&
 		    range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
 			break;
 	}
 	mutex_exit(&dn->dn_mtx);
 	return (i < TXG_SIZE);
 }
 
 /* call from syncing context when we actually write/free space for this dnode */
 void
 dnode_diduse_space(dnode_t *dn, int64_t delta)
 {
 	uint64_t space;
 	dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
 	    dn, dn->dn_phys,
 	    (u_longlong_t)dn->dn_phys->dn_used,
 	    (longlong_t)delta);
 
 	mutex_enter(&dn->dn_mtx);
 	space = DN_USED_BYTES(dn->dn_phys);
 	if (delta > 0) {
 		ASSERT3U(space + delta, >=, space); /* no overflow */
 	} else {
 		ASSERT3U(space, >=, -delta); /* no underflow */
 	}
 	space += delta;
 	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
 		ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
 		ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
 		dn->dn_phys->dn_used = space >> DEV_BSHIFT;
 	} else {
 		dn->dn_phys->dn_used = space;
 		dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
 	}
 	mutex_exit(&dn->dn_mtx);
 }
 
 /*
  * Scans a block at the indicated "level" looking for a hole or data,
  * depending on 'flags'.
  *
  * If level > 0, then we are scanning an indirect block looking at its
  * pointers.  If level == 0, then we are looking at a block of dnodes.
  *
  * If we don't find what we are looking for in the block, we return ESRCH.
  * Otherwise, return with *offset pointing to the beginning (if searching
  * forwards) or end (if searching backwards) of the range covered by the
  * block pointer we matched on (or dnode).
  *
  * The basic search algorithm used below by dnode_next_offset() is to
  * use this function to search up the block tree (widen the search) until
  * we find something (i.e., we don't return ESRCH) and then search back
  * down the tree (narrow the search) until we reach our original search
  * level.
  */
 static int
 dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
     int lvl, uint64_t blkfill, uint64_t txg)
 {
 	dmu_buf_impl_t *db = NULL;
 	void *data = NULL;
 	uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 	uint64_t epb = 1ULL << epbs;
 	uint64_t minfill, maxfill;
 	boolean_t hole;
 	int i, inc, error, span;
 
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 
 	hole = ((flags & DNODE_FIND_HOLE) != 0);
 	inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
 	ASSERT(txg == 0 || !hole);
 
 	if (lvl == dn->dn_phys->dn_nlevels) {
 		error = 0;
 		epb = dn->dn_phys->dn_nblkptr;
 		data = dn->dn_phys->dn_blkptr;
 	} else {
 		uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
 		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
 		if (error) {
 			if (error != ENOENT)
 				return (error);
 			if (hole)
 				return (0);
 			/*
 			 * This can only happen when we are searching up
 			 * the block tree for data.  We don't really need to
 			 * adjust the offset, as we will just end up looking
 			 * at the pointer to this block in its parent, and its
 			 * going to be unallocated, so we will skip over it.
 			 */
 			return (SET_ERROR(ESRCH));
 		}
 		error = dbuf_read(db, NULL,
 		    DB_RF_CANFAIL | DB_RF_HAVESTRUCT |
 		    DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH);
 		if (error) {
 			dbuf_rele(db, FTAG);
 			return (error);
 		}
 		data = db->db.db_data;
 		rw_enter(&db->db_rwlock, RW_READER);
 	}
 
 	if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
-	    db->db_blkptr->blk_birth <= txg ||
+	    BP_GET_LOGICAL_BIRTH(db->db_blkptr) <= txg ||
 	    BP_IS_HOLE(db->db_blkptr))) {
 		/*
 		 * This can only happen when we are searching up the tree
 		 * and these conditions mean that we need to keep climbing.
 		 */
 		error = SET_ERROR(ESRCH);
 	} else if (lvl == 0) {
 		dnode_phys_t *dnp = data;
 
 		ASSERT(dn->dn_type == DMU_OT_DNODE);
 		ASSERT(!(flags & DNODE_FIND_BACKWARDS));
 
 		for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
 		    i < blkfill; i += dnp[i].dn_extra_slots + 1) {
 			if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
 				break;
 		}
 
 		if (i == blkfill)
 			error = SET_ERROR(ESRCH);
 
 		*offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
 		    (i << DNODE_SHIFT);
 	} else {
 		blkptr_t *bp = data;
 		uint64_t start = *offset;
 		span = (lvl - 1) * epbs + dn->dn_datablkshift;
 		minfill = 0;
 		maxfill = blkfill << ((lvl - 1) * epbs);
 
 		if (hole)
 			maxfill--;
 		else
 			minfill++;
 
 		if (span >= 8 * sizeof (*offset)) {
 			/* This only happens on the highest indirection level */
 			ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1);
 			*offset = 0;
 		} else {
 			*offset = *offset >> span;
 		}
 
 		for (i = BF64_GET(*offset, 0, epbs);
 		    i >= 0 && i < epb; i += inc) {
 			if (BP_GET_FILL(&bp[i]) >= minfill &&
 			    BP_GET_FILL(&bp[i]) <= maxfill &&
-			    (hole || bp[i].blk_birth > txg))
+			    (hole || BP_GET_LOGICAL_BIRTH(&bp[i]) > txg))
 				break;
 			if (inc > 0 || *offset > 0)
 				*offset += inc;
 		}
 
 		if (span >= 8 * sizeof (*offset)) {
 			*offset = start;
 		} else {
 			*offset = *offset << span;
 		}
 
 		if (inc < 0) {
 			/* traversing backwards; position offset at the end */
 			if (span < 8 * sizeof (*offset))
 				*offset = MIN(*offset + (1ULL << span) - 1,
 				    start);
 		} else if (*offset < start) {
 			*offset = start;
 		}
 		if (i < 0 || i >= epb)
 			error = SET_ERROR(ESRCH);
 	}
 
 	if (db != NULL) {
 		rw_exit(&db->db_rwlock);
 		dbuf_rele(db, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Find the next hole, data, or sparse region at or after *offset.
  * The value 'blkfill' tells us how many items we expect to find
  * in an L0 data block; this value is 1 for normal objects,
  * DNODES_PER_BLOCK for the meta dnode, and some fraction of
  * DNODES_PER_BLOCK when searching for sparse regions thereof.
  *
  * Examples:
  *
  * dnode_next_offset(dn, flags, offset, 1, 1, 0);
  *	Finds the next/previous hole/data in a file.
  *	Used in dmu_offset_next().
  *
  * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
  *	Finds the next free/allocated dnode an objset's meta-dnode.
  *	Only finds objects that have new contents since txg (ie.
  *	bonus buffer changes and content removal are ignored).
  *	Used in dmu_object_next().
  *
  * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
  *	Finds the next L2 meta-dnode bp that's at most 1/4 full.
  *	Used in dmu_object_alloc().
  */
 int
 dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
     int minlvl, uint64_t blkfill, uint64_t txg)
 {
 	uint64_t initial_offset = *offset;
 	int lvl, maxlvl;
 	int error = 0;
 
 	if (!(flags & DNODE_FIND_HAVELOCK))
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 	if (dn->dn_phys->dn_nlevels == 0) {
 		error = SET_ERROR(ESRCH);
 		goto out;
 	}
 
 	if (dn->dn_datablkshift == 0) {
 		if (*offset < dn->dn_datablksz) {
 			if (flags & DNODE_FIND_HOLE)
 				*offset = dn->dn_datablksz;
 		} else {
 			error = SET_ERROR(ESRCH);
 		}
 		goto out;
 	}
 
 	maxlvl = dn->dn_phys->dn_nlevels;
 
 	for (lvl = minlvl; lvl <= maxlvl; lvl++) {
 		error = dnode_next_offset_level(dn,
 		    flags, offset, lvl, blkfill, txg);
 		if (error != ESRCH)
 			break;
 	}
 
 	while (error == 0 && --lvl >= minlvl) {
 		error = dnode_next_offset_level(dn,
 		    flags, offset, lvl, blkfill, txg);
 	}
 
 	/*
 	 * There's always a "virtual hole" at the end of the object, even
 	 * if all BP's which physically exist are non-holes.
 	 */
 	if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
 	    minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
 		error = 0;
 	}
 
 	if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
 	    initial_offset < *offset : initial_offset > *offset))
 		error = SET_ERROR(ESRCH);
 out:
 	if (!(flags & DNODE_FIND_HAVELOCK))
 		rw_exit(&dn->dn_struct_rwlock);
 
 	return (error);
 }
 
 #if defined(_KERNEL)
 EXPORT_SYMBOL(dnode_hold);
 EXPORT_SYMBOL(dnode_rele);
 EXPORT_SYMBOL(dnode_set_nlevels);
 EXPORT_SYMBOL(dnode_set_blksz);
 EXPORT_SYMBOL(dnode_free_range);
 EXPORT_SYMBOL(dnode_evict_dbufs);
 EXPORT_SYMBOL(dnode_evict_bonus);
 #endif
 
 ZFS_MODULE_PARAM(zfs, zfs_, default_bs, INT, ZMOD_RW,
 	"Default dnode block shift");
 ZFS_MODULE_PARAM(zfs, zfs_, default_ibs, INT, ZMOD_RW,
 	"Default dnode indirect block shift");
diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c
index 4faefecbadbb..5fd8bc2a2682 100644
--- a/module/zfs/dsl_bookmark.c
+++ b/module/zfs/dsl_bookmark.c
@@ -1,1769 +1,1770 @@
 /*
  * CDDL HEADER START
  *
  * This file and its contents are supplied under the terms of the
  * Common Development and Distribution License ("CDDL"), version 1.0.
  * You may only use this file in accordance with the terms of version
  * 1.0 of the CDDL.
  *
  * A full copy of the text of the CDDL should have accompanied this
  * source.  A copy of the CDDL is also available via the Internet at
  * http://www.illumos.org/license/CDDL.
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  * Copyright 2019, 2020 by Christian Schwarz. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_destroy.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/arc.h>
 #include <sys/zap.h>
 #include <sys/zfeature.h>
 #include <sys/spa.h>
 #include <sys/dsl_bookmark.h>
 #include <zfs_namecheck.h>
 #include <sys/dmu_send.h>
 #include <sys/dbuf.h>
 
 static int
 dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname,
     dsl_dataset_t **dsp, const void *tag, char **shortnamep)
 {
 	char buf[ZFS_MAX_DATASET_NAME_LEN];
 	char *hashp;
 
 	if (strlen(fullname) >= ZFS_MAX_DATASET_NAME_LEN)
 		return (SET_ERROR(ENAMETOOLONG));
 	hashp = strchr(fullname, '#');
 	if (hashp == NULL)
 		return (SET_ERROR(EINVAL));
 
 	*shortnamep = hashp + 1;
 	if (zfs_component_namecheck(*shortnamep, NULL, NULL))
 		return (SET_ERROR(EINVAL));
 	(void) strlcpy(buf, fullname, hashp - fullname + 1);
 	return (dsl_dataset_hold(dp, buf, tag, dsp));
 }
 
 /*
  * When reading BOOKMARK_V1 bookmarks, the BOOKMARK_V2 fields are guaranteed
  * to be zeroed.
  *
  * Returns ESRCH if bookmark is not found.
  * Note, we need to use the ZAP rather than the AVL to look up bookmarks
  * by name, because only the ZAP honors the casesensitivity setting.
  */
 int
 dsl_bookmark_lookup_impl(dsl_dataset_t *ds, const char *shortname,
     zfs_bookmark_phys_t *bmark_phys)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	uint64_t bmark_zapobj = ds->ds_bookmarks_obj;
 	matchtype_t mt = 0;
 	int err;
 
 	if (bmark_zapobj == 0)
 		return (SET_ERROR(ESRCH));
 
 	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
 		mt = MT_NORMALIZE;
 
 	/*
 	 * Zero out the bookmark in case the one stored on disk
 	 * is in an older, shorter format.
 	 */
 	memset(bmark_phys, 0, sizeof (*bmark_phys));
 
 	err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t),
 	    sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt, NULL, 0,
 	    NULL);
 
 	return (err == ENOENT ? SET_ERROR(ESRCH) : err);
 }
 
 /*
  * If later_ds is non-NULL, this will return EXDEV if the specified bookmark
  * does not represents an earlier point in later_ds's timeline.  However,
  * bmp will still be filled in if we return EXDEV.
  *
  * Returns ENOENT if the dataset containing the bookmark does not exist.
  * Returns ESRCH if the dataset exists but the bookmark was not found in it.
  */
 int
 dsl_bookmark_lookup(dsl_pool_t *dp, const char *fullname,
     dsl_dataset_t *later_ds, zfs_bookmark_phys_t *bmp)
 {
 	char *shortname;
 	dsl_dataset_t *ds;
 	int error;
 
 	error = dsl_bookmark_hold_ds(dp, fullname, &ds, FTAG, &shortname);
 	if (error != 0)
 		return (error);
 
 	error = dsl_bookmark_lookup_impl(ds, shortname, bmp);
 	if (error == 0 && later_ds != NULL) {
 		if (!dsl_dataset_is_before(later_ds, ds, bmp->zbm_creation_txg))
 			error = SET_ERROR(EXDEV);
 	}
 	dsl_dataset_rele(ds, FTAG);
 	return (error);
 }
 
 /*
  * Validates that
  * - bmark is a full dataset path of a bookmark (bookmark_namecheck)
  * - source is a full path of a snapshot or bookmark
  *   ({bookmark,snapshot}_namecheck)
  *
  * Returns 0 if valid, -1 otherwise.
  */
 static int
 dsl_bookmark_create_nvl_validate_pair(const char *bmark, const char *source)
 {
 	if (bookmark_namecheck(bmark, NULL, NULL) != 0)
 		return (-1);
 
 	int is_bmark, is_snap;
 	is_bmark = bookmark_namecheck(source, NULL, NULL) == 0;
 	is_snap = snapshot_namecheck(source, NULL, NULL) == 0;
 	if (!is_bmark && !is_snap)
 		return (-1);
 
 	return (0);
 }
 
 /*
  * Check that the given nvlist corresponds to the following schema:
  *  { newbookmark -> source, ... }
  * where
  * - each pair passes dsl_bookmark_create_nvl_validate_pair
  * - all newbookmarks are in the same pool
  * - all newbookmarks have unique names
  *
  * Note that this function is only validates above schema. Callers must ensure
  * that the bookmarks can be created, e.g. that sources exist.
  *
  * Returns 0 if the nvlist adheres to above schema.
  * Returns -1 if it doesn't.
  */
 int
 dsl_bookmark_create_nvl_validate(nvlist_t *bmarks)
 {
 	const char *first = NULL;
 	size_t first_len = 0;
 
 	for (nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(bmarks, pair)) {
 
 		const char *bmark = nvpair_name(pair);
 		const char *source;
 
 		/* list structure: values must be snapshots XOR bookmarks */
 		if (nvpair_value_string(pair, &source) != 0)
 			return (-1);
 		if (dsl_bookmark_create_nvl_validate_pair(bmark, source) != 0)
 			return (-1);
 
 		/* same pool check */
 		if (first == NULL) {
 			const char *cp = strpbrk(bmark, "/#");
 			if (cp == NULL)
 				return (-1);
 			first = bmark;
 			first_len = cp - bmark;
 		}
 		if (strncmp(first, bmark, first_len) != 0)
 			return (-1);
 		switch (*(bmark + first_len)) {
 			case '/': /* fallthrough */
 			case '#':
 				break;
 			default:
 				return (-1);
 		}
 
 		/* unique newbookmark names; todo: O(n^2) */
 		for (nvpair_t *pair2 = nvlist_next_nvpair(bmarks, pair);
 		    pair2 != NULL; pair2 = nvlist_next_nvpair(bmarks, pair2)) {
 			if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0)
 				return (-1);
 		}
 
 	}
 	return (0);
 }
 
 /*
  * expects that newbm and source have been validated using
  * dsl_bookmark_create_nvl_validate_pair
  */
 static int
 dsl_bookmark_create_check_impl(dsl_pool_t *dp,
     const char *newbm, const char *source)
 {
 	ASSERT0(dsl_bookmark_create_nvl_validate_pair(newbm, source));
 	/* defer source namecheck until we know it's a snapshot or bookmark */
 
 	int error;
 	dsl_dataset_t *newbm_ds;
 	char *newbm_short;
 	zfs_bookmark_phys_t bmark_phys;
 
 	error = dsl_bookmark_hold_ds(dp, newbm, &newbm_ds, FTAG, &newbm_short);
 	if (error != 0)
 		return (error);
 
 	/* Verify that the new bookmark does not already exist */
 	error = dsl_bookmark_lookup_impl(newbm_ds, newbm_short, &bmark_phys);
 	switch (error) {
 	case ESRCH:
 		/* happy path: new bmark doesn't exist, proceed after switch */
 		break;
 	case 0:
 		error = SET_ERROR(EEXIST);
 		goto eholdnewbmds;
 	default:
 		/* dsl_bookmark_lookup_impl already did SET_ERROR */
 		goto eholdnewbmds;
 	}
 
 	/* error is retval of the following if-cascade */
 	if (strchr(source, '@') != NULL) {
 		dsl_dataset_t *source_snap_ds;
 		ASSERT3S(snapshot_namecheck(source, NULL, NULL), ==, 0);
 		error = dsl_dataset_hold(dp, source, FTAG, &source_snap_ds);
 		if (error == 0) {
 			VERIFY(source_snap_ds->ds_is_snapshot);
 			/*
 			 * Verify that source snapshot is an earlier point in
 			 * newbm_ds's timeline (source may be newbm_ds's origin)
 			 */
 			if (!dsl_dataset_is_before(newbm_ds, source_snap_ds, 0))
 				error = SET_ERROR(
 				    ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR);
 			dsl_dataset_rele(source_snap_ds, FTAG);
 		}
 	} else if (strchr(source, '#') != NULL) {
 		zfs_bookmark_phys_t source_phys;
 		ASSERT3S(bookmark_namecheck(source, NULL, NULL), ==, 0);
 		/*
 		 * Source must exists and be an earlier point in newbm_ds's
 		 * timeline (newbm_ds's origin may be a snap of source's ds)
 		 */
 		error = dsl_bookmark_lookup(dp, source, newbm_ds, &source_phys);
 		switch (error) {
 		case 0:
 			break; /* happy path */
 		case EXDEV:
 			error = SET_ERROR(ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR);
 			break;
 		default:
 			/* dsl_bookmark_lookup already did SET_ERROR */
 			break;
 		}
 	} else {
 		/*
 		 * dsl_bookmark_create_nvl_validate validates that source is
 		 * either snapshot or bookmark
 		 */
 		panic("unreachable code: %s", source);
 	}
 
 eholdnewbmds:
 	dsl_dataset_rele(newbm_ds, FTAG);
 	return (error);
 }
 
 int
 dsl_bookmark_create_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_bookmark_create_arg_t *dbca = arg;
 	int rv = 0;
 	int schema_err = 0;
 	ASSERT3P(dbca, !=, NULL);
 	ASSERT3P(dbca->dbca_bmarks, !=, NULL);
 	/* dbca->dbca_errors is allowed to be NULL */
 
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 
 	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS))
 		return (SET_ERROR(ENOTSUP));
 
 	if (dsl_bookmark_create_nvl_validate(dbca->dbca_bmarks) != 0)
 		rv = schema_err = SET_ERROR(EINVAL);
 
 	for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) {
 		const char *new = nvpair_name(pair);
 
 		int error = schema_err;
 		if (error == 0) {
 			const char *source = fnvpair_value_string(pair);
 			error = dsl_bookmark_create_check_impl(dp, new, source);
 			if (error != 0)
 				error = SET_ERROR(error);
 		}
 
 		if (error != 0) {
 			rv = error;
 			if (dbca->dbca_errors != NULL)
 				fnvlist_add_int32(dbca->dbca_errors,
 				    new, error);
 		}
 	}
 
 	return (rv);
 }
 
 static dsl_bookmark_node_t *
 dsl_bookmark_node_alloc(char *shortname)
 {
 	dsl_bookmark_node_t *dbn = kmem_alloc(sizeof (*dbn), KM_SLEEP);
 	dbn->dbn_name = spa_strdup(shortname);
 	dbn->dbn_dirty = B_FALSE;
 	mutex_init(&dbn->dbn_lock, NULL, MUTEX_DEFAULT, NULL);
 	return (dbn);
 }
 
 /*
  * Set the fields in the zfs_bookmark_phys_t based on the specified snapshot.
  */
 static void
 dsl_bookmark_set_phys(zfs_bookmark_phys_t *zbm, dsl_dataset_t *snap)
 {
 	spa_t *spa = dsl_dataset_get_spa(snap);
 	objset_t *mos = spa_get_dsl(spa)->dp_meta_objset;
 	dsl_dataset_phys_t *dsp = dsl_dataset_phys(snap);
 
 	memset(zbm, 0, sizeof (zfs_bookmark_phys_t));
 	zbm->zbm_guid = dsp->ds_guid;
 	zbm->zbm_creation_txg = dsp->ds_creation_txg;
 	zbm->zbm_creation_time = dsp->ds_creation_time;
 	zbm->zbm_redaction_obj = 0;
 
 	/*
 	 * If the dataset is encrypted create a larger bookmark to
 	 * accommodate the IVset guid. The IVset guid was added
 	 * after the encryption feature to prevent a problem with
 	 * raw sends. If we encounter an encrypted dataset without
 	 * an IVset guid we fall back to a normal bookmark.
 	 */
 	if (snap->ds_dir->dd_crypto_obj != 0 &&
 	    spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) {
 		(void) zap_lookup(mos, snap->ds_object,
 		    DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1,
 		    &zbm->zbm_ivset_guid);
 	}
 
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_WRITTEN)) {
 		zbm->zbm_flags = ZBM_FLAG_SNAPSHOT_EXISTS | ZBM_FLAG_HAS_FBN;
 		zbm->zbm_referenced_bytes_refd = dsp->ds_referenced_bytes;
 		zbm->zbm_compressed_bytes_refd = dsp->ds_compressed_bytes;
 		zbm->zbm_uncompressed_bytes_refd = dsp->ds_uncompressed_bytes;
 
 		dsl_dataset_t *nextds;
 		VERIFY0(dsl_dataset_hold_obj(snap->ds_dir->dd_pool,
 		    dsp->ds_next_snap_obj, FTAG, &nextds));
 		dsl_deadlist_space(&nextds->ds_deadlist,
 		    &zbm->zbm_referenced_freed_before_next_snap,
 		    &zbm->zbm_compressed_freed_before_next_snap,
 		    &zbm->zbm_uncompressed_freed_before_next_snap);
 		dsl_dataset_rele(nextds, FTAG);
 	}
 }
 
 /*
  * Add dsl_bookmark_node_t `dbn` to the given dataset and increment appropriate
  * SPA feature counters.
  */
 void
 dsl_bookmark_node_add(dsl_dataset_t *hds, dsl_bookmark_node_t *dbn,
     dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	objset_t *mos = dp->dp_meta_objset;
 
 	if (hds->ds_bookmarks_obj == 0) {
 		hds->ds_bookmarks_obj = zap_create_norm(mos,
 		    U8_TEXTPREP_TOUPPER, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0,
 		    tx);
 		spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
 
 		dsl_dataset_zapify(hds, tx);
 		VERIFY0(zap_add(mos, hds->ds_object,
 		    DS_FIELD_BOOKMARK_NAMES,
 		    sizeof (hds->ds_bookmarks_obj), 1,
 		    &hds->ds_bookmarks_obj, tx));
 	}
 
 	avl_add(&hds->ds_bookmarks, dbn);
 
 	/*
 	 * To maintain backwards compatibility with software that doesn't
 	 * understand SPA_FEATURE_BOOKMARK_V2, we need to use the smallest
 	 * possible bookmark size.
 	 */
 	uint64_t bookmark_phys_size = BOOKMARK_PHYS_SIZE_V1;
 	if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2) &&
 	    (dbn->dbn_phys.zbm_ivset_guid != 0 || dbn->dbn_phys.zbm_flags &
 	    ZBM_FLAG_HAS_FBN || dbn->dbn_phys.zbm_redaction_obj != 0)) {
 		bookmark_phys_size = BOOKMARK_PHYS_SIZE_V2;
 		spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2, tx);
 	}
 
 	zfs_bookmark_phys_t zero_phys = { 0 };
 	ASSERT0(memcmp(((char *)&dbn->dbn_phys) + bookmark_phys_size,
 	    &zero_phys, sizeof (zfs_bookmark_phys_t) - bookmark_phys_size));
 
 	VERIFY0(zap_add(mos, hds->ds_bookmarks_obj, dbn->dbn_name,
 	    sizeof (uint64_t), bookmark_phys_size / sizeof (uint64_t),
 	    &dbn->dbn_phys, tx));
 }
 
 /*
  * If redaction_list is non-null, we create a redacted bookmark and redaction
  * list, and store the object number of the redaction list in redact_obj.
  */
 static void
 dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot,
     dmu_tx_t *tx, uint64_t num_redact_snaps, uint64_t *redact_snaps,
     const void *tag, redaction_list_t **redaction_list)
 {
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	objset_t *mos = dp->dp_meta_objset;
 	dsl_dataset_t *snapds, *bmark_fs;
 	char *shortname;
 	boolean_t bookmark_redacted;
 	uint64_t *dsredactsnaps;
 	uint64_t dsnumsnaps;
 
 	VERIFY0(dsl_dataset_hold(dp, snapshot, FTAG, &snapds));
 	VERIFY0(dsl_bookmark_hold_ds(dp, bookmark, &bmark_fs, FTAG,
 	    &shortname));
 
 	dsl_bookmark_node_t *dbn = dsl_bookmark_node_alloc(shortname);
 	dsl_bookmark_set_phys(&dbn->dbn_phys, snapds);
 
 	bookmark_redacted = dsl_dataset_get_uint64_array_feature(snapds,
 	    SPA_FEATURE_REDACTED_DATASETS, &dsnumsnaps, &dsredactsnaps);
 	if (redaction_list != NULL || bookmark_redacted) {
 		redaction_list_t *local_rl;
 		boolean_t spill = B_FALSE;
 		if (bookmark_redacted) {
 			redact_snaps = dsredactsnaps;
 			num_redact_snaps = dsnumsnaps;
 		}
 		int bonuslen = sizeof (redaction_list_phys_t) +
 		    num_redact_snaps * sizeof (uint64_t);
 		if (bonuslen > dmu_bonus_max())
 			spill = B_TRUE;
 		dbn->dbn_phys.zbm_redaction_obj = dmu_object_alloc(mos,
 		    DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
 		    DMU_OTN_UINT64_METADATA, spill ? 0 : bonuslen, tx);
 		spa_feature_incr(dp->dp_spa,
 		    SPA_FEATURE_REDACTION_BOOKMARKS, tx);
 		if (spill) {
 			spa_feature_incr(dp->dp_spa,
 			    SPA_FEATURE_REDACTION_LIST_SPILL, tx);
 		}
 
 		VERIFY0(dsl_redaction_list_hold_obj(dp,
 		    dbn->dbn_phys.zbm_redaction_obj, tag, &local_rl));
 		dsl_redaction_list_long_hold(dp, local_rl, tag);
 
 		if (!spill) {
 			ASSERT3U(local_rl->rl_bonus->db_size, >=, bonuslen);
 			dmu_buf_will_dirty(local_rl->rl_bonus, tx);
 		} else {
 			dmu_buf_t *db;
 			VERIFY0(dmu_spill_hold_by_bonus(local_rl->rl_bonus,
 			    DB_RF_MUST_SUCCEED, FTAG, &db));
 			dmu_buf_will_fill(db, tx, B_FALSE);
 			VERIFY0(dbuf_spill_set_blksz(db, P2ROUNDUP(bonuslen,
 			    SPA_MINBLOCKSIZE), tx));
 			local_rl->rl_phys = db->db_data;
 			local_rl->rl_dbuf = db;
 		}
 		memcpy(local_rl->rl_phys->rlp_snaps, redact_snaps,
 		    sizeof (uint64_t) * num_redact_snaps);
 		local_rl->rl_phys->rlp_num_snaps = num_redact_snaps;
 		if (bookmark_redacted) {
 			ASSERT3P(redaction_list, ==, NULL);
 			local_rl->rl_phys->rlp_last_blkid = UINT64_MAX;
 			local_rl->rl_phys->rlp_last_object = UINT64_MAX;
 			dsl_redaction_list_long_rele(local_rl, tag);
 			dsl_redaction_list_rele(local_rl, tag);
 		} else {
 			*redaction_list = local_rl;
 		}
 	}
 
 	if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) {
 		spa_feature_incr(dp->dp_spa,
 		    SPA_FEATURE_BOOKMARK_WRITTEN, tx);
 	}
 
 	dsl_bookmark_node_add(bmark_fs, dbn, tx);
 
 	spa_history_log_internal_ds(bmark_fs, "bookmark", tx,
 	    "name=%s creation_txg=%llu target_snap=%llu redact_obj=%llu",
 	    shortname, (longlong_t)dbn->dbn_phys.zbm_creation_txg,
 	    (longlong_t)snapds->ds_object,
 	    (longlong_t)dbn->dbn_phys.zbm_redaction_obj);
 
 	dsl_dataset_rele(bmark_fs, FTAG);
 	dsl_dataset_rele(snapds, FTAG);
 }
 
 
 static void
 dsl_bookmark_create_sync_impl_book(
     const char *new_name, const char *source_name, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *bmark_fs_source, *bmark_fs_new;
 	char *source_shortname, *new_shortname;
 	zfs_bookmark_phys_t source_phys;
 
 	VERIFY0(dsl_bookmark_hold_ds(dp, source_name, &bmark_fs_source, FTAG,
 	    &source_shortname));
 	VERIFY0(dsl_bookmark_hold_ds(dp, new_name, &bmark_fs_new, FTAG,
 	    &new_shortname));
 
 	/*
 	 * create a copy of the source bookmark by copying most of its members
 	 *
 	 * Caveat: bookmarking a redaction bookmark yields a normal bookmark
 	 * -----------------------------------------------------------------
 	 * Reasoning:
 	 * - The zbm_redaction_obj would be referred to by both source and new
 	 *   bookmark, but would be destroyed once either source or new is
 	 *   destroyed, resulting in use-after-free of the referred object.
 	 * - User expectation when issuing the `zfs bookmark` command is that
 	 *   a normal bookmark of the source is created
 	 *
 	 * Design Alternatives For Full Redaction Bookmark Copying:
 	 * - reference-count the redaction object => would require on-disk
 	 *   format change for existing redaction objects
 	 * - Copy the redaction object => cannot be done in syncing context
 	 *   because the redaction object might be too large
 	 */
 
 	VERIFY0(dsl_bookmark_lookup_impl(bmark_fs_source, source_shortname,
 	    &source_phys));
 	dsl_bookmark_node_t *new_dbn = dsl_bookmark_node_alloc(new_shortname);
 
 	memcpy(&new_dbn->dbn_phys, &source_phys, sizeof (source_phys));
 	new_dbn->dbn_phys.zbm_redaction_obj = 0;
 
 	/* update feature counters */
 	if (new_dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) {
 		spa_feature_incr(dp->dp_spa,
 		    SPA_FEATURE_BOOKMARK_WRITTEN, tx);
 	}
 	/* no need for redaction bookmark counter; nulled zbm_redaction_obj */
 	/* dsl_bookmark_node_add bumps bookmarks and v2-bookmarks counter */
 
 	/*
 	 * write new bookmark
 	 *
 	 * Note that dsl_bookmark_lookup_impl guarantees that, if source is a
 	 * v1 bookmark, the v2-only fields are zeroed.
 	 * And dsl_bookmark_node_add writes back a v1-sized bookmark if
 	 * v2 bookmarks are disabled and/or v2-only fields are zeroed.
 	 * => bookmark copying works on pre-bookmark-v2 pools
 	 */
 	dsl_bookmark_node_add(bmark_fs_new, new_dbn, tx);
 
 	spa_history_log_internal_ds(bmark_fs_source, "bookmark", tx,
 	    "name=%s creation_txg=%llu source_guid=%llu",
 	    new_shortname, (longlong_t)new_dbn->dbn_phys.zbm_creation_txg,
 	    (longlong_t)source_phys.zbm_guid);
 
 	dsl_dataset_rele(bmark_fs_source, FTAG);
 	dsl_dataset_rele(bmark_fs_new, FTAG);
 }
 
 void
 dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_bookmark_create_arg_t *dbca = arg;
 
 	ASSERT(spa_feature_is_enabled(dmu_tx_pool(tx)->dp_spa,
 	    SPA_FEATURE_BOOKMARKS));
 
 	for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) {
 
 		const char *new = nvpair_name(pair);
 		const char *source = fnvpair_value_string(pair);
 
 		if (strchr(source, '@') != NULL) {
 			dsl_bookmark_create_sync_impl_snap(new, source, tx,
 			    0, NULL, NULL, NULL);
 		} else if (strchr(source, '#') != NULL) {
 			dsl_bookmark_create_sync_impl_book(new, source, tx);
 		} else {
 			panic("unreachable code");
 		}
 
 	}
 }
 
 /*
  * The bookmarks must all be in the same pool.
  */
 int
 dsl_bookmark_create(nvlist_t *bmarks, nvlist_t *errors)
 {
 	nvpair_t *pair;
 	dsl_bookmark_create_arg_t dbca;
 
 	pair = nvlist_next_nvpair(bmarks, NULL);
 	if (pair == NULL)
 		return (0);
 
 	dbca.dbca_bmarks = bmarks;
 	dbca.dbca_errors = errors;
 
 	return (dsl_sync_task(nvpair_name(pair), dsl_bookmark_create_check,
 	    dsl_bookmark_create_sync, &dbca,
 	    fnvlist_num_pairs(bmarks), ZFS_SPACE_CHECK_NORMAL));
 }
 
 static int
 dsl_bookmark_create_redacted_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_bookmark_create_redacted_arg_t *dbcra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	int rv = 0;
 
 	if (!spa_feature_is_enabled(dp->dp_spa,
 	    SPA_FEATURE_REDACTION_BOOKMARKS))
 		return (SET_ERROR(ENOTSUP));
 	/*
 	 * If the list of redact snaps will not fit in the bonus buffer (or
 	 * spill block, with the REDACTION_LIST_SPILL feature) with the
 	 * furthest reached object and offset, fail.
 	 */
 	uint64_t snaplimit = ((spa_feature_is_enabled(dp->dp_spa,
 	    SPA_FEATURE_REDACTION_LIST_SPILL) ? spa_maxblocksize(dp->dp_spa) :
 	    dmu_bonus_max()) -
 	    sizeof (redaction_list_phys_t)) / sizeof (uint64_t);
 	if (dbcra->dbcra_numsnaps > snaplimit)
 		return (SET_ERROR(E2BIG));
 
 	if (dsl_bookmark_create_nvl_validate_pair(
 	    dbcra->dbcra_bmark, dbcra->dbcra_snap) != 0)
 		return (SET_ERROR(EINVAL));
 
 	rv = dsl_bookmark_create_check_impl(dp,
 	    dbcra->dbcra_bmark, dbcra->dbcra_snap);
 	return (rv);
 }
 
 static void
 dsl_bookmark_create_redacted_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_bookmark_create_redacted_arg_t *dbcra = arg;
 	dsl_bookmark_create_sync_impl_snap(dbcra->dbcra_bmark,
 	    dbcra->dbcra_snap, tx, dbcra->dbcra_numsnaps, dbcra->dbcra_snaps,
 	    dbcra->dbcra_tag, dbcra->dbcra_rl);
 }
 
 int
 dsl_bookmark_create_redacted(const char *bookmark, const char *snapshot,
     uint64_t numsnaps, uint64_t *snapguids, const void *tag,
     redaction_list_t **rl)
 {
 	dsl_bookmark_create_redacted_arg_t dbcra;
 
 	dbcra.dbcra_bmark = bookmark;
 	dbcra.dbcra_snap = snapshot;
 	dbcra.dbcra_rl = rl;
 	dbcra.dbcra_numsnaps = numsnaps;
 	dbcra.dbcra_snaps = snapguids;
 	dbcra.dbcra_tag = tag;
 
 	return (dsl_sync_task(bookmark, dsl_bookmark_create_redacted_check,
 	    dsl_bookmark_create_redacted_sync, &dbcra, 5,
 	    ZFS_SPACE_CHECK_NORMAL));
 }
 
 /*
  * Retrieve the list of properties given in the 'props' nvlist for a bookmark.
  * If 'props' is NULL, retrieves all properties.
  */
 static void
 dsl_bookmark_fetch_props(dsl_pool_t *dp, zfs_bookmark_phys_t *bmark_phys,
     nvlist_t *props, nvlist_t *out_props)
 {
 	ASSERT3P(dp, !=, NULL);
 	ASSERT3P(bmark_phys, !=, NULL);
 	ASSERT3P(out_props, !=, NULL);
 	ASSERT(RRW_LOCK_HELD(&dp->dp_config_rwlock));
 
 	if (props == NULL || nvlist_exists(props,
 	    zfs_prop_to_name(ZFS_PROP_GUID))) {
 		dsl_prop_nvlist_add_uint64(out_props,
 		    ZFS_PROP_GUID, bmark_phys->zbm_guid);
 	}
 	if (props == NULL || nvlist_exists(props,
 	    zfs_prop_to_name(ZFS_PROP_CREATETXG))) {
 		dsl_prop_nvlist_add_uint64(out_props,
 		    ZFS_PROP_CREATETXG, bmark_phys->zbm_creation_txg);
 	}
 	if (props == NULL || nvlist_exists(props,
 	    zfs_prop_to_name(ZFS_PROP_CREATION))) {
 		dsl_prop_nvlist_add_uint64(out_props,
 		    ZFS_PROP_CREATION, bmark_phys->zbm_creation_time);
 	}
 	if (props == NULL || nvlist_exists(props,
 	    zfs_prop_to_name(ZFS_PROP_IVSET_GUID))) {
 		dsl_prop_nvlist_add_uint64(out_props,
 		    ZFS_PROP_IVSET_GUID, bmark_phys->zbm_ivset_guid);
 	}
 	if (bmark_phys->zbm_flags & ZBM_FLAG_HAS_FBN) {
 		if (props == NULL || nvlist_exists(props,
 		    zfs_prop_to_name(ZFS_PROP_REFERENCED))) {
 			dsl_prop_nvlist_add_uint64(out_props,
 			    ZFS_PROP_REFERENCED,
 			    bmark_phys->zbm_referenced_bytes_refd);
 		}
 		if (props == NULL || nvlist_exists(props,
 		    zfs_prop_to_name(ZFS_PROP_LOGICALREFERENCED))) {
 			dsl_prop_nvlist_add_uint64(out_props,
 			    ZFS_PROP_LOGICALREFERENCED,
 			    bmark_phys->zbm_uncompressed_bytes_refd);
 		}
 		if (props == NULL || nvlist_exists(props,
 		    zfs_prop_to_name(ZFS_PROP_REFRATIO))) {
 			uint64_t ratio =
 			    bmark_phys->zbm_compressed_bytes_refd == 0 ? 100 :
 			    bmark_phys->zbm_uncompressed_bytes_refd * 100 /
 			    bmark_phys->zbm_compressed_bytes_refd;
 			dsl_prop_nvlist_add_uint64(out_props,
 			    ZFS_PROP_REFRATIO, ratio);
 		}
 	}
 
 	if ((props == NULL || nvlist_exists(props, "redact_snaps") ||
 	    nvlist_exists(props, "redact_complete")) &&
 	    bmark_phys->zbm_redaction_obj != 0) {
 		redaction_list_t *rl;
 		int err = dsl_redaction_list_hold_obj(dp,
 		    bmark_phys->zbm_redaction_obj, FTAG, &rl);
 		if (err == 0) {
 			if (nvlist_exists(props, "redact_snaps")) {
 				nvlist_t *nvl;
 				nvl = fnvlist_alloc();
 				fnvlist_add_uint64_array(nvl, ZPROP_VALUE,
 				    rl->rl_phys->rlp_snaps,
 				    rl->rl_phys->rlp_num_snaps);
 				fnvlist_add_nvlist(out_props, "redact_snaps",
 				    nvl);
 				nvlist_free(nvl);
 			}
 			if (nvlist_exists(props, "redact_complete")) {
 				nvlist_t *nvl;
 				nvl = fnvlist_alloc();
 				fnvlist_add_boolean_value(nvl, ZPROP_VALUE,
 				    rl->rl_phys->rlp_last_blkid == UINT64_MAX &&
 				    rl->rl_phys->rlp_last_object == UINT64_MAX);
 				fnvlist_add_nvlist(out_props, "redact_complete",
 				    nvl);
 				nvlist_free(nvl);
 			}
 			dsl_redaction_list_rele(rl, FTAG);
 		}
 	}
 }
 
 int
 dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 
 	ASSERT(dsl_pool_config_held(dp));
 
 	if (dsl_dataset_is_snapshot(ds))
 		return (SET_ERROR(EINVAL));
 
 	for (dsl_bookmark_node_t *dbn = avl_first(&ds->ds_bookmarks);
 	    dbn != NULL; dbn = AVL_NEXT(&ds->ds_bookmarks, dbn)) {
 		nvlist_t *out_props = fnvlist_alloc();
 
 		dsl_bookmark_fetch_props(dp, &dbn->dbn_phys, props, out_props);
 
 		fnvlist_add_nvlist(outnvl, dbn->dbn_name, out_props);
 		fnvlist_free(out_props);
 	}
 	return (0);
 }
 
 /*
  * Comparison func for ds_bookmarks AVL tree.  We sort the bookmarks by
  * their TXG, then by their FBN-ness.  The "FBN-ness" component ensures
  * that all bookmarks at the same TXG that HAS_FBN are adjacent, which
  * dsl_bookmark_destroy_sync_impl() depends on.  Note that there may be
  * multiple bookmarks at the same TXG (with the same FBN-ness).  In this
  * case we differentiate them by an arbitrary metric (in this case,
  * their names).
  */
 static int
 dsl_bookmark_compare(const void *l, const void *r)
 {
 	const dsl_bookmark_node_t *ldbn = l;
 	const dsl_bookmark_node_t *rdbn = r;
 
 	int64_t cmp = TREE_CMP(ldbn->dbn_phys.zbm_creation_txg,
 	    rdbn->dbn_phys.zbm_creation_txg);
 	if (likely(cmp))
 		return (cmp);
 	cmp = TREE_CMP((ldbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN),
 	    (rdbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN));
 	if (likely(cmp))
 		return (cmp);
 	cmp = strcmp(ldbn->dbn_name, rdbn->dbn_name);
 	return (TREE_ISIGN(cmp));
 }
 
 /*
  * Cache this (head) dataset's bookmarks in the ds_bookmarks AVL tree.
  */
 int
 dsl_bookmark_init_ds(dsl_dataset_t *ds)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 
 	ASSERT(!ds->ds_is_snapshot);
 
 	avl_create(&ds->ds_bookmarks, dsl_bookmark_compare,
 	    sizeof (dsl_bookmark_node_t),
 	    offsetof(dsl_bookmark_node_t, dbn_node));
 
 	if (!dsl_dataset_is_zapified(ds))
 		return (0);
 
 	int zaperr = zap_lookup(mos, ds->ds_object, DS_FIELD_BOOKMARK_NAMES,
 	    sizeof (ds->ds_bookmarks_obj), 1, &ds->ds_bookmarks_obj);
 	if (zaperr == ENOENT)
 		return (0);
 	if (zaperr != 0)
 		return (zaperr);
 
 	if (ds->ds_bookmarks_obj == 0)
 		return (0);
 
 	int err = 0;
 	zap_cursor_t zc;
 	zap_attribute_t attr;
 
 	for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj);
 	    (err = zap_cursor_retrieve(&zc, &attr)) == 0;
 	    zap_cursor_advance(&zc)) {
 		dsl_bookmark_node_t *dbn =
 		    dsl_bookmark_node_alloc(attr.za_name);
 
 		err = dsl_bookmark_lookup_impl(ds,
 		    dbn->dbn_name, &dbn->dbn_phys);
 		ASSERT3U(err, !=, ENOENT);
 		if (err != 0) {
 			kmem_free(dbn, sizeof (*dbn));
 			break;
 		}
 		avl_add(&ds->ds_bookmarks, dbn);
 	}
 	zap_cursor_fini(&zc);
 	if (err == ENOENT)
 		err = 0;
 	return (err);
 }
 
 void
 dsl_bookmark_fini_ds(dsl_dataset_t *ds)
 {
 	void *cookie = NULL;
 	dsl_bookmark_node_t *dbn;
 
 	if (ds->ds_is_snapshot)
 		return;
 
 	while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) != NULL) {
 		spa_strfree(dbn->dbn_name);
 		mutex_destroy(&dbn->dbn_lock);
 		kmem_free(dbn, sizeof (*dbn));
 	}
 	avl_destroy(&ds->ds_bookmarks);
 }
 
 /*
  * Retrieve the bookmarks that exist in the specified dataset, and the
  * requested properties of each bookmark.
  *
  * The "props" nvlist specifies which properties are requested.
  * See lzc_get_bookmarks() for the list of valid properties.
  */
 int
 dsl_get_bookmarks(const char *dsname, nvlist_t *props, nvlist_t *outnvl)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	int err;
 
 	err = dsl_pool_hold(dsname, FTAG, &dp);
 	if (err != 0)
 		return (err);
 	err = dsl_dataset_hold(dp, dsname, FTAG, &ds);
 	if (err != 0) {
 		dsl_pool_rele(dp, FTAG);
 		return (err);
 	}
 
 	err = dsl_get_bookmarks_impl(ds, props, outnvl);
 
 	dsl_dataset_rele(ds, FTAG);
 	dsl_pool_rele(dp, FTAG);
 	return (err);
 }
 
 /*
  * Retrieve all properties for a single bookmark in the given dataset.
  */
 int
 dsl_get_bookmark_props(const char *dsname, const char *bmname, nvlist_t *props)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	zfs_bookmark_phys_t bmark_phys = { 0 };
 	int err;
 
 	err = dsl_pool_hold(dsname, FTAG, &dp);
 	if (err != 0)
 		return (err);
 	err = dsl_dataset_hold(dp, dsname, FTAG, &ds);
 	if (err != 0) {
 		dsl_pool_rele(dp, FTAG);
 		return (err);
 	}
 
 	err = dsl_bookmark_lookup_impl(ds, bmname, &bmark_phys);
 	if (err != 0)
 		goto out;
 
 	dsl_bookmark_fetch_props(dp, &bmark_phys, NULL, props);
 out:
 	dsl_dataset_rele(ds, FTAG);
 	dsl_pool_rele(dp, FTAG);
 	return (err);
 }
 
 typedef struct dsl_bookmark_destroy_arg {
 	nvlist_t *dbda_bmarks;
 	nvlist_t *dbda_success;
 	nvlist_t *dbda_errors;
 } dsl_bookmark_destroy_arg_t;
 
 static void
 dsl_bookmark_destroy_sync_impl(dsl_dataset_t *ds, const char *name,
     dmu_tx_t *tx)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	uint64_t bmark_zapobj = ds->ds_bookmarks_obj;
 	matchtype_t mt = 0;
 	uint64_t int_size, num_ints;
 	/*
 	 * 'search' must be zeroed so that dbn_flags (which is used in
 	 * dsl_bookmark_compare()) will be zeroed even if the on-disk
 	 * (in ZAP) bookmark is shorter than offsetof(dbn_flags).
 	 */
 	dsl_bookmark_node_t search = { 0 };
 	char realname[ZFS_MAX_DATASET_NAME_LEN];
 
 	/*
 	 * Find the real name of this bookmark, which may be different
 	 * from the given name if the dataset is case-insensitive.  Then
 	 * use the real name to find the node in the ds_bookmarks AVL tree.
 	 */
 
 	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
 		mt = MT_NORMALIZE;
 
 	VERIFY0(zap_length(mos, bmark_zapobj, name, &int_size, &num_ints));
 
 	ASSERT3U(int_size, ==, sizeof (uint64_t));
 
 	if (num_ints * int_size > BOOKMARK_PHYS_SIZE_V1) {
 		spa_feature_decr(dmu_objset_spa(mos),
 		    SPA_FEATURE_BOOKMARK_V2, tx);
 	}
 	VERIFY0(zap_lookup_norm(mos, bmark_zapobj, name, sizeof (uint64_t),
 	    num_ints, &search.dbn_phys, mt, realname, sizeof (realname), NULL));
 
 	search.dbn_name = realname;
 	dsl_bookmark_node_t *dbn = avl_find(&ds->ds_bookmarks, &search, NULL);
 	ASSERT(dbn != NULL);
 
 	if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) {
 		/*
 		 * If this bookmark HAS_FBN, and it is before the most
 		 * recent snapshot, then its TXG is a key in the head's
 		 * deadlist (and all clones' heads' deadlists).  If this is
 		 * the last thing keeping the key (i.e. there are no more
 		 * bookmarks with HAS_FBN at this TXG, and there is no
 		 * snapshot at this TXG), then remove the key.
 		 *
 		 * Note that this algorithm depends on ds_bookmarks being
 		 * sorted such that all bookmarks at the same TXG with
 		 * HAS_FBN are adjacent (with no non-HAS_FBN bookmarks
 		 * at the same TXG in between them).  If this were not
 		 * the case, we would need to examine *all* bookmarks
 		 * at this TXG, rather than just the adjacent ones.
 		 */
 
 		dsl_bookmark_node_t *dbn_prev =
 		    AVL_PREV(&ds->ds_bookmarks, dbn);
 		dsl_bookmark_node_t *dbn_next =
 		    AVL_NEXT(&ds->ds_bookmarks, dbn);
 
 		boolean_t more_bookmarks_at_this_txg =
 		    (dbn_prev != NULL && dbn_prev->dbn_phys.zbm_creation_txg ==
 		    dbn->dbn_phys.zbm_creation_txg &&
 		    (dbn_prev->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) ||
 		    (dbn_next != NULL && dbn_next->dbn_phys.zbm_creation_txg ==
 		    dbn->dbn_phys.zbm_creation_txg &&
 		    (dbn_next->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN));
 
 		if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_SNAPSHOT_EXISTS) &&
 		    !more_bookmarks_at_this_txg &&
 		    dbn->dbn_phys.zbm_creation_txg <
 		    dsl_dataset_phys(ds)->ds_prev_snap_txg) {
 			dsl_dir_remove_clones_key(ds->ds_dir,
 			    dbn->dbn_phys.zbm_creation_txg, tx);
 			dsl_deadlist_remove_key(&ds->ds_deadlist,
 			    dbn->dbn_phys.zbm_creation_txg, tx);
 		}
 
 		spa_feature_decr(dmu_objset_spa(mos),
 		    SPA_FEATURE_BOOKMARK_WRITTEN, tx);
 	}
 
 	if (dbn->dbn_phys.zbm_redaction_obj != 0) {
 		dnode_t *rl;
 		VERIFY0(dnode_hold(mos,
 		    dbn->dbn_phys.zbm_redaction_obj, FTAG, &rl));
 		if (rl->dn_have_spill) {
 			spa_feature_decr(dmu_objset_spa(mos),
 			    SPA_FEATURE_REDACTION_LIST_SPILL, tx);
 		}
 		dnode_rele(rl, FTAG);
 		VERIFY0(dmu_object_free(mos,
 		    dbn->dbn_phys.zbm_redaction_obj, tx));
 		spa_feature_decr(dmu_objset_spa(mos),
 		    SPA_FEATURE_REDACTION_BOOKMARKS, tx);
 	}
 
 	avl_remove(&ds->ds_bookmarks, dbn);
 	spa_strfree(dbn->dbn_name);
 	mutex_destroy(&dbn->dbn_lock);
 	kmem_free(dbn, sizeof (*dbn));
 
 	VERIFY0(zap_remove_norm(mos, bmark_zapobj, name, mt, tx));
 }
 
 static int
 dsl_bookmark_destroy_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_bookmark_destroy_arg_t *dbda = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	int rv = 0;
 
 	ASSERT(nvlist_empty(dbda->dbda_success));
 	ASSERT(nvlist_empty(dbda->dbda_errors));
 
 	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS))
 		return (0);
 
 	for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_bmarks, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_bmarks, pair)) {
 		const char *fullname = nvpair_name(pair);
 		dsl_dataset_t *ds;
 		zfs_bookmark_phys_t bm;
 		int error;
 		char *shortname;
 
 		error = dsl_bookmark_hold_ds(dp, fullname, &ds,
 		    FTAG, &shortname);
 		if (error == ENOENT) {
 			/* ignore it; the bookmark is "already destroyed" */
 			continue;
 		}
 		if (error == 0) {
 			error = dsl_bookmark_lookup_impl(ds, shortname, &bm);
 			dsl_dataset_rele(ds, FTAG);
 			if (error == ESRCH) {
 				/*
 				 * ignore it; the bookmark is
 				 * "already destroyed"
 				 */
 				continue;
 			}
 			if (error == 0 && bm.zbm_redaction_obj != 0) {
 				redaction_list_t *rl = NULL;
 				error = dsl_redaction_list_hold_obj(tx->tx_pool,
 				    bm.zbm_redaction_obj, FTAG, &rl);
 				if (error == ENOENT) {
 					error = 0;
 				} else if (error == 0 &&
 				    dsl_redaction_list_long_held(rl)) {
 					error = SET_ERROR(EBUSY);
 				}
 				if (rl != NULL) {
 					dsl_redaction_list_rele(rl, FTAG);
 				}
 			}
 		}
 		if (error == 0) {
 			if (dmu_tx_is_syncing(tx)) {
 				fnvlist_add_boolean(dbda->dbda_success,
 				    fullname);
 			}
 		} else {
 			fnvlist_add_int32(dbda->dbda_errors, fullname, error);
 			rv = error;
 		}
 	}
 	return (rv);
 }
 
 static void
 dsl_bookmark_destroy_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_bookmark_destroy_arg_t *dbda = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	objset_t *mos = dp->dp_meta_objset;
 
 	for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_success, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_success, pair)) {
 		dsl_dataset_t *ds;
 		char *shortname;
 		uint64_t zap_cnt;
 
 		VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair),
 		    &ds, FTAG, &shortname));
 		dsl_bookmark_destroy_sync_impl(ds, shortname, tx);
 
 		/*
 		 * If all of this dataset's bookmarks have been destroyed,
 		 * free the zap object and decrement the feature's use count.
 		 */
 		VERIFY0(zap_count(mos, ds->ds_bookmarks_obj, &zap_cnt));
 		if (zap_cnt == 0) {
 			dmu_buf_will_dirty(ds->ds_dbuf, tx);
 			VERIFY0(zap_destroy(mos, ds->ds_bookmarks_obj, tx));
 			ds->ds_bookmarks_obj = 0;
 			spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
 			VERIFY0(zap_remove(mos, ds->ds_object,
 			    DS_FIELD_BOOKMARK_NAMES, tx));
 		}
 
 		spa_history_log_internal_ds(ds, "remove bookmark", tx,
 		    "name=%s", shortname);
 
 		dsl_dataset_rele(ds, FTAG);
 	}
 }
 
 /*
  * The bookmarks must all be in the same pool.
  */
 int
 dsl_bookmark_destroy(nvlist_t *bmarks, nvlist_t *errors)
 {
 	int rv;
 	dsl_bookmark_destroy_arg_t dbda;
 	nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL);
 	if (pair == NULL)
 		return (0);
 
 	dbda.dbda_bmarks = bmarks;
 	dbda.dbda_errors = errors;
 	dbda.dbda_success = fnvlist_alloc();
 
 	rv = dsl_sync_task(nvpair_name(pair), dsl_bookmark_destroy_check,
 	    dsl_bookmark_destroy_sync, &dbda, fnvlist_num_pairs(bmarks),
 	    ZFS_SPACE_CHECK_RESERVED);
 	fnvlist_free(dbda.dbda_success);
 	return (rv);
 }
 
 /* Return B_TRUE if there are any long holds on this dataset. */
 boolean_t
 dsl_redaction_list_long_held(redaction_list_t *rl)
 {
 	return (!zfs_refcount_is_zero(&rl->rl_longholds));
 }
 
 void
 dsl_redaction_list_long_hold(dsl_pool_t *dp, redaction_list_t *rl,
     const void *tag)
 {
 	ASSERT(dsl_pool_config_held(dp));
 	(void) zfs_refcount_add(&rl->rl_longholds, tag);
 }
 
 void
 dsl_redaction_list_long_rele(redaction_list_t *rl, const void *tag)
 {
 	(void) zfs_refcount_remove(&rl->rl_longholds, tag);
 }
 
 static void
 redaction_list_evict_sync(void *rlu)
 {
 	redaction_list_t *rl = rlu;
 	zfs_refcount_destroy(&rl->rl_longholds);
 
 	kmem_free(rl, sizeof (redaction_list_t));
 }
 
 void
 dsl_redaction_list_rele(redaction_list_t *rl, const void *tag)
 {
 	if (rl->rl_bonus != rl->rl_dbuf)
 		dmu_buf_rele(rl->rl_dbuf, tag);
 	dmu_buf_rele(rl->rl_bonus, tag);
 }
 
 int
 dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, const void *tag,
     redaction_list_t **rlp)
 {
 	objset_t *mos = dp->dp_meta_objset;
 	dmu_buf_t *dbuf, *spill_dbuf;
 	redaction_list_t *rl;
 	int err;
 
 	ASSERT(dsl_pool_config_held(dp));
 
 	err = dmu_bonus_hold(mos, rlobj, tag, &dbuf);
 	if (err != 0)
 		return (err);
 
 	rl = dmu_buf_get_user(dbuf);
 	if (rl == NULL) {
 		redaction_list_t *winner = NULL;
 
 		rl = kmem_zalloc(sizeof (redaction_list_t), KM_SLEEP);
 		rl->rl_bonus = dbuf;
 		if (dmu_spill_hold_existing(dbuf, tag, &spill_dbuf) == 0) {
 			rl->rl_dbuf = spill_dbuf;
 		} else {
 			rl->rl_dbuf = dbuf;
 		}
 		rl->rl_object = rlobj;
 		rl->rl_phys = rl->rl_dbuf->db_data;
 		rl->rl_mos = dp->dp_meta_objset;
 		zfs_refcount_create(&rl->rl_longholds);
 		dmu_buf_init_user(&rl->rl_dbu, redaction_list_evict_sync, NULL,
 		    &rl->rl_bonus);
 		if ((winner = dmu_buf_set_user_ie(dbuf, &rl->rl_dbu)) != NULL) {
 			kmem_free(rl, sizeof (*rl));
 			rl = winner;
 		}
 	}
 	*rlp = rl;
 	return (0);
 }
 
 /*
  * Snapshot ds is being destroyed.
  *
  * Adjust the "freed_before_next" of any bookmarks between this snap
  * and the previous snapshot, because their "next snapshot" is changing.
  *
  * If there are any bookmarks with HAS_FBN at this snapshot, remove
  * their HAS_SNAP flag (note: there can be at most one snapshot of
  * each filesystem at a given txg), and return B_TRUE.  In this case
  * the caller can not remove the key in the deadlist at this TXG, because
  * the HAS_FBN bookmarks require the key be there.
  *
  * Returns B_FALSE if there are no bookmarks with HAS_FBN at this
  * snapshot's TXG.  In this case the caller can remove the key in the
  * deadlist at this TXG.
  */
 boolean_t
 dsl_bookmark_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 
 	dsl_dataset_t *head, *next;
 	VERIFY0(dsl_dataset_hold_obj(dp,
 	    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &head));
 	VERIFY0(dsl_dataset_hold_obj(dp,
 	    dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &next));
 
 	/*
 	 * Find the first bookmark that HAS_FBN at or after the
 	 * previous snapshot.
 	 */
 	dsl_bookmark_node_t search = { 0 };
 	avl_index_t idx;
 	search.dbn_phys.zbm_creation_txg =
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg;
 	search.dbn_phys.zbm_flags = ZBM_FLAG_HAS_FBN;
 	/*
 	 * The empty-string name can't be in the AVL, and it compares
 	 * before any entries with this TXG.
 	 */
 	search.dbn_name = (char *)"";
 	VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL);
 	dsl_bookmark_node_t *dbn =
 	    avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER);
 
 	/*
 	 * Iterate over all bookmarks that are at or after the previous
 	 * snapshot, and before this (being deleted) snapshot.  Adjust
 	 * their FBN based on their new next snapshot.
 	 */
 	for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg <
 	    dsl_dataset_phys(ds)->ds_creation_txg;
 	    dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) {
 		if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN))
 			continue;
 		/*
 		 * Increase our FBN by the amount of space that was live
 		 * (referenced) at the time of this bookmark (i.e.
 		 * birth <= zbm_creation_txg), and killed between this
 		 * (being deleted) snapshot and the next snapshot (i.e.
 		 * on the next snapshot's deadlist).  (Space killed before
 		 * this are already on our FBN.)
 		 */
 		uint64_t referenced, compressed, uncompressed;
 		dsl_deadlist_space_range(&next->ds_deadlist,
 		    0, dbn->dbn_phys.zbm_creation_txg,
 		    &referenced, &compressed, &uncompressed);
 		dbn->dbn_phys.zbm_referenced_freed_before_next_snap +=
 		    referenced;
 		dbn->dbn_phys.zbm_compressed_freed_before_next_snap +=
 		    compressed;
 		dbn->dbn_phys.zbm_uncompressed_freed_before_next_snap +=
 		    uncompressed;
 		VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj,
 		    dbn->dbn_name, sizeof (uint64_t),
 		    sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t),
 		    &dbn->dbn_phys, tx));
 	}
 	dsl_dataset_rele(next, FTAG);
 
 	/*
 	 * There may be several bookmarks at this txg (the TXG of the
 	 * snapshot being deleted).  We need to clear the SNAPSHOT_EXISTS
 	 * flag on all of them, and return TRUE if there is at least 1
 	 * bookmark here with HAS_FBN (thus preventing the deadlist
 	 * key from being removed).
 	 */
 	boolean_t rv = B_FALSE;
 	for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg ==
 	    dsl_dataset_phys(ds)->ds_creation_txg;
 	    dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) {
 		if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) {
 			ASSERT(!(dbn->dbn_phys.zbm_flags &
 			    ZBM_FLAG_SNAPSHOT_EXISTS));
 			continue;
 		}
 		ASSERT(dbn->dbn_phys.zbm_flags & ZBM_FLAG_SNAPSHOT_EXISTS);
 		dbn->dbn_phys.zbm_flags &= ~ZBM_FLAG_SNAPSHOT_EXISTS;
 		VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj,
 		    dbn->dbn_name, sizeof (uint64_t),
 		    sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t),
 		    &dbn->dbn_phys, tx));
 		rv = B_TRUE;
 	}
 	dsl_dataset_rele(head, FTAG);
 	return (rv);
 }
 
 /*
  * A snapshot is being created of this (head) dataset.
  *
  * We don't keep keys in the deadlist for the most recent snapshot, or any
  * bookmarks at or after it, because there can't be any blocks on the
  * deadlist in this range.  Now that the most recent snapshot is after
  * all bookmarks, we need to add these keys.  Note that the caller always
  * adds a key at the previous snapshot, so we only add keys for bookmarks
  * after that.
  */
 void
 dsl_bookmark_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	uint64_t last_key_added = UINT64_MAX;
 	for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks);
 	    dbn != NULL && dbn->dbn_phys.zbm_creation_txg >
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg;
 	    dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) {
 		uint64_t creation_txg = dbn->dbn_phys.zbm_creation_txg;
 		ASSERT3U(creation_txg, <=, last_key_added);
 		/*
 		 * Note, there may be multiple bookmarks at this TXG,
 		 * and we only want to add the key for this TXG once.
 		 * The ds_bookmarks AVL is sorted by TXG, so we will visit
 		 * these bookmarks in sequence.
 		 */
 		if ((dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) &&
 		    creation_txg != last_key_added) {
 			dsl_deadlist_add_key(&ds->ds_deadlist,
 			    creation_txg, tx);
 			last_key_added = creation_txg;
 		}
 	}
 }
 
 /*
  * The next snapshot of the origin dataset has changed, due to
  * promote or clone swap.  If there are any bookmarks at this dataset,
  * we need to update their zbm_*_freed_before_next_snap to reflect this.
  * The head dataset has the relevant bookmarks in ds_bookmarks.
  */
 void
 dsl_bookmark_next_changed(dsl_dataset_t *head, dsl_dataset_t *origin,
     dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 
 	/*
 	 * Find the first bookmark that HAS_FBN at the origin snapshot.
 	 */
 	dsl_bookmark_node_t search = { 0 };
 	avl_index_t idx;
 	search.dbn_phys.zbm_creation_txg =
 	    dsl_dataset_phys(origin)->ds_creation_txg;
 	search.dbn_phys.zbm_flags = ZBM_FLAG_HAS_FBN;
 	/*
 	 * The empty-string name can't be in the AVL, and it compares
 	 * before any entries with this TXG.
 	 */
 	search.dbn_name = (char *)"";
 	VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL);
 	dsl_bookmark_node_t *dbn =
 	    avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER);
 
 	/*
 	 * Iterate over all bookmarks that are at the origin txg.
 	 * Adjust their FBN based on their new next snapshot.
 	 */
 	for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg ==
 	    dsl_dataset_phys(origin)->ds_creation_txg &&
 	    (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN);
 	    dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) {
 
 		/*
 		 * Bookmark is at the origin, therefore its
 		 * "next dataset" is changing, so we need
 		 * to reset its FBN by recomputing it in
 		 * dsl_bookmark_set_phys().
 		 */
 		ASSERT3U(dbn->dbn_phys.zbm_guid, ==,
 		    dsl_dataset_phys(origin)->ds_guid);
 		ASSERT3U(dbn->dbn_phys.zbm_referenced_bytes_refd, ==,
 		    dsl_dataset_phys(origin)->ds_referenced_bytes);
 		ASSERT(dbn->dbn_phys.zbm_flags &
 		    ZBM_FLAG_SNAPSHOT_EXISTS);
 		/*
 		 * Save and restore the zbm_redaction_obj, which
 		 * is zeroed by dsl_bookmark_set_phys().
 		 */
 		uint64_t redaction_obj =
 		    dbn->dbn_phys.zbm_redaction_obj;
 		dsl_bookmark_set_phys(&dbn->dbn_phys, origin);
 		dbn->dbn_phys.zbm_redaction_obj = redaction_obj;
 
 		VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj,
 		    dbn->dbn_name, sizeof (uint64_t),
 		    sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t),
 		    &dbn->dbn_phys, tx));
 	}
 }
 
 /*
  * This block is no longer referenced by this (head) dataset.
  *
  * Adjust the FBN of any bookmarks that reference this block, whose "next"
  * is the head dataset.
  */
 void
 dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	(void) tx;
 
 	/*
 	 * Iterate over bookmarks whose "next" is the head dataset.
 	 */
 	for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks);
 	    dbn != NULL && dbn->dbn_phys.zbm_creation_txg >=
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg;
 	    dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) {
 		/*
 		 * If the block was live (referenced) at the time of this
 		 * bookmark, add its space to the bookmark's FBN.
 		 */
-		if (bp->blk_birth <= dbn->dbn_phys.zbm_creation_txg &&
+		if (BP_GET_LOGICAL_BIRTH(bp) <=
+		    dbn->dbn_phys.zbm_creation_txg &&
 		    (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) {
 			mutex_enter(&dbn->dbn_lock);
 			dbn->dbn_phys.zbm_referenced_freed_before_next_snap +=
 			    bp_get_dsize_sync(dsl_dataset_get_spa(ds), bp);
 			dbn->dbn_phys.zbm_compressed_freed_before_next_snap +=
 			    BP_GET_PSIZE(bp);
 			dbn->dbn_phys.zbm_uncompressed_freed_before_next_snap +=
 			    BP_GET_UCSIZE(bp);
 			/*
 			 * Changing the ZAP object here would be too
 			 * expensive.  Also, we may be called from the zio
 			 * interrupt thread, which can't block on i/o.
 			 * Therefore, we mark this bookmark as dirty and
 			 * modify the ZAP once per txg, in
 			 * dsl_bookmark_sync_done().
 			 */
 			dbn->dbn_dirty = B_TRUE;
 			mutex_exit(&dbn->dbn_lock);
 		}
 	}
 }
 
 void
 dsl_bookmark_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 
 	if (dsl_dataset_is_snapshot(ds))
 		return;
 
 	/*
 	 * We only dirty bookmarks that are at or after the most recent
 	 * snapshot.  We can't create snapshots between
 	 * dsl_bookmark_block_killed() and dsl_bookmark_sync_done(), so we
 	 * don't need to look at any bookmarks before ds_prev_snap_txg.
 	 */
 	for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks);
 	    dbn != NULL && dbn->dbn_phys.zbm_creation_txg >=
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg;
 	    dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) {
 		if (dbn->dbn_dirty) {
 			/*
 			 * We only dirty nodes with HAS_FBN, therefore
 			 * we can always use the current bookmark struct size.
 			 */
 			ASSERT(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN);
 			VERIFY0(zap_update(dp->dp_meta_objset,
 			    ds->ds_bookmarks_obj,
 			    dbn->dbn_name, sizeof (uint64_t),
 			    sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t),
 			    &dbn->dbn_phys, tx));
 			dbn->dbn_dirty = B_FALSE;
 		}
 	}
 #ifdef ZFS_DEBUG
 	for (dsl_bookmark_node_t *dbn = avl_first(&ds->ds_bookmarks);
 	    dbn != NULL; dbn = AVL_NEXT(&ds->ds_bookmarks, dbn)) {
 		ASSERT(!dbn->dbn_dirty);
 	}
 #endif
 }
 
 /*
  * Return the TXG of the most recent bookmark (or 0 if there are no bookmarks).
  */
 uint64_t
 dsl_bookmark_latest_txg(dsl_dataset_t *ds)
 {
 	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 	dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks);
 	if (dbn == NULL)
 		return (0);
 	return (dbn->dbn_phys.zbm_creation_txg);
 }
 
 /*
  * Compare the redact_block_phys_t to the bookmark. If the last block in the
  * redact_block_phys_t is before the bookmark, return -1.  If the first block in
  * the redact_block_phys_t is after the bookmark, return 1.  Otherwise, the
  * bookmark is inside the range of the redact_block_phys_t, and we return 0.
  */
 static int
 redact_block_zb_compare(redact_block_phys_t *first,
     zbookmark_phys_t *second)
 {
 	/*
 	 * If the block_phys is for a previous object, or the last block in the
 	 * block_phys is strictly before the block in the bookmark, the
 	 * block_phys is earlier.
 	 */
 	if (first->rbp_object < second->zb_object ||
 	    (first->rbp_object == second->zb_object &&
 	    first->rbp_blkid + (redact_block_get_count(first) - 1) <
 	    second->zb_blkid)) {
 		return (-1);
 	}
 
 	/*
 	 * If the bookmark is for a previous object, or the block in the
 	 * bookmark is strictly before the first block in the block_phys, the
 	 * bookmark is earlier.
 	 */
 	if (first->rbp_object > second->zb_object ||
 	    (first->rbp_object == second->zb_object &&
 	    first->rbp_blkid > second->zb_blkid)) {
 		return (1);
 	}
 
 	return (0);
 }
 
 /*
  * Traverse the redaction list in the provided object, and call the callback for
  * each entry we find. Don't call the callback for any records before resume.
  */
 int
 dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume,
     rl_traverse_callback_t cb, void *arg)
 {
 	objset_t *mos = rl->rl_mos;
 	int err = 0;
 
 	if (rl->rl_phys->rlp_last_object != UINT64_MAX ||
 	    rl->rl_phys->rlp_last_blkid != UINT64_MAX) {
 		/*
 		 * When we finish a send, we update the last object and offset
 		 * to UINT64_MAX.  If a send fails partway through, the last
 		 * object and offset will have some other value, indicating how
 		 * far the send got. The redaction list must be complete before
 		 * it can be traversed, so return EINVAL if the last object and
 		 * blkid are not set to UINT64_MAX.
 		 */
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * This allows us to skip the binary search and resume checking logic
 	 * below, if we're not resuming a redacted send.
 	 */
 	if (ZB_IS_ZERO(resume))
 		resume = NULL;
 
 	/*
 	 * Binary search for the point to resume from.
 	 */
 	uint64_t maxidx = rl->rl_phys->rlp_num_entries - 1;
 	uint64_t minidx = 0;
 	while (resume != NULL && maxidx > minidx) {
 		redact_block_phys_t rbp = { 0 };
 		ASSERT3U(maxidx, >, minidx);
 		uint64_t mididx = minidx + ((maxidx - minidx) / 2);
 		err = dmu_read(mos, rl->rl_object, mididx * sizeof (rbp),
 		    sizeof (rbp), &rbp, DMU_READ_NO_PREFETCH);
 		if (err != 0)
 			break;
 
 		int cmp = redact_block_zb_compare(&rbp, resume);
 
 		if (cmp == 0) {
 			minidx = mididx;
 			break;
 		} else if (cmp > 0) {
 			maxidx =
 			    (mididx == minidx ? minidx : mididx - 1);
 		} else {
 			minidx = mididx + 1;
 		}
 	}
 
 	unsigned int bufsize = SPA_OLD_MAXBLOCKSIZE;
 	redact_block_phys_t *buf = zio_data_buf_alloc(bufsize);
 
 	unsigned int entries_per_buf = bufsize / sizeof (redact_block_phys_t);
 	uint64_t start_block = minidx / entries_per_buf;
 	err = dmu_read(mos, rl->rl_object, start_block * bufsize, bufsize, buf,
 	    DMU_READ_PREFETCH);
 
 	for (uint64_t curidx = minidx;
 	    err == 0 && curidx < rl->rl_phys->rlp_num_entries;
 	    curidx++) {
 		/*
 		 * We read in the redaction list one block at a time.  Once we
 		 * finish with all the entries in a given block, we read in a
 		 * new one.  The predictive prefetcher will take care of any
 		 * prefetching, and this code shouldn't be the bottleneck, so we
 		 * don't need to do manual prefetching.
 		 */
 		if (curidx % entries_per_buf == 0) {
 			err = dmu_read(mos, rl->rl_object, curidx *
 			    sizeof (*buf), bufsize, buf,
 			    DMU_READ_PREFETCH);
 			if (err != 0)
 				break;
 		}
 		redact_block_phys_t *rb = &buf[curidx % entries_per_buf];
 		/*
 		 * If resume is non-null, we should either not send the data, or
 		 * null out resume so we don't have to keep doing these
 		 * comparisons.
 		 */
 		if (resume != NULL) {
 			/*
 			 * It is possible that after the binary search we got
 			 * a record before the resume point. There's two cases
 			 * where this can occur. If the record is the last
 			 * redaction record, and the resume point is after the
 			 * end of the redacted data, curidx will be the last
 			 * redaction record. In that case, the loop will end
 			 * after this iteration. The second case is if the
 			 * resume point is between two redaction records, the
 			 * binary search can return either the record before
 			 * or after the resume point. In that case, the next
 			 * iteration will be greater than the resume point.
 			 */
 			if (redact_block_zb_compare(rb, resume) < 0) {
 				ASSERT3U(curidx, ==, minidx);
 				continue;
 			} else {
 				/*
 				 * If the place to resume is in the middle of
 				 * the range described by this
 				 * redact_block_phys, then modify the
 				 * redact_block_phys in memory so we generate
 				 * the right records.
 				 */
 				if (resume->zb_object == rb->rbp_object &&
 				    resume->zb_blkid > rb->rbp_blkid) {
 					uint64_t diff = resume->zb_blkid -
 					    rb->rbp_blkid;
 					rb->rbp_blkid = resume->zb_blkid;
 					redact_block_set_count(rb,
 					    redact_block_get_count(rb) - diff);
 				}
 				resume = NULL;
 			}
 		}
 
 		if (cb(rb, arg) != 0) {
 			err = EINTR;
 			break;
 		}
 	}
 
 	zio_data_buf_free(buf, bufsize);
 	return (err);
 }
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index 62a1649d3786..b4de0e7ff073 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -1,5015 +1,5016 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 RackTop Systems.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
  * Copyright 2016, OmniTI Computer Consulting, Inc. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  * Copyright (c) 2019, Klara Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2020 The FreeBSD Foundation [1]
  *
  * [1] Portions of this software were developed by Allan Jude
  *     under sponsorship from the FreeBSD Foundation.
  */
 
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/arc.h>
 #include <sys/zio.h>
 #include <sys/zap.h>
 #include <sys/zfeature.h>
 #include <sys/unique.h>
 #include <sys/zfs_context.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_onexit.h>
 #include <sys/zvol.h>
 #include <sys/dsl_scan.h>
 #include <sys/dsl_deadlist.h>
 #include <sys/dsl_destroy.h>
 #include <sys/dsl_userhold.h>
 #include <sys/dsl_bookmark.h>
 #include <sys/policy.h>
 #include <sys/dmu_send.h>
 #include <sys/dmu_recv.h>
 #include <sys/zio_compress.h>
 #include <zfs_fletcher.h>
 #include <sys/zio_checksum.h>
 
 /*
  * The SPA supports block sizes up to 16MB.  However, very large blocks
  * can have an impact on i/o latency (e.g. tying up a spinning disk for
  * ~300ms), and also potentially on the memory allocator.  Therefore,
  * we did not allow the recordsize to be set larger than zfs_max_recordsize
  * (former default: 1MB).  Larger blocks could be created by changing this
  * tunable, and pools with larger blocks could always be imported and used,
  * regardless of this setting.
  *
  * We do, however, still limit it by default to 1M on x86_32, because Linux's
  * 3/1 memory split doesn't leave much room for 16M chunks.
  */
 #ifdef _ILP32
 uint_t zfs_max_recordsize =  1 * 1024 * 1024;
 #else
 uint_t zfs_max_recordsize = 16 * 1024 * 1024;
 #endif
 static int zfs_allow_redacted_dataset_mount = 0;
 
 int zfs_snapshot_history_enabled = 1;
 
 #define	SWITCH64(x, y) \
 	{ \
 		uint64_t __tmp = (x); \
 		(x) = (y); \
 		(y) = __tmp; \
 	}
 
 #define	DS_REF_MAX	(1ULL << 62)
 
 static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds,
     uint64_t obj, dmu_tx_t *tx);
 static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds,
     dmu_tx_t *tx);
 
 static void unload_zfeature(dsl_dataset_t *ds, spa_feature_t f);
 
 extern uint_t spa_asize_inflation;
 
 static zil_header_t zero_zil;
 
 /*
  * Figure out how much of this delta should be propagated to the dsl_dir
  * layer.  If there's a refreservation, that space has already been
  * partially accounted for in our ancestors.
  */
 static int64_t
 parent_delta(dsl_dataset_t *ds, int64_t delta)
 {
 	dsl_dataset_phys_t *ds_phys;
 	uint64_t old_bytes, new_bytes;
 
 	if (ds->ds_reserved == 0)
 		return (delta);
 
 	ds_phys = dsl_dataset_phys(ds);
 	old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);
 	new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
 
 	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
 	return (new_bytes - old_bytes);
 }
 
 void
 dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	int used = bp_get_dsize_sync(spa, bp);
 	int compressed = BP_GET_PSIZE(bp);
 	int uncompressed = BP_GET_UCSIZE(bp);
 	int64_t delta;
 	spa_feature_t f;
 
 	dprintf_bp(bp, "ds=%p", ds);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	/* It could have been compressed away to nothing */
 	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))
 		return;
 	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
 	ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
 	if (ds == NULL) {
 		dsl_pool_mos_diduse_space(tx->tx_pool,
 		    used, compressed, uncompressed);
 		return;
 	}
 
-	ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
+	ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >,
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	mutex_enter(&ds->ds_lock);
 	delta = parent_delta(ds, used);
 	dsl_dataset_phys(ds)->ds_referenced_bytes += used;
 	dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
 	dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
 	dsl_dataset_phys(ds)->ds_unique_bytes += used;
 
 	if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
 		ds->ds_feature_activation[SPA_FEATURE_LARGE_BLOCKS] =
 		    (void *)B_TRUE;
 	}
 
 
 	f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
 	if (f != SPA_FEATURE_NONE) {
 		ASSERT3S(spa_feature_table[f].fi_type, ==,
 		    ZFEATURE_TYPE_BOOLEAN);
 		ds->ds_feature_activation[f] = (void *)B_TRUE;
 	}
 
 	f = zio_compress_to_feature(BP_GET_COMPRESS(bp));
 	if (f != SPA_FEATURE_NONE) {
 		ASSERT3S(spa_feature_table[f].fi_type, ==,
 		    ZFEATURE_TYPE_BOOLEAN);
 		ds->ds_feature_activation[f] = (void *)B_TRUE;
 	}
 
 	/*
 	 * Track block for livelist, but ignore embedded blocks because
 	 * they do not need to be freed.
 	 */
 	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
-	    bp->blk_birth > ds->ds_dir->dd_origin_txg &&
+	    BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&
 	    !(BP_IS_EMBEDDED(bp))) {
 		ASSERT(dsl_dir_is_clone(ds->ds_dir));
 		ASSERT(spa_feature_is_enabled(spa,
 		    SPA_FEATURE_LIVELIST));
 		bplist_append(&ds->ds_dir->dd_pending_allocs, bp);
 	}
 
 	mutex_exit(&ds->ds_lock);
 	dsl_dir_diduse_transfer_space(ds->ds_dir, delta,
 	    compressed, uncompressed, used,
 	    DD_USED_REFRSRV, DD_USED_HEAD, tx);
 }
 
 /*
  * Called when the specified segment has been remapped, and is thus no
  * longer referenced in the head dataset.  The vdev must be indirect.
  *
  * If the segment is referenced by a snapshot, put it on the remap deadlist.
  * Otherwise, add this segment to the obsolete spacemap.
  */
 void
 dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset,
     uint64_t size, uint64_t birth, dmu_tx_t *tx)
 {
 	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(birth <= tx->tx_txg);
 	ASSERT(!ds->ds_is_snapshot);
 
 	if (birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
 		spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
 	} else {
 		blkptr_t fakebp;
 		dva_t *dva = &fakebp.blk_dva[0];
 
 		ASSERT(ds != NULL);
 
 		mutex_enter(&ds->ds_remap_deadlist_lock);
 		if (!dsl_dataset_remap_deadlist_exists(ds)) {
 			dsl_dataset_create_remap_deadlist(ds, tx);
 		}
 		mutex_exit(&ds->ds_remap_deadlist_lock);
 
 		BP_ZERO(&fakebp);
-		fakebp.blk_birth = birth;
+		BP_SET_LOGICAL_BIRTH(&fakebp, birth);
 		DVA_SET_VDEV(dva, vdev);
 		DVA_SET_OFFSET(dva, offset);
 		DVA_SET_ASIZE(dva, size);
 		dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, B_FALSE,
 		    tx);
 	}
 }
 
 int
 dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
     boolean_t async)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 
 	int used = bp_get_dsize_sync(spa, bp);
 	int compressed = BP_GET_PSIZE(bp);
 	int uncompressed = BP_GET_UCSIZE(bp);
 
 	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))
 		return (0);
 
 	ASSERT(dmu_tx_is_syncing(tx));
-	ASSERT(bp->blk_birth <= tx->tx_txg);
+	ASSERT(BP_GET_LOGICAL_BIRTH(bp) <= tx->tx_txg);
 
 	if (ds == NULL) {
 		dsl_free(tx->tx_pool, tx->tx_txg, bp);
 		dsl_pool_mos_diduse_space(tx->tx_pool,
 		    -used, -compressed, -uncompressed);
 		return (used);
 	}
 	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
 
 	ASSERT(!ds->ds_is_snapshot);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 
 	/*
 	 * Track block for livelist, but ignore embedded blocks because
 	 * they do not need to be freed.
 	 */
 	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
-	    bp->blk_birth > ds->ds_dir->dd_origin_txg &&
+	    BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&
 	    !(BP_IS_EMBEDDED(bp))) {
 		ASSERT(dsl_dir_is_clone(ds->ds_dir));
 		ASSERT(spa_feature_is_enabled(spa,
 		    SPA_FEATURE_LIVELIST));
 		bplist_append(&ds->ds_dir->dd_pending_frees, bp);
 	}
 
-	if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+	if (BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
 		int64_t delta;
 
 		dprintf_bp(bp, "freeing ds=%llu", (u_longlong_t)ds->ds_object);
 		dsl_free(tx->tx_pool, tx->tx_txg, bp);
 
 		mutex_enter(&ds->ds_lock);
 		ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
 		    !DS_UNIQUE_IS_ACCURATE(ds));
 		delta = parent_delta(ds, -used);
 		dsl_dataset_phys(ds)->ds_unique_bytes -= used;
 		mutex_exit(&ds->ds_lock);
 		dsl_dir_diduse_transfer_space(ds->ds_dir,
 		    delta, -compressed, -uncompressed, -used,
 		    DD_USED_REFRSRV, DD_USED_HEAD, tx);
 	} else {
 		dprintf_bp(bp, "putting on dead list: %s", "");
 		if (async) {
 			/*
 			 * We are here as part of zio's write done callback,
 			 * which means we're a zio interrupt thread.  We can't
 			 * call dsl_deadlist_insert() now because it may block
 			 * waiting for I/O.  Instead, put bp on the deferred
 			 * queue and let dsl_pool_sync() finish the job.
 			 */
 			bplist_append(&ds->ds_pending_deadlist, bp);
 		} else {
 			dsl_deadlist_insert(&ds->ds_deadlist, bp, B_FALSE, tx);
 		}
 		ASSERT3U(ds->ds_prev->ds_object, ==,
 		    dsl_dataset_phys(ds)->ds_prev_snap_obj);
 		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
-		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
+		/* if (logical birth > prev prev snap txg) prev unique += bs */
 		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
-		    ds->ds_object && bp->blk_birth >
+		    ds->ds_object && BP_GET_LOGICAL_BIRTH(bp) >
 		    dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
 			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 			mutex_enter(&ds->ds_prev->ds_lock);
 			dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
 			mutex_exit(&ds->ds_prev->ds_lock);
 		}
-		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
+		if (BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg) {
 			dsl_dir_transfer_space(ds->ds_dir, used,
 			    DD_USED_HEAD, DD_USED_SNAP, tx);
 		}
 	}
 
 	dsl_bookmark_block_killed(ds, bp, tx);
 
 	mutex_enter(&ds->ds_lock);
 	ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
 	dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
 	ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
 	dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
 	ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
 	dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
 	mutex_exit(&ds->ds_lock);
 
 	return (used);
 }
 
 struct feature_type_uint64_array_arg {
 	uint64_t length;
 	uint64_t *array;
 };
 
 static void
 unload_zfeature(dsl_dataset_t *ds, spa_feature_t f)
 {
 	switch (spa_feature_table[f].fi_type) {
 	case ZFEATURE_TYPE_BOOLEAN:
 		break;
 	case ZFEATURE_TYPE_UINT64_ARRAY:
 	{
 		struct feature_type_uint64_array_arg *ftuaa = ds->ds_feature[f];
 		kmem_free(ftuaa->array, ftuaa->length * sizeof (uint64_t));
 		kmem_free(ftuaa, sizeof (*ftuaa));
 		break;
 	}
 	default:
 		panic("Invalid zfeature type %d", spa_feature_table[f].fi_type);
 	}
 }
 
 static int
 load_zfeature(objset_t *mos, dsl_dataset_t *ds, spa_feature_t f)
 {
 	int err = 0;
 	switch (spa_feature_table[f].fi_type) {
 	case ZFEATURE_TYPE_BOOLEAN:
 		err = zap_contains(mos, ds->ds_object,
 		    spa_feature_table[f].fi_guid);
 		if (err == 0) {
 			ds->ds_feature[f] = (void *)B_TRUE;
 		} else {
 			ASSERT3U(err, ==, ENOENT);
 			err = 0;
 		}
 		break;
 	case ZFEATURE_TYPE_UINT64_ARRAY:
 	{
 		uint64_t int_size, num_int;
 		uint64_t *data;
 		err = zap_length(mos, ds->ds_object,
 		    spa_feature_table[f].fi_guid, &int_size, &num_int);
 		if (err != 0) {
 			ASSERT3U(err, ==, ENOENT);
 			err = 0;
 			break;
 		}
 		ASSERT3U(int_size, ==, sizeof (uint64_t));
 		data = kmem_alloc(int_size * num_int, KM_SLEEP);
 		VERIFY0(zap_lookup(mos, ds->ds_object,
 		    spa_feature_table[f].fi_guid, int_size, num_int, data));
 		struct feature_type_uint64_array_arg *ftuaa =
 		    kmem_alloc(sizeof (*ftuaa), KM_SLEEP);
 		ftuaa->length = num_int;
 		ftuaa->array = data;
 		ds->ds_feature[f] = ftuaa;
 		break;
 	}
 	default:
 		panic("Invalid zfeature type %d", spa_feature_table[f].fi_type);
 	}
 	return (err);
 }
 
 /*
  * We have to release the fsid synchronously or we risk that a subsequent
  * mount of the same dataset will fail to unique_insert the fsid.  This
  * failure would manifest itself as the fsid of this dataset changing
  * between mounts which makes NFS clients quite unhappy.
  */
 static void
 dsl_dataset_evict_sync(void *dbu)
 {
 	dsl_dataset_t *ds = dbu;
 
 	ASSERT(ds->ds_owner == NULL);
 
 	unique_remove(ds->ds_fsid_guid);
 }
 
 static void
 dsl_dataset_evict_async(void *dbu)
 {
 	dsl_dataset_t *ds = dbu;
 
 	ASSERT(ds->ds_owner == NULL);
 
 	ds->ds_dbuf = NULL;
 
 	if (ds->ds_objset != NULL)
 		dmu_objset_evict(ds->ds_objset);
 
 	if (ds->ds_prev) {
 		dsl_dataset_rele(ds->ds_prev, ds);
 		ds->ds_prev = NULL;
 	}
 
 	dsl_bookmark_fini_ds(ds);
 
 	bplist_destroy(&ds->ds_pending_deadlist);
 	if (dsl_deadlist_is_open(&ds->ds_deadlist))
 		dsl_deadlist_close(&ds->ds_deadlist);
 	if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))
 		dsl_deadlist_close(&ds->ds_remap_deadlist);
 	if (ds->ds_dir)
 		dsl_dir_async_rele(ds->ds_dir, ds);
 
 	ASSERT(!list_link_active(&ds->ds_synced_link));
 
 	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
 		if (dsl_dataset_feature_is_active(ds, f))
 			unload_zfeature(ds, f);
 	}
 
 	list_destroy(&ds->ds_prop_cbs);
 	mutex_destroy(&ds->ds_lock);
 	mutex_destroy(&ds->ds_opening_lock);
 	mutex_destroy(&ds->ds_sendstream_lock);
 	mutex_destroy(&ds->ds_remap_deadlist_lock);
 	zfs_refcount_destroy(&ds->ds_longholds);
 	rrw_destroy(&ds->ds_bp_rwlock);
 
 	kmem_free(ds, sizeof (dsl_dataset_t));
 }
 
 int
 dsl_dataset_get_snapname(dsl_dataset_t *ds)
 {
 	dsl_dataset_phys_t *headphys;
 	int err;
 	dmu_buf_t *headdbuf;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 
 	if (ds->ds_snapname[0])
 		return (0);
 	if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
 		return (0);
 
 	err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
 	    FTAG, &headdbuf);
 	if (err != 0)
 		return (err);
 	headphys = headdbuf->db_data;
 	err = zap_value_search(dp->dp_meta_objset,
 	    headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
 	if (err != 0 && zfs_recover == B_TRUE) {
 		err = 0;
 		(void) snprintf(ds->ds_snapname, sizeof (ds->ds_snapname),
 		    "SNAPOBJ=%llu-ERR=%d",
 		    (unsigned long long)ds->ds_object, err);
 	}
 	dmu_buf_rele(headdbuf, FTAG);
 	return (err);
 }
 
 int
 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 	matchtype_t mt = 0;
 	int err;
 
 	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
 		mt = MT_NORMALIZE;
 
 	err = zap_lookup_norm(mos, snapobj, name, 8, 1,
 	    value, mt, NULL, 0, NULL);
 	if (err == ENOTSUP && (mt & MT_NORMALIZE))
 		err = zap_lookup(mos, snapobj, name, 8, 1, value);
 	return (err);
 }
 
 int
 dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
     boolean_t adj_cnt)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 	matchtype_t mt = 0;
 	int err;
 
 	dsl_dir_snap_cmtime_update(ds->ds_dir, tx);
 
 	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
 		mt = MT_NORMALIZE;
 
 	err = zap_remove_norm(mos, snapobj, name, mt, tx);
 	if (err == ENOTSUP && (mt & MT_NORMALIZE))
 		err = zap_remove(mos, snapobj, name, tx);
 
 	if (err == 0 && adj_cnt)
 		dsl_fs_ss_count_adjust(ds->ds_dir, -1,
 		    DD_FIELD_SNAPSHOT_COUNT, tx);
 
 	return (err);
 }
 
 boolean_t
 dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, const void *tag)
 {
 	dmu_buf_t *dbuf = ds->ds_dbuf;
 	boolean_t result = B_FALSE;
 
 	if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset,
 	    ds->ds_object, DMU_BONUS_BLKID, tag)) {
 
 		if (ds == dmu_buf_get_user(dbuf))
 			result = B_TRUE;
 		else
 			dmu_buf_rele(dbuf, tag);
 	}
 
 	return (result);
 }
 
 int
 dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, const void *tag,
     dsl_dataset_t **dsp)
 {
 	objset_t *mos = dp->dp_meta_objset;
 	dmu_buf_t *dbuf;
 	dsl_dataset_t *ds;
 	int err;
 	dmu_object_info_t doi;
 
 	ASSERT(dsl_pool_config_held(dp));
 
 	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
 	if (err != 0)
 		return (err);
 
 	/* Make sure dsobj has the correct object type. */
 	dmu_object_info_from_db(dbuf, &doi);
 	if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
 		dmu_buf_rele(dbuf, tag);
 		return (SET_ERROR(EINVAL));
 	}
 
 	ds = dmu_buf_get_user(dbuf);
 	if (ds == NULL) {
 		dsl_dataset_t *winner = NULL;
 
 		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
 		ds->ds_dbuf = dbuf;
 		ds->ds_object = dsobj;
 		ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0;
 		list_link_init(&ds->ds_synced_link);
 
 		err = dsl_dir_hold_obj(dp, dsl_dataset_phys(ds)->ds_dir_obj,
 		    NULL, ds, &ds->ds_dir);
 		if (err != 0) {
 			kmem_free(ds, sizeof (dsl_dataset_t));
 			dmu_buf_rele(dbuf, tag);
 			return (err);
 		}
 
 		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&ds->ds_remap_deadlist_lock,
 		    NULL, MUTEX_DEFAULT, NULL);
 		rrw_init(&ds->ds_bp_rwlock, B_FALSE);
 		zfs_refcount_create(&ds->ds_longholds);
 
 		bplist_create(&ds->ds_pending_deadlist);
 
 		list_create(&ds->ds_sendstreams, sizeof (dmu_sendstatus_t),
 		    offsetof(dmu_sendstatus_t, dss_link));
 
 		list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t),
 		    offsetof(dsl_prop_cb_record_t, cbr_ds_node));
 
 		if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
 			spa_feature_t f;
 
 			for (f = 0; f < SPA_FEATURES; f++) {
 				if (!(spa_feature_table[f].fi_flags &
 				    ZFEATURE_FLAG_PER_DATASET))
 					continue;
 				err = load_zfeature(mos, ds, f);
 			}
 		}
 
 		if (!ds->ds_is_snapshot) {
 			ds->ds_snapname[0] = '\0';
 			if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 				err = dsl_dataset_hold_obj(dp,
 				    dsl_dataset_phys(ds)->ds_prev_snap_obj,
 				    ds, &ds->ds_prev);
 			}
 			if (err != 0)
 				goto after_dsl_bookmark_fini;
 			err = dsl_bookmark_init_ds(ds);
 		} else {
 			if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
 				err = dsl_dataset_get_snapname(ds);
 			if (err == 0 &&
 			    dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
 				err = zap_count(
 				    ds->ds_dir->dd_pool->dp_meta_objset,
 				    dsl_dataset_phys(ds)->ds_userrefs_obj,
 				    &ds->ds_userrefs);
 			}
 		}
 
 		if (err == 0 && !ds->ds_is_snapshot) {
 			err = dsl_prop_get_int_ds(ds,
 			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
 			    &ds->ds_reserved);
 			if (err == 0) {
 				err = dsl_prop_get_int_ds(ds,
 				    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
 				    &ds->ds_quota);
 			}
 		} else {
 			ds->ds_reserved = ds->ds_quota = 0;
 		}
 
 		if (err == 0 && ds->ds_dir->dd_crypto_obj != 0 &&
 		    ds->ds_is_snapshot &&
 		    zap_contains(mos, dsobj, DS_FIELD_IVSET_GUID) != 0) {
 			dp->dp_spa->spa_errata =
 			    ZPOOL_ERRATA_ZOL_8308_ENCRYPTION;
 		}
 
 		dsl_deadlist_open(&ds->ds_deadlist,
 		    mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
 		uint64_t remap_deadlist_obj =
 		    dsl_dataset_get_remap_deadlist_object(ds);
 		if (remap_deadlist_obj != 0) {
 			dsl_deadlist_open(&ds->ds_remap_deadlist, mos,
 			    remap_deadlist_obj);
 		}
 
 		dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync,
 		    dsl_dataset_evict_async, &ds->ds_dbuf);
 		if (err == 0)
 			winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu);
 
 		if (err != 0 || winner != NULL) {
 			dsl_deadlist_close(&ds->ds_deadlist);
 			if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))
 				dsl_deadlist_close(&ds->ds_remap_deadlist);
 			dsl_bookmark_fini_ds(ds);
 after_dsl_bookmark_fini:
 			if (ds->ds_prev)
 				dsl_dataset_rele(ds->ds_prev, ds);
 			dsl_dir_rele(ds->ds_dir, ds);
 			for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
 				if (dsl_dataset_feature_is_active(ds, f))
 					unload_zfeature(ds, f);
 			}
 
 			list_destroy(&ds->ds_prop_cbs);
 			list_destroy(&ds->ds_sendstreams);
 			bplist_destroy(&ds->ds_pending_deadlist);
 			mutex_destroy(&ds->ds_lock);
 			mutex_destroy(&ds->ds_opening_lock);
 			mutex_destroy(&ds->ds_sendstream_lock);
 			mutex_destroy(&ds->ds_remap_deadlist_lock);
 			zfs_refcount_destroy(&ds->ds_longholds);
 			rrw_destroy(&ds->ds_bp_rwlock);
 			kmem_free(ds, sizeof (dsl_dataset_t));
 			if (err != 0) {
 				dmu_buf_rele(dbuf, tag);
 				return (err);
 			}
 			ds = winner;
 		} else {
 			ds->ds_fsid_guid =
 			    unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
 			if (ds->ds_fsid_guid !=
 			    dsl_dataset_phys(ds)->ds_fsid_guid) {
 				zfs_dbgmsg("ds_fsid_guid changed from "
 				    "%llx to %llx for pool %s dataset id %llu",
 				    (long long)
 				    dsl_dataset_phys(ds)->ds_fsid_guid,
 				    (long long)ds->ds_fsid_guid,
 				    spa_name(dp->dp_spa),
 				    (u_longlong_t)dsobj);
 			}
 		}
 	}
 
 	ASSERT3P(ds->ds_dbuf, ==, dbuf);
 	ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
 	ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
 	    spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
 	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
 	*dsp = ds;
 
 	return (0);
 }
 
 int
 dsl_dataset_create_key_mapping(dsl_dataset_t *ds)
 {
 	dsl_dir_t *dd = ds->ds_dir;
 
 	if (dd->dd_crypto_obj == 0)
 		return (0);
 
 	return (spa_keystore_create_mapping(dd->dd_pool->dp_spa,
 	    ds, ds, &ds->ds_key_mapping));
 }
 
 int
 dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj,
     ds_hold_flags_t flags, const void *tag, dsl_dataset_t **dsp)
 {
 	int err;
 
 	err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
 	if (err != 0)
 		return (err);
 
 	ASSERT3P(*dsp, !=, NULL);
 
 	if (flags & DS_HOLD_FLAG_DECRYPT) {
 		err = dsl_dataset_create_key_mapping(*dsp);
 		if (err != 0)
 			dsl_dataset_rele(*dsp, tag);
 	}
 
 	return (err);
 }
 
 int
 dsl_dataset_hold_flags(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
     const void *tag, dsl_dataset_t **dsp)
 {
 	dsl_dir_t *dd;
 	const char *snapname;
 	uint64_t obj;
 	int err = 0;
 	dsl_dataset_t *ds;
 
 	err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
 	if (err != 0)
 		return (err);
 
 	ASSERT(dsl_pool_config_held(dp));
 	obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
 	if (obj != 0)
 		err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag, &ds);
 	else
 		err = SET_ERROR(ENOENT);
 
 	/* we may be looking for a snapshot */
 	if (err == 0 && snapname != NULL) {
 		dsl_dataset_t *snap_ds;
 
 		if (*snapname++ != '@') {
 			dsl_dataset_rele_flags(ds, flags, tag);
 			dsl_dir_rele(dd, FTAG);
 			return (SET_ERROR(ENOENT));
 		}
 
 		dprintf("looking for snapshot '%s'\n", snapname);
 		err = dsl_dataset_snap_lookup(ds, snapname, &obj);
 		if (err == 0) {
 			err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag,
 			    &snap_ds);
 		}
 		dsl_dataset_rele_flags(ds, flags, tag);
 
 		if (err == 0) {
 			mutex_enter(&snap_ds->ds_lock);
 			if (snap_ds->ds_snapname[0] == 0)
 				(void) strlcpy(snap_ds->ds_snapname, snapname,
 				    sizeof (snap_ds->ds_snapname));
 			mutex_exit(&snap_ds->ds_lock);
 			ds = snap_ds;
 		}
 	}
 	if (err == 0)
 		*dsp = ds;
 	dsl_dir_rele(dd, FTAG);
 	return (err);
 }
 
 int
 dsl_dataset_hold(dsl_pool_t *dp, const char *name, const void *tag,
     dsl_dataset_t **dsp)
 {
 	return (dsl_dataset_hold_flags(dp, name, 0, tag, dsp));
 }
 
 static int
 dsl_dataset_own_obj_impl(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags,
     const void *tag, boolean_t override, dsl_dataset_t **dsp)
 {
 	int err = dsl_dataset_hold_obj_flags(dp, dsobj, flags, tag, dsp);
 	if (err != 0)
 		return (err);
 	if (!dsl_dataset_tryown(*dsp, tag, override)) {
 		dsl_dataset_rele_flags(*dsp, flags, tag);
 		*dsp = NULL;
 		return (SET_ERROR(EBUSY));
 	}
 	return (0);
 }
 
 
 int
 dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags,
     const void *tag, dsl_dataset_t **dsp)
 {
 	return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_FALSE, dsp));
 }
 
 int
 dsl_dataset_own_obj_force(dsl_pool_t *dp, uint64_t dsobj,
     ds_hold_flags_t flags, const void *tag, dsl_dataset_t **dsp)
 {
 	return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_TRUE, dsp));
 }
 
 static int
 dsl_dataset_own_impl(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
     const void *tag, boolean_t override, dsl_dataset_t **dsp)
 {
 	int err = dsl_dataset_hold_flags(dp, name, flags, tag, dsp);
 	if (err != 0)
 		return (err);
 	if (!dsl_dataset_tryown(*dsp, tag, override)) {
 		dsl_dataset_rele_flags(*dsp, flags, tag);
 		return (SET_ERROR(EBUSY));
 	}
 	return (0);
 }
 
 int
 dsl_dataset_own_force(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
     const void *tag, dsl_dataset_t **dsp)
 {
 	return (dsl_dataset_own_impl(dp, name, flags, tag, B_TRUE, dsp));
 }
 
 int
 dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
     const void *tag, dsl_dataset_t **dsp)
 {
 	return (dsl_dataset_own_impl(dp, name, flags, tag, B_FALSE, dsp));
 }
 
 /*
  * See the comment above dsl_pool_hold() for details.  In summary, a long
  * hold is used to prevent destruction of a dataset while the pool hold
  * is dropped, allowing other concurrent operations (e.g. spa_sync()).
  *
  * The dataset and pool must be held when this function is called.  After it
  * is called, the pool hold may be released while the dataset is still held
  * and accessed.
  */
 void
 dsl_dataset_long_hold(dsl_dataset_t *ds, const void *tag)
 {
 	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 	(void) zfs_refcount_add(&ds->ds_longholds, tag);
 }
 
 void
 dsl_dataset_long_rele(dsl_dataset_t *ds, const void *tag)
 {
 	(void) zfs_refcount_remove(&ds->ds_longholds, tag);
 }
 
 /* Return B_TRUE if there are any long holds on this dataset. */
 boolean_t
 dsl_dataset_long_held(dsl_dataset_t *ds)
 {
 	return (!zfs_refcount_is_zero(&ds->ds_longholds));
 }
 
 void
 dsl_dataset_name(dsl_dataset_t *ds, char *name)
 {
 	if (ds == NULL) {
 		(void) strlcpy(name, "mos", ZFS_MAX_DATASET_NAME_LEN);
 	} else {
 		dsl_dir_name(ds->ds_dir, name);
 		VERIFY0(dsl_dataset_get_snapname(ds));
 		if (ds->ds_snapname[0]) {
 			VERIFY3U(strlcat(name, "@", ZFS_MAX_DATASET_NAME_LEN),
 			    <, ZFS_MAX_DATASET_NAME_LEN);
 			/*
 			 * We use a "recursive" mutex so that we
 			 * can call dprintf_ds() with ds_lock held.
 			 */
 			if (!MUTEX_HELD(&ds->ds_lock)) {
 				mutex_enter(&ds->ds_lock);
 				VERIFY3U(strlcat(name, ds->ds_snapname,
 				    ZFS_MAX_DATASET_NAME_LEN), <,
 				    ZFS_MAX_DATASET_NAME_LEN);
 				mutex_exit(&ds->ds_lock);
 			} else {
 				VERIFY3U(strlcat(name, ds->ds_snapname,
 				    ZFS_MAX_DATASET_NAME_LEN), <,
 				    ZFS_MAX_DATASET_NAME_LEN);
 			}
 		}
 	}
 }
 
 int
 dsl_dataset_namelen(dsl_dataset_t *ds)
 {
 	VERIFY0(dsl_dataset_get_snapname(ds));
 	mutex_enter(&ds->ds_lock);
 	int len = strlen(ds->ds_snapname);
 	mutex_exit(&ds->ds_lock);
 	/* add '@' if ds is a snap */
 	if (len > 0)
 		len++;
 	len += dsl_dir_namelen(ds->ds_dir);
 	return (len);
 }
 
 void
 dsl_dataset_rele(dsl_dataset_t *ds, const void *tag)
 {
 	dmu_buf_rele(ds->ds_dbuf, tag);
 }
 
 void
 dsl_dataset_remove_key_mapping(dsl_dataset_t *ds)
 {
 	dsl_dir_t *dd = ds->ds_dir;
 
 	if (dd == NULL || dd->dd_crypto_obj == 0)
 		return;
 
 	(void) spa_keystore_remove_mapping(dd->dd_pool->dp_spa,
 	    ds->ds_object, ds);
 }
 
 void
 dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags,
     const void *tag)
 {
 	if (flags & DS_HOLD_FLAG_DECRYPT)
 		dsl_dataset_remove_key_mapping(ds);
 
 	dsl_dataset_rele(ds, tag);
 }
 
 void
 dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, const void *tag)
 {
 	ASSERT3P(ds->ds_owner, ==, tag);
 	ASSERT(ds->ds_dbuf != NULL);
 
 	mutex_enter(&ds->ds_lock);
 	ds->ds_owner = NULL;
 	mutex_exit(&ds->ds_lock);
 	dsl_dataset_long_rele(ds, tag);
 	dsl_dataset_rele_flags(ds, flags, tag);
 }
 
 boolean_t
 dsl_dataset_tryown(dsl_dataset_t *ds, const void *tag, boolean_t override)
 {
 	boolean_t gotit = FALSE;
 
 	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 	mutex_enter(&ds->ds_lock);
 	if (ds->ds_owner == NULL && (override || !(DS_IS_INCONSISTENT(ds) ||
 	    (dsl_dataset_feature_is_active(ds,
 	    SPA_FEATURE_REDACTED_DATASETS) &&
 	    !zfs_allow_redacted_dataset_mount)))) {
 		ds->ds_owner = tag;
 		dsl_dataset_long_hold(ds, tag);
 		gotit = TRUE;
 	}
 	mutex_exit(&ds->ds_lock);
 	return (gotit);
 }
 
 boolean_t
 dsl_dataset_has_owner(dsl_dataset_t *ds)
 {
 	boolean_t rv;
 	mutex_enter(&ds->ds_lock);
 	rv = (ds->ds_owner != NULL);
 	mutex_exit(&ds->ds_lock);
 	return (rv);
 }
 
 static boolean_t
 zfeature_active(spa_feature_t f, void *arg)
 {
 	switch (spa_feature_table[f].fi_type) {
 	case ZFEATURE_TYPE_BOOLEAN: {
 		boolean_t val = (boolean_t)(uintptr_t)arg;
 		ASSERT(val == B_FALSE || val == B_TRUE);
 		return (val);
 	}
 	case ZFEATURE_TYPE_UINT64_ARRAY:
 		/*
 		 * In this case, arg is a uint64_t array.  The feature is active
 		 * if the array is non-null.
 		 */
 		return (arg != NULL);
 	default:
 		panic("Invalid zfeature type %d", spa_feature_table[f].fi_type);
 		return (B_FALSE);
 	}
 }
 
 boolean_t
 dsl_dataset_feature_is_active(dsl_dataset_t *ds, spa_feature_t f)
 {
 	return (zfeature_active(f, ds->ds_feature[f]));
 }
 
 /*
  * The buffers passed out by this function are references to internal buffers;
  * they should not be freed by callers of this function, and they should not be
  * used after the dataset has been released.
  */
 boolean_t
 dsl_dataset_get_uint64_array_feature(dsl_dataset_t *ds, spa_feature_t f,
     uint64_t *outlength, uint64_t **outp)
 {
 	VERIFY(spa_feature_table[f].fi_type & ZFEATURE_TYPE_UINT64_ARRAY);
 	if (!dsl_dataset_feature_is_active(ds, f)) {
 		return (B_FALSE);
 	}
 	struct feature_type_uint64_array_arg *ftuaa = ds->ds_feature[f];
 	*outp = ftuaa->array;
 	*outlength = ftuaa->length;
 	return (B_TRUE);
 }
 
 void
 dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, void *arg,
     dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
 	uint64_t zero = 0;
 
 	VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
 
 	spa_feature_incr(spa, f, tx);
 	dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
 
 	switch (spa_feature_table[f].fi_type) {
 	case ZFEATURE_TYPE_BOOLEAN:
 		ASSERT3S((boolean_t)(uintptr_t)arg, ==, B_TRUE);
 		VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,
 		    sizeof (zero), 1, &zero, tx));
 		break;
 	case ZFEATURE_TYPE_UINT64_ARRAY:
 	{
 		struct feature_type_uint64_array_arg *ftuaa = arg;
 		VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,
 		    sizeof (uint64_t), ftuaa->length, ftuaa->array, tx));
 		break;
 	}
 	default:
 		panic("Invalid zfeature type %d", spa_feature_table[f].fi_type);
 	}
 }
 
 static void
 dsl_dataset_deactivate_feature_impl(dsl_dataset_t *ds, spa_feature_t f,
     dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
 	uint64_t dsobj = ds->ds_object;
 
 	VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
 
 	VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx));
 	spa_feature_decr(spa, f, tx);
 	ds->ds_feature[f] = NULL;
 }
 
 void
 dsl_dataset_deactivate_feature(dsl_dataset_t *ds, spa_feature_t f, dmu_tx_t *tx)
 {
 	unload_zfeature(ds, f);
 	dsl_dataset_deactivate_feature_impl(ds, f, tx);
 }
 
 uint64_t
 dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
     dsl_crypto_params_t *dcp, uint64_t flags, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dd->dd_pool;
 	dmu_buf_t *dbuf;
 	dsl_dataset_phys_t *dsphys;
 	uint64_t dsobj;
 	objset_t *mos = dp->dp_meta_objset;
 
 	if (origin == NULL)
 		origin = dp->dp_origin_snap;
 
 	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
 	ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
 
 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
 	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
 	dsphys = dbuf->db_data;
 	memset(dsphys, 0, sizeof (dsl_dataset_phys_t));
 	dsphys->ds_dir_obj = dd->dd_object;
 	dsphys->ds_flags = flags;
 	dsphys->ds_fsid_guid = unique_create();
 	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 	    sizeof (dsphys->ds_guid));
 	dsphys->ds_snapnames_zapobj =
 	    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
 	    DMU_OT_NONE, 0, tx);
 	dsphys->ds_creation_time = gethrestime_sec();
 	dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
 
 	if (origin == NULL) {
 		dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
 	} else {
 		dsl_dataset_t *ohds; /* head of the origin snapshot */
 
 		dsphys->ds_prev_snap_obj = origin->ds_object;
 		dsphys->ds_prev_snap_txg =
 		    dsl_dataset_phys(origin)->ds_creation_txg;
 		dsphys->ds_referenced_bytes =
 		    dsl_dataset_phys(origin)->ds_referenced_bytes;
 		dsphys->ds_compressed_bytes =
 		    dsl_dataset_phys(origin)->ds_compressed_bytes;
 		dsphys->ds_uncompressed_bytes =
 		    dsl_dataset_phys(origin)->ds_uncompressed_bytes;
 		rrw_enter(&origin->ds_bp_rwlock, RW_READER, FTAG);
 		dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;
 		rrw_exit(&origin->ds_bp_rwlock, FTAG);
 
 		/*
 		 * Inherit flags that describe the dataset's contents
 		 * (INCONSISTENT) or properties (Case Insensitive).
 		 */
 		dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
 		    (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
 
 		for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
 			if (zfeature_active(f, origin->ds_feature[f])) {
 				dsl_dataset_activate_feature(dsobj, f,
 				    origin->ds_feature[f], tx);
 			}
 		}
 
 		dmu_buf_will_dirty(origin->ds_dbuf, tx);
 		dsl_dataset_phys(origin)->ds_num_children++;
 
 		VERIFY0(dsl_dataset_hold_obj(dp,
 		    dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,
 		    FTAG, &ohds));
 		dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
 		    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
 		dsl_dataset_rele(ohds, FTAG);
 
 		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
 			if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {
 				dsl_dataset_phys(origin)->ds_next_clones_obj =
 				    zap_create(mos,
 				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 			}
 			VERIFY0(zap_add_int(mos,
 			    dsl_dataset_phys(origin)->ds_next_clones_obj,
 			    dsobj, tx));
 		}
 
 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
 		dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;
 		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
 			if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
 				dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
 				dsl_dir_phys(origin->ds_dir)->dd_clones =
 				    zap_create(mos,
 				    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
 			}
 			VERIFY0(zap_add_int(mos,
 			    dsl_dir_phys(origin->ds_dir)->dd_clones,
 			    dsobj, tx));
 		}
 	}
 
 	/* handle encryption */
 	dsl_dataset_create_crypt_sync(dsobj, dd, origin, dcp, tx);
 
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
 		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 
 	dmu_buf_rele(dbuf, FTAG);
 
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;
 
 	return (dsobj);
 }
 
 static void
 dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	objset_t *os;
 
 	VERIFY0(dmu_objset_from_ds(ds, &os));
 	if (memcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) {
 		dsl_pool_t *dp = ds->ds_dir->dd_pool;
 		zio_t *zio;
 
 		memset(&os->os_zil_header, 0, sizeof (os->os_zil_header));
 		if (os->os_encrypted)
 			os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
 
 		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 		dsl_dataset_sync(ds, zio, tx);
 		VERIFY0(zio_wait(zio));
 		dsl_dataset_sync_done(ds, tx);
 	}
 }
 
 uint64_t
 dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
     dsl_dataset_t *origin, uint64_t flags, cred_t *cr,
     dsl_crypto_params_t *dcp, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = pdd->dd_pool;
 	uint64_t dsobj, ddobj;
 	dsl_dir_t *dd;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(lastname[0] != '@');
 	/*
 	 * Filesystems will eventually have their origin set to dp_origin_snap,
 	 * but that's taken care of in dsl_dataset_create_sync_dd. When
 	 * creating a filesystem, this function is called with origin equal to
 	 * NULL.
 	 */
 	if (origin != NULL)
 		ASSERT3P(origin, !=, dp->dp_origin_snap);
 
 	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
 	VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
 
 	dsobj = dsl_dataset_create_sync_dd(dd, origin, dcp,
 	    flags & ~DS_CREATE_FLAG_NODIRTY, tx);
 
 	dsl_deleg_set_create_perms(dd, tx, cr);
 
 	/*
 	 * If we are creating a clone and the livelist feature is enabled,
 	 * add the entry DD_FIELD_LIVELIST to ZAP.
 	 */
 	if (origin != NULL &&
 	    spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LIVELIST)) {
 		objset_t *mos = dd->dd_pool->dp_meta_objset;
 		dsl_dir_zapify(dd, tx);
 		uint64_t obj = dsl_deadlist_alloc(mos, tx);
 		VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_LIVELIST,
 		    sizeof (uint64_t), 1, &obj, tx));
 		spa_feature_incr(dp->dp_spa, SPA_FEATURE_LIVELIST, tx);
 	}
 
 	/*
 	 * Since we're creating a new node we know it's a leaf, so we can
 	 * initialize the counts if the limit feature is active.
 	 */
 	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
 		uint64_t cnt = 0;
 		objset_t *os = dd->dd_pool->dp_meta_objset;
 
 		dsl_dir_zapify(dd, tx);
 		VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
 		    sizeof (cnt), 1, &cnt, tx));
 		VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
 		    sizeof (cnt), 1, &cnt, tx));
 	}
 
 	dsl_dir_rele(dd, FTAG);
 
 	/*
 	 * If we are creating a clone, make sure we zero out any stale
 	 * data from the origin snapshots zil header.
 	 */
 	if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
 		dsl_dataset_t *ds;
 
 		VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 		dsl_dataset_zero_zil(ds, tx);
 		dsl_dataset_rele(ds, FTAG);
 	}
 
 	return (dsobj);
 }
 
 /*
  * The unique space in the head dataset can be calculated by subtracting
  * the space used in the most recent snapshot, that is still being used
  * in this file system, from the space currently in use.  To figure out
  * the space in the most recent snapshot still in use, we need to take
  * the total space used in the snapshot and subtract out the space that
  * has been freed up since the snapshot was taken.
  */
 void
 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
 {
 	uint64_t mrs_used;
 	uint64_t dlused, dlcomp, dluncomp;
 
 	ASSERT(!ds->ds_is_snapshot);
 
 	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)
 		mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;
 	else
 		mrs_used = 0;
 
 	dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
 
 	ASSERT3U(dlused, <=, mrs_used);
 	dsl_dataset_phys(ds)->ds_unique_bytes =
 	    dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);
 
 	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
 	    SPA_VERSION_UNIQUE_ACCURATE)
 		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 }
 
 void
 dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
     dmu_tx_t *tx)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	uint64_t count __maybe_unused;
 	int err;
 
 	ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);
 	err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
 	    obj, tx);
 	/*
 	 * The err should not be ENOENT, but a bug in a previous version
 	 * of the code could cause upgrade_clones_cb() to not set
 	 * ds_next_snap_obj when it should, leading to a missing entry.
 	 * If we knew that the pool was created after
 	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
 	 * ENOENT.  However, at least we can check that we don't have
 	 * too many entries in the next_clones_obj even after failing to
 	 * remove this one.
 	 */
 	if (err != ENOENT)
 		VERIFY0(err);
 	ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
 	    &count));
 	ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
 }
 
 
 blkptr_t *
 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
 {
 	return (&dsl_dataset_phys(ds)->ds_bp);
 }
 
 spa_t *
 dsl_dataset_get_spa(dsl_dataset_t *ds)
 {
 	return (ds->ds_dir->dd_pool->dp_spa);
 }
 
 void
 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp;
 
 	if (ds == NULL) /* this is the meta-objset */
 		return;
 
 	ASSERT(ds->ds_objset != NULL);
 
 	if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
 		panic("dirtying snapshot!");
 
 	/* Must not dirty a dataset in the same txg where it got snapshotted. */
 	ASSERT3U(tx->tx_txg, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
 
 	dp = ds->ds_dir->dd_pool;
 	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
 		objset_t *os = ds->ds_objset;
 
 		/* up the hold count until we can be written out */
 		dmu_buf_add_ref(ds->ds_dbuf, ds);
 
 		/* if this dataset is encrypted, grab a reference to the DCK */
 		if (ds->ds_dir->dd_crypto_obj != 0 &&
 		    !os->os_raw_receive &&
 		    !os->os_next_write_raw[tx->tx_txg & TXG_MASK]) {
 			ASSERT3P(ds->ds_key_mapping, !=, NULL);
 			key_mapping_add_ref(ds->ds_key_mapping, ds);
 		}
 	}
 }
 
 static int
 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	uint64_t asize;
 
 	if (!dmu_tx_is_syncing(tx))
 		return (0);
 
 	/*
 	 * If there's an fs-only reservation, any blocks that might become
 	 * owned by the snapshot dataset must be accommodated by space
 	 * outside of the reservation.
 	 */
 	ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
 	asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
 	if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
 		return (SET_ERROR(ENOSPC));
 
 	/*
 	 * Propagate any reserved space for this snapshot to other
 	 * snapshot checks in this sync group.
 	 */
 	if (asize > 0)
 		dsl_dir_willuse_space(ds->ds_dir, asize, tx);
 
 	return (0);
 }
 
 int
 dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
     dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr, proc_t *proc)
 {
 	int error;
 	uint64_t value;
 
 	ds->ds_trysnap_txg = tx->tx_txg;
 
 	if (!dmu_tx_is_syncing(tx))
 		return (0);
 
 	/*
 	 * We don't allow multiple snapshots of the same txg.  If there
 	 * is already one, try again.
 	 */
 	if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
 		return (SET_ERROR(EAGAIN));
 
 	/*
 	 * Check for conflicting snapshot name.
 	 */
 	error = dsl_dataset_snap_lookup(ds, snapname, &value);
 	if (error == 0)
 		return (SET_ERROR(EEXIST));
 	if (error != ENOENT)
 		return (error);
 
 	/*
 	 * We don't allow taking snapshots of inconsistent datasets, such as
 	 * those into which we are currently receiving.  However, if we are
 	 * creating this snapshot as part of a receive, this check will be
 	 * executed atomically with respect to the completion of the receive
 	 * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
 	 * case we ignore this, knowing it will be fixed up for us shortly in
 	 * dmu_recv_end_sync().
 	 */
 	if (!recv && DS_IS_INCONSISTENT(ds))
 		return (SET_ERROR(EBUSY));
 
 	/*
 	 * Skip the check for temporary snapshots or if we have already checked
 	 * the counts in dsl_dataset_snapshot_check. This means we really only
 	 * check the count here when we're receiving a stream.
 	 */
 	if (cnt != 0 && cr != NULL) {
 		error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
 		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr, proc);
 		if (error != 0)
 			return (error);
 	}
 
 	error = dsl_dataset_snapshot_reserve_space(ds, tx);
 	if (error != 0)
 		return (error);
 
 	return (0);
 }
 
 int
 dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_snapshot_arg_t *ddsa = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	nvpair_t *pair;
 	int rv = 0;
 
 	/*
 	 * Pre-compute how many total new snapshots will be created for each
 	 * level in the tree and below. This is needed for validating the
 	 * snapshot limit when either taking a recursive snapshot or when
 	 * taking multiple snapshots.
 	 *
 	 * The problem is that the counts are not actually adjusted when
 	 * we are checking, only when we finally sync. For a single snapshot,
 	 * this is easy, the count will increase by 1 at each node up the tree,
 	 * but its more complicated for the recursive/multiple snapshot case.
 	 *
 	 * The dsl_fs_ss_limit_check function does recursively check the count
 	 * at each level up the tree but since it is validating each snapshot
 	 * independently we need to be sure that we are validating the complete
 	 * count for the entire set of snapshots. We do this by rolling up the
 	 * counts for each component of the name into an nvlist and then
 	 * checking each of those cases with the aggregated count.
 	 *
 	 * This approach properly handles not only the recursive snapshot
 	 * case (where we get all of those on the ddsa_snaps list) but also
 	 * the sibling case (e.g. snapshot a/b and a/c so that we will also
 	 * validate the limit on 'a' using a count of 2).
 	 *
 	 * We validate the snapshot names in the third loop and only report
 	 * name errors once.
 	 */
 	if (dmu_tx_is_syncing(tx)) {
 		char *nm;
 		nvlist_t *cnt_track = NULL;
 		cnt_track = fnvlist_alloc();
 
 		nm = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 
 		/* Rollup aggregated counts into the cnt_track list */
 		for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
 		    pair != NULL;
 		    pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
 			char *pdelim;
 			uint64_t val;
 
 			(void) strlcpy(nm, nvpair_name(pair), MAXPATHLEN);
 			pdelim = strchr(nm, '@');
 			if (pdelim == NULL)
 				continue;
 			*pdelim = '\0';
 
 			do {
 				if (nvlist_lookup_uint64(cnt_track, nm,
 				    &val) == 0) {
 					/* update existing entry */
 					fnvlist_add_uint64(cnt_track, nm,
 					    val + 1);
 				} else {
 					/* add to list */
 					fnvlist_add_uint64(cnt_track, nm, 1);
 				}
 
 				pdelim = strrchr(nm, '/');
 				if (pdelim != NULL)
 					*pdelim = '\0';
 			} while (pdelim != NULL);
 		}
 
 		kmem_free(nm, MAXPATHLEN);
 
 		/* Check aggregated counts at each level */
 		for (pair = nvlist_next_nvpair(cnt_track, NULL);
 		    pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
 			int error = 0;
 			const char *name;
 			uint64_t cnt = 0;
 			dsl_dataset_t *ds;
 
 			name = nvpair_name(pair);
 			cnt = fnvpair_value_uint64(pair);
 			ASSERT(cnt > 0);
 
 			error = dsl_dataset_hold(dp, name, FTAG, &ds);
 			if (error == 0) {
 				error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
 				    ZFS_PROP_SNAPSHOT_LIMIT, NULL,
 				    ddsa->ddsa_cr, ddsa->ddsa_proc);
 				dsl_dataset_rele(ds, FTAG);
 			}
 
 			if (error != 0) {
 				if (ddsa->ddsa_errors != NULL)
 					fnvlist_add_int32(ddsa->ddsa_errors,
 					    name, error);
 				rv = error;
 				/* only report one error for this check */
 				break;
 			}
 		}
 		nvlist_free(cnt_track);
 	}
 
 	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
 		int error = 0;
 		dsl_dataset_t *ds;
 		const char *name, *atp = NULL;
 		char dsname[ZFS_MAX_DATASET_NAME_LEN];
 
 		name = nvpair_name(pair);
 		if (strlen(name) >= ZFS_MAX_DATASET_NAME_LEN)
 			error = SET_ERROR(ENAMETOOLONG);
 		if (error == 0) {
 			atp = strchr(name, '@');
 			if (atp == NULL)
 				error = SET_ERROR(EINVAL);
 			if (error == 0)
 				(void) strlcpy(dsname, name, atp - name + 1);
 		}
 		if (error == 0)
 			error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
 		if (error == 0) {
 			/* passing 0/NULL skips dsl_fs_ss_limit_check */
 			error = dsl_dataset_snapshot_check_impl(ds,
 			    atp + 1, tx, B_FALSE, 0, NULL, NULL);
 			dsl_dataset_rele(ds, FTAG);
 		}
 
 		if (error != 0) {
 			if (ddsa->ddsa_errors != NULL) {
 				fnvlist_add_int32(ddsa->ddsa_errors,
 				    name, error);
 			}
 			rv = error;
 		}
 	}
 
 	return (rv);
 }
 
 void
 dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
     dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dmu_buf_t *dbuf;
 	dsl_dataset_phys_t *dsphys;
 	uint64_t dsobj, crtxg;
 	objset_t *mos = dp->dp_meta_objset;
 	objset_t *os __maybe_unused;
 
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 
 	/*
 	 * If we are on an old pool, the zil must not be active, in which
 	 * case it will be zeroed.  Usually zil_suspend() accomplishes this.
 	 */
 	ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
 	    dmu_objset_from_ds(ds, &os) != 0 ||
 	    memcmp(&os->os_phys->os_zil_header, &zero_zil,
 	    sizeof (zero_zil)) == 0);
 
 	/* Should not snapshot a dirty dataset. */
 	ASSERT(!txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
 	    ds, tx->tx_txg));
 
 	dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);
 
 	/*
 	 * The origin's ds_creation_txg has to be < TXG_INITIAL
 	 */
 	if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
 		crtxg = 1;
 	else
 		crtxg = tx->tx_txg;
 
 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
 	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
 	dsphys = dbuf->db_data;
 	memset(dsphys, 0, sizeof (dsl_dataset_phys_t));
 	dsphys->ds_dir_obj = ds->ds_dir->dd_object;
 	dsphys->ds_fsid_guid = unique_create();
 	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 	    sizeof (dsphys->ds_guid));
 	dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 	dsphys->ds_next_snap_obj = ds->ds_object;
 	dsphys->ds_num_children = 1;
 	dsphys->ds_creation_time = gethrestime_sec();
 	dsphys->ds_creation_txg = crtxg;
 	dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
 	dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;
 	dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;
 	dsphys->ds_uncompressed_bytes =
 	    dsl_dataset_phys(ds)->ds_uncompressed_bytes;
 	dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 	dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 	dmu_buf_rele(dbuf, FTAG);
 
 	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
 		if (zfeature_active(f, ds->ds_feature[f])) {
 			dsl_dataset_activate_feature(dsobj, f,
 			    ds->ds_feature[f], tx);
 		}
 	}
 
 	ASSERT3U(ds->ds_prev != 0, ==,
 	    dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
 	if (ds->ds_prev) {
 		uint64_t next_clones_obj =
 		    dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;
 		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
 		    ds->ds_object ||
 		    dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);
 		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
 		    ds->ds_object) {
 			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 			ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
 			    dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);
 			dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;
 		} else if (next_clones_obj != 0) {
 			dsl_dataset_remove_from_next_clones(ds->ds_prev,
 			    dsphys->ds_next_snap_obj, tx);
 			VERIFY0(zap_add_int(mos,
 			    next_clones_obj, dsobj, tx));
 		}
 	}
 
 	/*
 	 * If we have a reference-reservation on this dataset, we will
 	 * need to increase the amount of refreservation being charged
 	 * since our unique space is going to zero.
 	 */
 	if (ds->ds_reserved) {
 		int64_t delta;
 		ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
 		delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
 		    ds->ds_reserved);
 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
 		    delta, 0, 0, tx);
 	}
 
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	dsl_dataset_phys(ds)->ds_deadlist_obj =
 	    dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
 	    dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
 	dsl_deadlist_close(&ds->ds_deadlist);
 	dsl_deadlist_open(&ds->ds_deadlist, mos,
 	    dsl_dataset_phys(ds)->ds_deadlist_obj);
 	dsl_deadlist_add_key(&ds->ds_deadlist,
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
 	dsl_bookmark_snapshotted(ds, tx);
 
 	if (dsl_dataset_remap_deadlist_exists(ds)) {
 		uint64_t remap_deadlist_obj =
 		    dsl_dataset_get_remap_deadlist_object(ds);
 		/*
 		 * Move the remap_deadlist to the snapshot.  The head
 		 * will create a new remap deadlist on demand, from
 		 * dsl_dataset_block_remapped().
 		 */
 		dsl_dataset_unset_remap_deadlist_object(ds, tx);
 		dsl_deadlist_close(&ds->ds_remap_deadlist);
 
 		dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_REMAP_DEADLIST,
 		    sizeof (remap_deadlist_obj), 1, &remap_deadlist_obj, tx));
 	}
 
 	/*
 	 * Create a ivset guid for this snapshot if the dataset is
 	 * encrypted. This may be overridden by a raw receive. A
 	 * previous implementation of this code did not have this
 	 * field as part of the on-disk format for ZFS encryption
 	 * (see errata #4). As part of the remediation for this
 	 * issue, we ask the user to enable the bookmark_v2 feature
 	 * which is now a dependency of the encryption feature. We
 	 * use this as a heuristic to determine when the user has
 	 * elected to correct any datasets created with the old code.
 	 * As a result, we only do this step if the bookmark_v2
 	 * feature is enabled, which limits the number of states a
 	 * given pool / dataset can be in with regards to terms of
 	 * correcting the issue.
 	 */
 	if (ds->ds_dir->dd_crypto_obj != 0 &&
 	    spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2)) {
 		uint64_t ivset_guid = unique_create();
 
 		dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_IVSET_GUID,
 		    sizeof (ivset_guid), 1, &ivset_guid, tx));
 	}
 
 	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
 	dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
 	dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
 	dsl_dataset_phys(ds)->ds_unique_bytes = 0;
 
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
 		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 
 	VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,
 	    snapname, 8, 1, &dsobj, tx));
 
 	if (ds->ds_prev)
 		dsl_dataset_rele(ds->ds_prev, ds);
 	VERIFY0(dsl_dataset_hold_obj(dp,
 	    dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));
 
 	dsl_scan_ds_snapshotted(ds, tx);
 
 	dsl_dir_snap_cmtime_update(ds->ds_dir, tx);
 
 	if (zfs_snapshot_history_enabled)
 		spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, " ");
 }
 
 void
 dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_snapshot_arg_t *ddsa = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	nvpair_t *pair;
 
 	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
 		dsl_dataset_t *ds;
 		const char *name, *atp;
 		char dsname[ZFS_MAX_DATASET_NAME_LEN];
 
 		name = nvpair_name(pair);
 		atp = strchr(name, '@');
 		(void) strlcpy(dsname, name, atp - name + 1);
 		VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));
 
 		dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);
 		if (ddsa->ddsa_props != NULL) {
 			dsl_props_set_sync_impl(ds->ds_prev,
 			    ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
 		}
 		dsl_dataset_rele(ds, FTAG);
 	}
 }
 
 /*
  * The snapshots must all be in the same pool.
  * All-or-nothing: if there are any failures, nothing will be modified.
  */
 int
 dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
 {
 	dsl_dataset_snapshot_arg_t ddsa;
 	nvpair_t *pair;
 	boolean_t needsuspend;
 	int error;
 	spa_t *spa;
 	const char *firstname;
 	nvlist_t *suspended = NULL;
 
 	pair = nvlist_next_nvpair(snaps, NULL);
 	if (pair == NULL)
 		return (0);
 	firstname = nvpair_name(pair);
 
 	error = spa_open(firstname, &spa, FTAG);
 	if (error != 0)
 		return (error);
 	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
 	spa_close(spa, FTAG);
 
 	if (needsuspend) {
 		suspended = fnvlist_alloc();
 		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
 		    pair = nvlist_next_nvpair(snaps, pair)) {
 			char fsname[ZFS_MAX_DATASET_NAME_LEN];
 			const char *snapname = nvpair_name(pair);
 			const char *atp;
 			void *cookie;
 
 			atp = strchr(snapname, '@');
 			if (atp == NULL) {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 			(void) strlcpy(fsname, snapname, atp - snapname + 1);
 
 			error = zil_suspend(fsname, &cookie);
 			if (error != 0)
 				break;
 			fnvlist_add_uint64(suspended, fsname,
 			    (uintptr_t)cookie);
 		}
 	}
 
 	ddsa.ddsa_snaps = snaps;
 	ddsa.ddsa_props = props;
 	ddsa.ddsa_errors = errors;
 	ddsa.ddsa_cr = CRED();
 	ddsa.ddsa_proc = curproc;
 
 	if (error == 0) {
 		error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
 		    dsl_dataset_snapshot_sync, &ddsa,
 		    fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);
 	}
 
 	if (suspended != NULL) {
 		for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;
 		    pair = nvlist_next_nvpair(suspended, pair)) {
 			zil_resume((void *)(uintptr_t)
 			    fnvpair_value_uint64(pair));
 		}
 		fnvlist_free(suspended);
 	}
 
 	if (error == 0) {
 		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
 		    pair = nvlist_next_nvpair(snaps, pair)) {
 			zvol_create_minor(nvpair_name(pair));
 		}
 	}
 
 	return (error);
 }
 
 typedef struct dsl_dataset_snapshot_tmp_arg {
 	const char *ddsta_fsname;
 	const char *ddsta_snapname;
 	minor_t ddsta_cleanup_minor;
 	const char *ddsta_htag;
 } dsl_dataset_snapshot_tmp_arg_t;
 
 static int
 dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	int error;
 
 	error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	/* NULL cred means no limit check for tmp snapshot */
 	error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
 	    tx, B_FALSE, 0, NULL, NULL);
 	if (error != 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (error);
 	}
 
 	if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 	error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag,
 	    B_TRUE, tx);
 	if (error != 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (error);
 	}
 
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 static void
 dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds = NULL;
 
 	VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));
 
 	dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
 	dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
 	    ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
 	dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);
 
 	dsl_dataset_rele(ds, FTAG);
 }
 
 int
 dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
     minor_t cleanup_minor, const char *htag)
 {
 	dsl_dataset_snapshot_tmp_arg_t ddsta;
 	int error;
 	spa_t *spa;
 	boolean_t needsuspend;
 	void *cookie;
 
 	ddsta.ddsta_fsname = fsname;
 	ddsta.ddsta_snapname = snapname;
 	ddsta.ddsta_cleanup_minor = cleanup_minor;
 	ddsta.ddsta_htag = htag;
 
 	error = spa_open(fsname, &spa, FTAG);
 	if (error != 0)
 		return (error);
 	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
 	spa_close(spa, FTAG);
 
 	if (needsuspend) {
 		error = zil_suspend(fsname, &cookie);
 		if (error != 0)
 			return (error);
 	}
 
 	error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
 	    dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);
 
 	if (needsuspend)
 		zil_resume(cookie);
 	return (error);
 }
 
 /* Nonblocking dataset sync. Assumes dataset:objset is always 1:1 */
 void
 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *rio, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(ds->ds_objset != NULL);
 	ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0);
 
 	/*
 	 * in case we had to change ds_fsid_guid when we opened it,
 	 * sync it out now.
 	 */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
 
 	if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) {
 		VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
 		    ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1,
 		    &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx));
 		VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
 		    ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1,
 		    &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx));
 		VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
 		    ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1,
 		    &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx));
 		ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0;
 		ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0;
 		ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
 	}
 
 	dmu_objset_sync(ds->ds_objset, rio, tx);
 }
 
 /*
  * Check if the percentage of blocks shared between the clone and the
  * snapshot (as opposed to those that are clone only) is below a certain
  * threshold
  */
 static boolean_t
 dsl_livelist_should_disable(dsl_dataset_t *ds)
 {
 	uint64_t used, referenced;
 	int percent_shared;
 
 	used = dsl_dir_get_usedds(ds->ds_dir);
 	referenced = dsl_get_referenced(ds);
 	if (referenced == 0)
 		return (B_FALSE);
 	percent_shared = (100 * (referenced - used)) / referenced;
 	if (percent_shared <= zfs_livelist_min_percent_shared)
 		return (B_TRUE);
 	return (B_FALSE);
 }
 
 /*
  *  Check if it is possible to combine two livelist entries into one.
  *  This is the case if the combined number of 'live' blkptrs (ALLOCs that
  *  don't have a matching FREE) is under the maximum sublist size.
  *  We check this by subtracting twice the total number of frees from the total
  *  number of blkptrs. FREEs are counted twice because each FREE blkptr
  *  will cancel out an ALLOC blkptr when the livelist is processed.
  */
 static boolean_t
 dsl_livelist_should_condense(dsl_deadlist_entry_t *first,
     dsl_deadlist_entry_t *next)
 {
 	uint64_t total_free = first->dle_bpobj.bpo_phys->bpo_num_freed +
 	    next->dle_bpobj.bpo_phys->bpo_num_freed;
 	uint64_t total_entries = first->dle_bpobj.bpo_phys->bpo_num_blkptrs +
 	    next->dle_bpobj.bpo_phys->bpo_num_blkptrs;
 	if ((total_entries - (2 * total_free)) < zfs_livelist_max_entries)
 		return (B_TRUE);
 	return (B_FALSE);
 }
 
 typedef struct try_condense_arg {
 	spa_t *spa;
 	dsl_dataset_t *ds;
 } try_condense_arg_t;
 
 /*
  * Iterate over the livelist entries, searching for a pair to condense.
  * A nonzero return value means stop, 0 means keep looking.
  */
 static int
 dsl_livelist_try_condense(void *arg, dsl_deadlist_entry_t *first)
 {
 	try_condense_arg_t *tca = arg;
 	spa_t *spa = tca->spa;
 	dsl_dataset_t *ds = tca->ds;
 	dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;
 	dsl_deadlist_entry_t *next;
 
 	/* The condense thread has not yet been created at import */
 	if (spa->spa_livelist_condense_zthr == NULL)
 		return (1);
 
 	/* A condense is already in progress */
 	if (spa->spa_to_condense.ds != NULL)
 		return (1);
 
 	next = AVL_NEXT(&ll->dl_tree, &first->dle_node);
 	/* The livelist has only one entry - don't condense it */
 	if (next == NULL)
 		return (1);
 
 	/* Next is the newest entry - don't condense it */
 	if (AVL_NEXT(&ll->dl_tree, &next->dle_node) == NULL)
 		return (1);
 
 	/* This pair is not ready to condense but keep looking */
 	if (!dsl_livelist_should_condense(first, next))
 		return (0);
 
 	/*
 	 * Add a ref to prevent the dataset from being evicted while
 	 * the condense zthr or synctask are running. Ref will be
 	 * released at the end of the condense synctask
 	 */
 	dmu_buf_add_ref(ds->ds_dbuf, spa);
 
 	spa->spa_to_condense.ds = ds;
 	spa->spa_to_condense.first = first;
 	spa->spa_to_condense.next = next;
 	spa->spa_to_condense.syncing = B_FALSE;
 	spa->spa_to_condense.cancelled = B_FALSE;
 
 	zthr_wakeup(spa->spa_livelist_condense_zthr);
 	return (1);
 }
 
 static void
 dsl_flush_pending_livelist(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd = ds->ds_dir;
 	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
 	dsl_deadlist_entry_t *last = dsl_deadlist_last(&dd->dd_livelist);
 
 	/* Check if we need to add a new sub-livelist */
 	if (last == NULL) {
 		/* The livelist is empty */
 		dsl_deadlist_add_key(&dd->dd_livelist,
 		    tx->tx_txg - 1, tx);
 	} else if (spa_sync_pass(spa) == 1) {
 		/*
 		 * Check if the newest entry is full. If it is, make a new one.
 		 * We only do this once per sync because we could overfill a
 		 * sublist in one sync pass and don't want to add another entry
 		 * for a txg that is already represented. This ensures that
 		 * blkptrs born in the same txg are stored in the same sublist.
 		 */
 		bpobj_t bpobj = last->dle_bpobj;
 		uint64_t all = bpobj.bpo_phys->bpo_num_blkptrs;
 		uint64_t free = bpobj.bpo_phys->bpo_num_freed;
 		uint64_t alloc = all - free;
 		if (alloc > zfs_livelist_max_entries) {
 			dsl_deadlist_add_key(&dd->dd_livelist,
 			    tx->tx_txg - 1, tx);
 		}
 	}
 
 	/* Insert each entry into the on-disk livelist */
 	bplist_iterate(&dd->dd_pending_allocs,
 	    dsl_deadlist_insert_alloc_cb, &dd->dd_livelist, tx);
 	bplist_iterate(&dd->dd_pending_frees,
 	    dsl_deadlist_insert_free_cb, &dd->dd_livelist, tx);
 
 	/* Attempt to condense every pair of adjacent entries */
 	try_condense_arg_t arg = {
 	    .spa = spa,
 	    .ds = ds
 	};
 	dsl_deadlist_iterate(&dd->dd_livelist, dsl_livelist_try_condense,
 	    &arg);
 }
 
 void
 dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	objset_t *os = ds->ds_objset;
 
 	bplist_iterate(&ds->ds_pending_deadlist,
 	    dsl_deadlist_insert_alloc_cb, &ds->ds_deadlist, tx);
 
 	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) {
 		dsl_flush_pending_livelist(ds, tx);
 		if (dsl_livelist_should_disable(ds)) {
 			dsl_dir_remove_livelist(ds->ds_dir, tx, B_TRUE);
 		}
 	}
 
 	dsl_bookmark_sync_done(ds, tx);
 
 	multilist_destroy(&os->os_synced_dnodes);
 
 	if (os->os_encrypted)
 		os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_FALSE;
 	else
 		ASSERT0(os->os_next_write_raw[tx->tx_txg & TXG_MASK]);
 
 	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
 		if (zfeature_active(f,
 		    ds->ds_feature_activation[f])) {
 			if (zfeature_active(f, ds->ds_feature[f]))
 				continue;
 			dsl_dataset_activate_feature(ds->ds_object, f,
 			    ds->ds_feature_activation[f], tx);
 			ds->ds_feature[f] = ds->ds_feature_activation[f];
 		}
 	}
 
 	ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx)));
 }
 
 int
 get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val)
 {
 	uint64_t count = 0;
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 
 	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 
 	/*
 	 * There may be missing entries in ds_next_clones_obj
 	 * due to a bug in a previous version of the code.
 	 * Only trust it if it has the right number of entries.
 	 */
 	if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
 		VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
 		    &count));
 	}
 	if (count != dsl_dataset_phys(ds)->ds_num_children - 1) {
 		return (SET_ERROR(ENOENT));
 	}
 	for (zap_cursor_init(&zc, mos,
 	    dsl_dataset_phys(ds)->ds_next_clones_obj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		dsl_dataset_t *clone;
 		char buf[ZFS_MAX_DATASET_NAME_LEN];
 		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
 		    za.za_first_integer, FTAG, &clone));
 		dsl_dir_name(clone->ds_dir, buf);
 		fnvlist_add_boolean(val, buf);
 		dsl_dataset_rele(clone, FTAG);
 	}
 	zap_cursor_fini(&zc);
 	return (0);
 }
 
 void
 get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
 {
 	nvlist_t *propval = fnvlist_alloc();
 	nvlist_t *val = fnvlist_alloc();
 
 	if (get_clones_stat_impl(ds, val) == 0) {
 		fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
 		fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
 		    propval);
 	}
 
 	nvlist_free(val);
 	nvlist_free(propval);
 }
 
 static char *
 get_receive_resume_token_impl(dsl_dataset_t *ds)
 {
 	if (!dsl_dataset_has_resume_receive_state(ds))
 		return (NULL);
 
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	char *str;
 	void *packed;
 	uint8_t *compressed;
 	uint64_t val;
 	nvlist_t *token_nv = fnvlist_alloc();
 	size_t packed_size, compressed_size;
 
 	if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
 	    DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) {
 		fnvlist_add_uint64(token_nv, "fromguid", val);
 	}
 	if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
 	    DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) {
 		fnvlist_add_uint64(token_nv, "object", val);
 	}
 	if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
 	    DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) {
 		fnvlist_add_uint64(token_nv, "offset", val);
 	}
 	if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
 	    DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) {
 		fnvlist_add_uint64(token_nv, "bytes", val);
 	}
 	if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
 	    DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) {
 		fnvlist_add_uint64(token_nv, "toguid", val);
 	}
 	char buf[MAXNAMELEN];
 	if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
 	    DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) {
 		fnvlist_add_string(token_nv, "toname", buf);
 	}
 	if (zap_contains(dp->dp_meta_objset, ds->ds_object,
 	    DS_FIELD_RESUME_LARGEBLOCK) == 0) {
 		fnvlist_add_boolean(token_nv, "largeblockok");
 	}
 	if (zap_contains(dp->dp_meta_objset, ds->ds_object,
 	    DS_FIELD_RESUME_EMBEDOK) == 0) {
 		fnvlist_add_boolean(token_nv, "embedok");
 	}
 	if (zap_contains(dp->dp_meta_objset, ds->ds_object,
 	    DS_FIELD_RESUME_COMPRESSOK) == 0) {
 		fnvlist_add_boolean(token_nv, "compressok");
 	}
 	if (zap_contains(dp->dp_meta_objset, ds->ds_object,
 	    DS_FIELD_RESUME_RAWOK) == 0) {
 		fnvlist_add_boolean(token_nv, "rawok");
 	}
 	if (dsl_dataset_feature_is_active(ds,
 	    SPA_FEATURE_REDACTED_DATASETS)) {
 		uint64_t num_redact_snaps = 0;
 		uint64_t *redact_snaps = NULL;
 		VERIFY3B(dsl_dataset_get_uint64_array_feature(ds,
 		    SPA_FEATURE_REDACTED_DATASETS, &num_redact_snaps,
 		    &redact_snaps), ==, B_TRUE);
 		fnvlist_add_uint64_array(token_nv, "redact_snaps",
 		    redact_snaps, num_redact_snaps);
 	}
 	if (zap_contains(dp->dp_meta_objset, ds->ds_object,
 	    DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS) == 0) {
 		uint64_t num_redact_snaps = 0, int_size = 0;
 		uint64_t *redact_snaps = NULL;
 		VERIFY0(zap_length(dp->dp_meta_objset, ds->ds_object,
 		    DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, &int_size,
 		    &num_redact_snaps));
 		ASSERT3U(int_size, ==, sizeof (uint64_t));
 
 		redact_snaps = kmem_alloc(int_size * num_redact_snaps,
 		    KM_SLEEP);
 		VERIFY0(zap_lookup(dp->dp_meta_objset, ds->ds_object,
 		    DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, int_size,
 		    num_redact_snaps, redact_snaps));
 		fnvlist_add_uint64_array(token_nv, "book_redact_snaps",
 		    redact_snaps, num_redact_snaps);
 		kmem_free(redact_snaps, int_size * num_redact_snaps);
 	}
 	packed = fnvlist_pack(token_nv, &packed_size);
 	fnvlist_free(token_nv);
 	compressed = kmem_alloc(packed_size, KM_SLEEP);
 
 	compressed_size = gzip_compress(packed, compressed,
 	    packed_size, packed_size, 6);
 
 	zio_cksum_t cksum;
 	fletcher_4_native_varsize(compressed, compressed_size, &cksum);
 
 	size_t alloc_size = compressed_size * 2 + 1;
 	str = kmem_alloc(alloc_size, KM_SLEEP);
 	for (int i = 0; i < compressed_size; i++) {
 		size_t offset = i * 2;
 		(void) snprintf(str + offset, alloc_size - offset,
 	    "%02x", compressed[i]);
 	}
 	str[compressed_size * 2] = '\0';
 	char *propval = kmem_asprintf("%u-%llx-%llx-%s",
 	    ZFS_SEND_RESUME_TOKEN_VERSION,
 	    (longlong_t)cksum.zc_word[0],
 	    (longlong_t)packed_size, str);
 	kmem_free(packed, packed_size);
 	kmem_free(str, alloc_size);
 	kmem_free(compressed, packed_size);
 	return (propval);
 }
 
 /*
  * Returns a string that represents the receive resume state token. It should
  * be freed with strfree(). NULL is returned if no resume state is present.
  */
 char *
 get_receive_resume_token(dsl_dataset_t *ds)
 {
 	/*
 	 * A failed "newfs" (e.g. full) resumable receive leaves
 	 * the stats set on this dataset.  Check here for the prop.
 	 */
 	char *token = get_receive_resume_token_impl(ds);
 	if (token != NULL)
 		return (token);
 	/*
 	 * A failed incremental resumable receive leaves the
 	 * stats set on our child named "%recv".  Check the child
 	 * for the prop.
 	 */
 	/* 6 extra bytes for /%recv */
 	char name[ZFS_MAX_DATASET_NAME_LEN + 6];
 	dsl_dataset_t *recv_ds;
 	dsl_dataset_name(ds, name);
 	if (strlcat(name, "/", sizeof (name)) < sizeof (name) &&
 	    strlcat(name, recv_clone_name, sizeof (name)) < sizeof (name) &&
 	    dsl_dataset_hold(ds->ds_dir->dd_pool, name, FTAG, &recv_ds) == 0) {
 		token = get_receive_resume_token_impl(recv_ds);
 		dsl_dataset_rele(recv_ds, FTAG);
 	}
 	return (token);
 }
 
 uint64_t
 dsl_get_refratio(dsl_dataset_t *ds)
 {
 	uint64_t ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 :
 	    (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /
 	    dsl_dataset_phys(ds)->ds_compressed_bytes);
 	return (ratio);
 }
 
 uint64_t
 dsl_get_logicalreferenced(dsl_dataset_t *ds)
 {
 	return (dsl_dataset_phys(ds)->ds_uncompressed_bytes);
 }
 
 uint64_t
 dsl_get_compressratio(dsl_dataset_t *ds)
 {
 	if (ds->ds_is_snapshot) {
 		return (dsl_get_refratio(ds));
 	} else {
 		dsl_dir_t *dd = ds->ds_dir;
 		mutex_enter(&dd->dd_lock);
 		uint64_t val = dsl_dir_get_compressratio(dd);
 		mutex_exit(&dd->dd_lock);
 		return (val);
 	}
 }
 
 uint64_t
 dsl_get_used(dsl_dataset_t *ds)
 {
 	if (ds->ds_is_snapshot) {
 		return (dsl_dataset_phys(ds)->ds_unique_bytes);
 	} else {
 		dsl_dir_t *dd = ds->ds_dir;
 		mutex_enter(&dd->dd_lock);
 		uint64_t val = dsl_dir_get_used(dd);
 		mutex_exit(&dd->dd_lock);
 		return (val);
 	}
 }
 
 uint64_t
 dsl_get_creation(dsl_dataset_t *ds)
 {
 	return (dsl_dataset_phys(ds)->ds_creation_time);
 }
 
 uint64_t
 dsl_get_creationtxg(dsl_dataset_t *ds)
 {
 	return (dsl_dataset_phys(ds)->ds_creation_txg);
 }
 
 uint64_t
 dsl_get_refquota(dsl_dataset_t *ds)
 {
 	return (ds->ds_quota);
 }
 
 uint64_t
 dsl_get_refreservation(dsl_dataset_t *ds)
 {
 	return (ds->ds_reserved);
 }
 
 uint64_t
 dsl_get_guid(dsl_dataset_t *ds)
 {
 	return (dsl_dataset_phys(ds)->ds_guid);
 }
 
 uint64_t
 dsl_get_unique(dsl_dataset_t *ds)
 {
 	return (dsl_dataset_phys(ds)->ds_unique_bytes);
 }
 
 uint64_t
 dsl_get_objsetid(dsl_dataset_t *ds)
 {
 	return (ds->ds_object);
 }
 
 uint64_t
 dsl_get_userrefs(dsl_dataset_t *ds)
 {
 	return (ds->ds_userrefs);
 }
 
 uint64_t
 dsl_get_defer_destroy(dsl_dataset_t *ds)
 {
 	return (DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
 }
 
 uint64_t
 dsl_get_referenced(dsl_dataset_t *ds)
 {
 	return (dsl_dataset_phys(ds)->ds_referenced_bytes);
 }
 
 uint64_t
 dsl_get_numclones(dsl_dataset_t *ds)
 {
 	ASSERT(ds->ds_is_snapshot);
 	return (dsl_dataset_phys(ds)->ds_num_children - 1);
 }
 
 uint64_t
 dsl_get_inconsistent(dsl_dataset_t *ds)
 {
 	return ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT) ?
 	    1 : 0);
 }
 
 uint64_t
 dsl_get_redacted(dsl_dataset_t *ds)
 {
 	return (dsl_dataset_feature_is_active(ds,
 	    SPA_FEATURE_REDACTED_DATASETS));
 }
 
 uint64_t
 dsl_get_available(dsl_dataset_t *ds)
 {
 	uint64_t refdbytes = dsl_get_referenced(ds);
 	uint64_t availbytes = dsl_dir_space_available(ds->ds_dir,
 	    NULL, 0, TRUE);
 	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
 		availbytes +=
 		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
 	}
 	if (ds->ds_quota != 0) {
 		/*
 		 * Adjust available bytes according to refquota
 		 */
 		if (refdbytes < ds->ds_quota) {
 			availbytes = MIN(availbytes,
 			    ds->ds_quota - refdbytes);
 		} else {
 			availbytes = 0;
 		}
 	}
 	return (availbytes);
 }
 
 int
 dsl_get_written(dsl_dataset_t *ds, uint64_t *written)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dsl_dataset_t *prev;
 	int err = dsl_dataset_hold_obj(dp,
 	    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
 	if (err == 0) {
 		uint64_t comp, uncomp;
 		err = dsl_dataset_space_written(prev, ds, written,
 		    &comp, &uncomp);
 		dsl_dataset_rele(prev, FTAG);
 	}
 	return (err);
 }
 
 /*
  * 'snap' should be a buffer of size ZFS_MAX_DATASET_NAME_LEN.
  */
 int
 dsl_get_prev_snap(dsl_dataset_t *ds, char *snap)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {
 		dsl_dataset_name(ds->ds_prev, snap);
 		return (0);
 	} else {
 		return (SET_ERROR(ENOENT));
 	}
 }
 
 void
 dsl_get_redact_snaps(dsl_dataset_t *ds, nvlist_t *propval)
 {
 	uint64_t nsnaps;
 	uint64_t *snaps;
 	if (dsl_dataset_get_uint64_array_feature(ds,
 	    SPA_FEATURE_REDACTED_DATASETS, &nsnaps, &snaps)) {
 		fnvlist_add_uint64_array(propval, ZPROP_VALUE, snaps,
 		    nsnaps);
 	}
 }
 
 /*
  * Returns the mountpoint property and source for the given dataset in the value
  * and source buffers. The value buffer must be at least as large as MAXPATHLEN
  * and the source buffer as least as large a ZFS_MAX_DATASET_NAME_LEN.
  * Returns 0 on success and an error on failure.
  */
 int
 dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value,
     char *source)
 {
 	int error;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 
 	/* Retrieve the mountpoint value stored in the zap object */
 	error = dsl_prop_get_ds(ds, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), 1,
 	    ZAP_MAXVALUELEN, value, source);
 	if (error != 0) {
 		return (error);
 	}
 
 	/*
 	 * Process the dsname and source to find the full mountpoint string.
 	 * Can be skipped for 'legacy' or 'none'.
 	 */
 	if (value[0] == '/') {
 		char *buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
 		char *root = buf;
 		const char *relpath;
 
 		/*
 		 * If we inherit the mountpoint, even from a dataset
 		 * with a received value, the source will be the path of
 		 * the dataset we inherit from. If source is
 		 * ZPROP_SOURCE_VAL_RECVD, the received value is not
 		 * inherited.
 		 */
 		if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) {
 			relpath = "";
 		} else {
 			ASSERT0(strncmp(dsname, source, strlen(source)));
 			relpath = dsname + strlen(source);
 			if (relpath[0] == '/')
 				relpath++;
 		}
 
 		spa_altroot(dp->dp_spa, root, ZAP_MAXVALUELEN);
 
 		/*
 		 * Special case an alternate root of '/'. This will
 		 * avoid having multiple leading slashes in the
 		 * mountpoint path.
 		 */
 		if (strcmp(root, "/") == 0)
 			root++;
 
 		/*
 		 * If the mountpoint is '/' then skip over this
 		 * if we are obtaining either an alternate root or
 		 * an inherited mountpoint.
 		 */
 		char *mnt = value;
 		if (value[1] == '\0' && (root[0] != '\0' ||
 		    relpath[0] != '\0'))
 			mnt = value + 1;
 
 		mnt = kmem_strdup(mnt);
 
 		if (relpath[0] == '\0') {
 			(void) snprintf(value, ZAP_MAXVALUELEN, "%s%s",
 			    root, mnt);
 		} else {
 			(void) snprintf(value, ZAP_MAXVALUELEN, "%s%s%s%s",
 			    root, mnt, relpath[0] == '@' ? "" : "/",
 			    relpath);
 		}
 		kmem_free(buf, ZAP_MAXVALUELEN);
 		kmem_strfree(mnt);
 	}
 
 	return (0);
 }
 
 void
 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
 {
 	dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool;
 
 	ASSERT(dsl_pool_config_held(dp));
 
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO,
 	    dsl_get_refratio(ds));
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
 	    dsl_get_logicalreferenced(ds));
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
 	    dsl_get_compressratio(ds));
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
 	    dsl_get_used(ds));
 
 	if (ds->ds_is_snapshot) {
 		get_clones_stat(ds, nv);
 	} else {
 		char buf[ZFS_MAX_DATASET_NAME_LEN];
 		if (dsl_get_prev_snap(ds, buf) == 0)
 			dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP,
 			    buf);
 		dsl_dir_stats(ds->ds_dir, nv);
 	}
 
 	nvlist_t *propval = fnvlist_alloc();
 	dsl_get_redact_snaps(ds, propval);
 	fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS),
 	    propval);
 	nvlist_free(propval);
 
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE,
 	    dsl_get_available(ds));
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED,
 	    dsl_get_referenced(ds));
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
 	    dsl_get_creation(ds));
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
 	    dsl_get_creationtxg(ds));
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
 	    dsl_get_refquota(ds));
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
 	    dsl_get_refreservation(ds));
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
 	    dsl_get_guid(ds));
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
 	    dsl_get_unique(ds));
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
 	    dsl_get_objsetid(ds));
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
 	    dsl_get_userrefs(ds));
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
 	    dsl_get_defer_destroy(ds));
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOTS_CHANGED,
 	    dsl_dir_snap_cmtime(ds->ds_dir).tv_sec);
 	dsl_dataset_crypt_stats(ds, nv);
 
 	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 		uint64_t written;
 		if (dsl_get_written(ds, &written) == 0) {
 			dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
 			    written);
 		}
 	}
 
 	if (!dsl_dataset_is_snapshot(ds)) {
 		char *token = get_receive_resume_token(ds);
 		if (token != NULL) {
 			dsl_prop_nvlist_add_string(nv,
 			    ZFS_PROP_RECEIVE_RESUME_TOKEN, token);
 			kmem_strfree(token);
 		}
 	}
 }
 
 void
 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
 {
 	dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool;
 	ASSERT(dsl_pool_config_held(dp));
 
 	stat->dds_creation_txg = dsl_get_creationtxg(ds);
 	stat->dds_inconsistent = dsl_get_inconsistent(ds);
 	stat->dds_guid = dsl_get_guid(ds);
 	stat->dds_redacted = dsl_get_redacted(ds);
 	stat->dds_origin[0] = '\0';
 	if (ds->ds_is_snapshot) {
 		stat->dds_is_snapshot = B_TRUE;
 		stat->dds_num_clones = dsl_get_numclones(ds);
 	} else {
 		stat->dds_is_snapshot = B_FALSE;
 		stat->dds_num_clones = 0;
 
 		if (dsl_dir_is_clone(ds->ds_dir)) {
 			dsl_dir_get_origin(ds->ds_dir, stat->dds_origin);
 		}
 	}
 }
 
 uint64_t
 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
 {
 	return (ds->ds_fsid_guid);
 }
 
 void
 dsl_dataset_space(dsl_dataset_t *ds,
     uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp)
 {
 	*refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes;
 	*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
 	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes)
 		*availbytesp +=
 		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
 	if (ds->ds_quota != 0) {
 		/*
 		 * Adjust available bytes according to refquota
 		 */
 		if (*refdbytesp < ds->ds_quota)
 			*availbytesp = MIN(*availbytesp,
 			    ds->ds_quota - *refdbytesp);
 		else
 			*availbytesp = 0;
 	}
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 	*usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
 }
 
 boolean_t
 dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
 {
 	dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool;
 	uint64_t birth;
 
 	ASSERT(dsl_pool_config_held(dp));
 	if (snap == NULL)
 		return (B_FALSE);
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
-	birth = dsl_dataset_get_blkptr(ds)->blk_birth;
+	birth = BP_GET_LOGICAL_BIRTH(dsl_dataset_get_blkptr(ds));
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 	if (birth > dsl_dataset_phys(snap)->ds_creation_txg) {
 		objset_t *os, *os_snap;
 		/*
 		 * It may be that only the ZIL differs, because it was
 		 * reset in the head.  Don't count that as being
 		 * modified.
 		 */
 		if (dmu_objset_from_ds(ds, &os) != 0)
 			return (B_TRUE);
 		if (dmu_objset_from_ds(snap, &os_snap) != 0)
 			return (B_TRUE);
 		return (memcmp(&os->os_phys->os_meta_dnode,
 		    &os_snap->os_phys->os_meta_dnode,
 		    sizeof (os->os_phys->os_meta_dnode)) != 0);
 	}
 	return (B_FALSE);
 }
 
 static int
 dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
     dsl_dataset_t *hds, void *arg)
 {
 	(void) dp;
 	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
 	int error;
 	uint64_t val;
 
 	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
 	if (error != 0) {
 		/* ignore nonexistent snapshots */
 		return (error == ENOENT ? 0 : error);
 	}
 
 	/* new name should not exist */
 	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
 	if (error == 0)
 		error = SET_ERROR(EEXIST);
 	else if (error == ENOENT)
 		error = 0;
 
 	/* dataset name + 1 for the "@" + the new snapshot name must fit */
 	if (dsl_dir_namelen(hds->ds_dir) + 1 +
 	    strlen(ddrsa->ddrsa_newsnapname) >= ZFS_MAX_DATASET_NAME_LEN)
 		error = SET_ERROR(ENAMETOOLONG);
 
 	return (error);
 }
 
 int
 dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *hds;
 	int error;
 
 	error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
 	if (error != 0)
 		return (error);
 
 	if (ddrsa->ddrsa_recursive) {
 		error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
 		    dsl_dataset_rename_snapshot_check_impl, ddrsa,
 		    DS_FIND_CHILDREN);
 	} else {
 		error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
 	}
 	dsl_dataset_rele(hds, FTAG);
 	return (error);
 }
 
 static int
 dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
     dsl_dataset_t *hds, void *arg)
 {
 	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
 	dsl_dataset_t *ds;
 	uint64_t val;
 	dmu_tx_t *tx = ddrsa->ddrsa_tx;
 	int error;
 
 	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
 	ASSERT(error == 0 || error == ENOENT);
 	if (error == ENOENT) {
 		/* ignore nonexistent snapshots */
 		return (0);
 	}
 
 	VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));
 
 	/* log before we change the name */
 	spa_history_log_internal_ds(ds, "rename", tx,
 	    "-> @%s", ddrsa->ddrsa_newsnapname);
 
 	VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
 	    B_FALSE));
 	mutex_enter(&ds->ds_lock);
 	(void) strlcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname,
 	    sizeof (ds->ds_snapname));
 	mutex_exit(&ds->ds_lock);
 	VERIFY0(zap_add(dp->dp_meta_objset,
 	    dsl_dataset_phys(hds)->ds_snapnames_zapobj,
 	    ds->ds_snapname, 8, 1, &ds->ds_object, tx));
 	zvol_rename_minors(dp->dp_spa, ddrsa->ddrsa_oldsnapname,
 	    ddrsa->ddrsa_newsnapname, B_TRUE);
 
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 void
 dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *hds = NULL;
 
 	VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
 	ddrsa->ddrsa_tx = tx;
 	if (ddrsa->ddrsa_recursive) {
 		VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
 		    dsl_dataset_rename_snapshot_sync_impl, ddrsa,
 		    DS_FIND_CHILDREN));
 	} else {
 		VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
 	}
 	dsl_dataset_rele(hds, FTAG);
 }
 
 int
 dsl_dataset_rename_snapshot(const char *fsname,
     const char *oldsnapname, const char *newsnapname, boolean_t recursive)
 {
 	dsl_dataset_rename_snapshot_arg_t ddrsa;
 
 	ddrsa.ddrsa_fsname = fsname;
 	ddrsa.ddrsa_oldsnapname = oldsnapname;
 	ddrsa.ddrsa_newsnapname = newsnapname;
 	ddrsa.ddrsa_recursive = recursive;
 
 	return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
 	    dsl_dataset_rename_snapshot_sync, &ddrsa,
 	    1, ZFS_SPACE_CHECK_RESERVED));
 }
 
 /*
  * If we're doing an ownership handoff, we need to make sure that there is
  * only one long hold on the dataset.  We're not allowed to change anything here
  * so we don't permanently release the long hold or regular hold here.  We want
  * to do this only when syncing to avoid the dataset unexpectedly going away
  * when we release the long hold.
  */
 static int
 dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
 {
 	boolean_t held = B_FALSE;
 
 	if (!dmu_tx_is_syncing(tx))
 		return (0);
 
 	dsl_dir_t *dd = ds->ds_dir;
 	mutex_enter(&dd->dd_activity_lock);
 	uint64_t holds = zfs_refcount_count(&ds->ds_longholds) -
 	    (owner != NULL ? 1 : 0);
 	/*
 	 * The value of dd_activity_waiters can chance as soon as we drop the
 	 * lock, but we're fine with that; new waiters coming in or old
 	 * waiters leaving doesn't cause problems, since we're going to cancel
 	 * waiters later anyway. The goal of this check is to verify that no
 	 * non-waiters have long-holds, and all new long-holds will be
 	 * prevented because we're holding the pool config as writer.
 	 */
 	if (holds != dd->dd_activity_waiters)
 		held = B_TRUE;
 	mutex_exit(&dd->dd_activity_lock);
 
 	if (held)
 		return (SET_ERROR(EBUSY));
 
 	return (0);
 }
 
 int
 dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_rollback_arg_t *ddra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	int64_t unused_refres_delta;
 	int error;
 
 	error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	/* must not be a snapshot */
 	if (ds->ds_is_snapshot) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* must have a most recent snapshot */
 	if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(ESRCH));
 	}
 
 	/*
 	 * No rollback to a snapshot created in the current txg, because
 	 * the rollback may dirty the dataset and create blocks that are
 	 * not reachable from the rootbp while having a birth txg that
 	 * falls into the snapshot's range.
 	 */
 	if (dmu_tx_is_syncing(tx) &&
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(EAGAIN));
 	}
 
 	/*
 	 * If the expected target snapshot is specified, then check that
 	 * the latest snapshot is it.
 	 */
 	if (ddra->ddra_tosnap != NULL) {
 		dsl_dataset_t *snapds;
 
 		/* Check if the target snapshot exists at all. */
 		error = dsl_dataset_hold(dp, ddra->ddra_tosnap, FTAG, &snapds);
 		if (error != 0) {
 			/*
 			 * ESRCH is used to signal that the target snapshot does
 			 * not exist, while ENOENT is used to report that
 			 * the rolled back dataset does not exist.
 			 * ESRCH is also used to cover other cases where the
 			 * target snapshot is not related to the dataset being
 			 * rolled back such as being in a different pool.
 			 */
 			if (error == ENOENT || error == EXDEV)
 				error = SET_ERROR(ESRCH);
 			dsl_dataset_rele(ds, FTAG);
 			return (error);
 		}
 		ASSERT(snapds->ds_is_snapshot);
 
 		/* Check if the snapshot is the latest snapshot indeed. */
 		if (snapds != ds->ds_prev) {
 			/*
 			 * Distinguish between the case where the only problem
 			 * is intervening snapshots (EEXIST) vs the snapshot
 			 * not being a valid target for rollback (ESRCH).
 			 */
 			if (snapds->ds_dir == ds->ds_dir ||
 			    (dsl_dir_is_clone(ds->ds_dir) &&
 			    dsl_dir_phys(ds->ds_dir)->dd_origin_obj ==
 			    snapds->ds_object)) {
 				error = SET_ERROR(EEXIST);
 			} else {
 				error = SET_ERROR(ESRCH);
 			}
 			dsl_dataset_rele(snapds, FTAG);
 			dsl_dataset_rele(ds, FTAG);
 			return (error);
 		}
 		dsl_dataset_rele(snapds, FTAG);
 	}
 
 	/* must not have any bookmarks after the most recent snapshot */
 	if (dsl_bookmark_latest_txg(ds) >
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(EEXIST));
 	}
 
 	error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
 	if (error != 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (error);
 	}
 
 	/*
 	 * Check if the snap we are rolling back to uses more than
 	 * the refquota.
 	 */
 	if (ds->ds_quota != 0 &&
 	    dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(EDQUOT));
 	}
 
 	/*
 	 * When we do the clone swap, we will temporarily use more space
 	 * due to the refreservation (the head will no longer have any
 	 * unique space, so the entire amount of the refreservation will need
 	 * to be free).  We will immediately destroy the clone, freeing
 	 * this space, but the freeing happens over many txg's.
 	 */
 	unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
 	    dsl_dataset_phys(ds)->ds_unique_bytes);
 
 	if (unused_refres_delta > 0 &&
 	    unused_refres_delta >
 	    dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(ENOSPC));
 	}
 
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 void
 dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_rollback_arg_t *ddra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds, *clone;
 	uint64_t cloneobj;
 	char namebuf[ZFS_MAX_DATASET_NAME_LEN];
 
 	VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));
 
 	dsl_dataset_name(ds->ds_prev, namebuf);
 	fnvlist_add_string(ddra->ddra_result, "target", namebuf);
 
 	cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
 	    ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, NULL, tx);
 
 	VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));
 
 	dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
 	dsl_dataset_zero_zil(ds, tx);
 
 	dsl_destroy_head_sync_impl(clone, tx);
 
 	dsl_dataset_rele(clone, FTAG);
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /*
  * Rolls back the given filesystem or volume to the most recent snapshot.
  * The name of the most recent snapshot will be returned under key "target"
  * in the result nvlist.
  *
  * If owner != NULL:
  * - The existing dataset MUST be owned by the specified owner at entry
  * - Upon return, dataset will still be held by the same owner, whether we
  *   succeed or not.
  *
  * This mode is required any time the existing filesystem is mounted.  See
  * notes above zfs_suspend_fs() for further details.
  */
 int
 dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner,
     nvlist_t *result)
 {
 	dsl_dataset_rollback_arg_t ddra;
 
 	ddra.ddra_fsname = fsname;
 	ddra.ddra_tosnap = tosnap;
 	ddra.ddra_owner = owner;
 	ddra.ddra_result = result;
 
 	return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
 	    dsl_dataset_rollback_sync, &ddra,
 	    1, ZFS_SPACE_CHECK_RESERVED));
 }
 
 struct promotenode {
 	list_node_t link;
 	dsl_dataset_t *ds;
 };
 
 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
 static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
     const void *tag);
 static void promote_rele(dsl_dataset_promote_arg_t *ddpa, const void *tag);
 
 int
 dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_promote_arg_t *ddpa = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *hds;
 	struct promotenode *snap;
 	int err;
 	uint64_t unused;
 	uint64_t ss_mv_cnt;
 	size_t max_snap_len;
 	boolean_t conflicting_snaps;
 
 	err = promote_hold(ddpa, dp, FTAG);
 	if (err != 0)
 		return (err);
 
 	hds = ddpa->ddpa_clone;
 	max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1;
 
 	if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
 		promote_rele(ddpa, FTAG);
 		return (SET_ERROR(EXDEV));
 	}
 
 	snap = list_head(&ddpa->shared_snaps);
 	if (snap == NULL) {
 		err = SET_ERROR(ENOENT);
 		goto out;
 	}
 	dsl_dataset_t *const origin_ds = snap->ds;
 
 	/*
 	 * Encrypted clones share a DSL Crypto Key with their origin's dsl dir.
 	 * When doing a promote we must make sure the encryption root for
 	 * both the target and the target's origin does not change to avoid
 	 * needing to rewrap encryption keys
 	 */
 	err = dsl_dataset_promote_crypt_check(hds->ds_dir, origin_ds->ds_dir);
 	if (err != 0)
 		goto out;
 
 	/*
 	 * Compute and check the amount of space to transfer.  Since this is
 	 * so expensive, don't do the preliminary check.
 	 */
 	if (!dmu_tx_is_syncing(tx)) {
 		promote_rele(ddpa, FTAG);
 		return (0);
 	}
 
 	/* compute origin's new unique space */
 	snap = list_tail(&ddpa->clone_snaps);
 	ASSERT(snap != NULL);
 	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
 	    origin_ds->ds_object);
 	dsl_deadlist_space_range(&snap->ds->ds_deadlist,
 	    dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
 	    &ddpa->unique, &unused, &unused);
 
 	/*
 	 * Walk the snapshots that we are moving
 	 *
 	 * Compute space to transfer.  Consider the incremental changes
 	 * to used by each snapshot:
 	 * (my used) = (prev's used) + (blocks born) - (blocks killed)
 	 * So each snapshot gave birth to:
 	 * (blocks born) = (my used) - (prev's used) + (blocks killed)
 	 * So a sequence would look like:
 	 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
 	 * Which simplifies to:
 	 * uN + kN + kN-1 + ... + k1 + k0
 	 * Note however, if we stop before we reach the ORIGIN we get:
 	 * uN + kN + kN-1 + ... + kM - uM-1
 	 */
 	conflicting_snaps = B_FALSE;
 	ss_mv_cnt = 0;
 	ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
 	ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
 	ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
 	for (snap = list_head(&ddpa->shared_snaps); snap;
 	    snap = list_next(&ddpa->shared_snaps, snap)) {
 		uint64_t val, dlused, dlcomp, dluncomp;
 		dsl_dataset_t *ds = snap->ds;
 
 		ss_mv_cnt++;
 
 		/*
 		 * If there are long holds, we won't be able to evict
 		 * the objset.
 		 */
 		if (dsl_dataset_long_held(ds)) {
 			err = SET_ERROR(EBUSY);
 			goto out;
 		}
 
 		/* Check that the snapshot name does not conflict */
 		VERIFY0(dsl_dataset_get_snapname(ds));
 		if (strlen(ds->ds_snapname) >= max_snap_len) {
 			err = SET_ERROR(ENAMETOOLONG);
 			goto out;
 		}
 		err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
 		if (err == 0) {
 			fnvlist_add_boolean(ddpa->err_ds,
 			    snap->ds->ds_snapname);
 			conflicting_snaps = B_TRUE;
 		} else if (err != ENOENT) {
 			goto out;
 		}
 
 		/* The very first snapshot does not have a deadlist */
 		if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
 			continue;
 
 		dsl_deadlist_space(&ds->ds_deadlist,
 		    &dlused, &dlcomp, &dluncomp);
 		ddpa->used += dlused;
 		ddpa->comp += dlcomp;
 		ddpa->uncomp += dluncomp;
 	}
 
 	/*
 	 * Check that bookmarks that are being transferred don't have
 	 * name conflicts.
 	 */
 	for (dsl_bookmark_node_t *dbn = avl_first(&origin_ds->ds_bookmarks);
 	    dbn != NULL && dbn->dbn_phys.zbm_creation_txg <=
 	    dsl_dataset_phys(origin_ds)->ds_creation_txg;
 	    dbn = AVL_NEXT(&origin_ds->ds_bookmarks, dbn)) {
 		if (strlen(dbn->dbn_name) >= max_snap_len) {
 			err = SET_ERROR(ENAMETOOLONG);
 			goto out;
 		}
 		zfs_bookmark_phys_t bm;
 		err = dsl_bookmark_lookup_impl(ddpa->ddpa_clone,
 		    dbn->dbn_name, &bm);
 
 		if (err == 0) {
 			fnvlist_add_boolean(ddpa->err_ds, dbn->dbn_name);
 			conflicting_snaps = B_TRUE;
 		} else if (err == ESRCH) {
 			err = 0;
 		}
 		if (err != 0) {
 			goto out;
 		}
 	}
 
 	/*
 	 * In order to return the full list of conflicting snapshots, we check
 	 * whether there was a conflict after traversing all of them.
 	 */
 	if (conflicting_snaps) {
 		err = SET_ERROR(EEXIST);
 		goto out;
 	}
 
 	/*
 	 * If we are a clone of a clone then we never reached ORIGIN,
 	 * so we need to subtract out the clone origin's used space.
 	 */
 	if (ddpa->origin_origin) {
 		ddpa->used -=
 		    dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
 		ddpa->comp -=
 		    dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
 		ddpa->uncomp -=
 		    dsl_dataset_phys(ddpa->origin_origin)->
 		    ds_uncompressed_bytes;
 	}
 
 	/* Check that there is enough space and limit headroom here */
 	err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
 	    0, ss_mv_cnt, ddpa->used, ddpa->cr, ddpa->proc);
 	if (err != 0)
 		goto out;
 
 	/*
 	 * Compute the amounts of space that will be used by snapshots
 	 * after the promotion (for both origin and clone).  For each,
 	 * it is the amount of space that will be on all of their
 	 * deadlists (that was not born before their new origin).
 	 */
 	if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 		uint64_t space;
 
 		/*
 		 * Note, typically this will not be a clone of a clone,
 		 * so dd_origin_txg will be < TXG_INITIAL, so
 		 * these snaplist_space() -> dsl_deadlist_space_range()
 		 * calls will be fast because they do not have to
 		 * iterate over all bps.
 		 */
 		snap = list_head(&ddpa->origin_snaps);
 		if (snap == NULL) {
 			err = SET_ERROR(ENOENT);
 			goto out;
 		}
 		err = snaplist_space(&ddpa->shared_snaps,
 		    snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
 		if (err != 0)
 			goto out;
 
 		err = snaplist_space(&ddpa->clone_snaps,
 		    snap->ds->ds_dir->dd_origin_txg, &space);
 		if (err != 0)
 			goto out;
 		ddpa->cloneusedsnap += space;
 	}
 	if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
 	    DD_FLAG_USED_BREAKDOWN) {
 		err = snaplist_space(&ddpa->origin_snaps,
 		    dsl_dataset_phys(origin_ds)->ds_creation_txg,
 		    &ddpa->originusedsnap);
 		if (err != 0)
 			goto out;
 	}
 
 out:
 	promote_rele(ddpa, FTAG);
 	return (err);
 }
 
 void
 dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_promote_arg_t *ddpa = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *hds;
 	struct promotenode *snap;
 	dsl_dataset_t *origin_ds;
 	dsl_dataset_t *origin_head;
 	dsl_dir_t *dd;
 	dsl_dir_t *odd = NULL;
 	uint64_t oldnext_obj;
 	int64_t delta;
 
 	ASSERT(nvlist_empty(ddpa->err_ds));
 
 	VERIFY0(promote_hold(ddpa, dp, FTAG));
 	hds = ddpa->ddpa_clone;
 
 	ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);
 
 	snap = list_head(&ddpa->shared_snaps);
 	origin_ds = snap->ds;
 	dd = hds->ds_dir;
 
 	snap = list_head(&ddpa->origin_snaps);
 	origin_head = snap->ds;
 
 	/*
 	 * We need to explicitly open odd, since origin_ds's dd will be
 	 * changing.
 	 */
 	VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
 	    NULL, FTAG, &odd));
 
 	dsl_dataset_promote_crypt_sync(hds->ds_dir, odd, tx);
 
 	/* change origin's next snap */
 	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
 	oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
 	snap = list_tail(&ddpa->clone_snaps);
 	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
 	    origin_ds->ds_object);
 	dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;
 
 	/* change the origin's next clone */
 	if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
 		dsl_dataset_remove_from_next_clones(origin_ds,
 		    snap->ds->ds_object, tx);
 		VERIFY0(zap_add_int(dp->dp_meta_objset,
 		    dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
 		    oldnext_obj, tx));
 	}
 
 	/* change origin */
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
 	dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
 	dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
 	dmu_buf_will_dirty(odd->dd_dbuf, tx);
 	dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
 	origin_head->ds_dir->dd_origin_txg =
 	    dsl_dataset_phys(origin_ds)->ds_creation_txg;
 
 	/* change dd_clone entries */
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
 		VERIFY0(zap_remove_int(dp->dp_meta_objset,
 		    dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
 		VERIFY0(zap_add_int(dp->dp_meta_objset,
 		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
 		    hds->ds_object, tx));
 
 		VERIFY0(zap_remove_int(dp->dp_meta_objset,
 		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
 		    origin_head->ds_object, tx));
 		if (dsl_dir_phys(dd)->dd_clones == 0) {
 			dsl_dir_phys(dd)->dd_clones =
 			    zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
 			    DMU_OT_NONE, 0, tx);
 		}
 		VERIFY0(zap_add_int(dp->dp_meta_objset,
 		    dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
 	}
 
 	/*
 	 * Move bookmarks to this dir.
 	 */
 	dsl_bookmark_node_t *dbn_next;
 	for (dsl_bookmark_node_t *dbn = avl_first(&origin_head->ds_bookmarks);
 	    dbn != NULL && dbn->dbn_phys.zbm_creation_txg <=
 	    dsl_dataset_phys(origin_ds)->ds_creation_txg;
 	    dbn = dbn_next) {
 		dbn_next = AVL_NEXT(&origin_head->ds_bookmarks, dbn);
 
 		avl_remove(&origin_head->ds_bookmarks, dbn);
 		VERIFY0(zap_remove(dp->dp_meta_objset,
 		    origin_head->ds_bookmarks_obj, dbn->dbn_name, tx));
 
 		dsl_bookmark_node_add(hds, dbn, tx);
 	}
 
 	dsl_bookmark_next_changed(hds, origin_ds, tx);
 
 	/* move snapshots to this dir */
 	for (snap = list_head(&ddpa->shared_snaps); snap;
 	    snap = list_next(&ddpa->shared_snaps, snap)) {
 		dsl_dataset_t *ds = snap->ds;
 
 		/*
 		 * Property callbacks are registered to a particular
 		 * dsl_dir.  Since ours is changing, evict the objset
 		 * so that they will be unregistered from the old dsl_dir.
 		 */
 		if (ds->ds_objset) {
 			dmu_objset_evict(ds->ds_objset);
 			ds->ds_objset = NULL;
 		}
 
 		/* move snap name entry */
 		VERIFY0(dsl_dataset_get_snapname(ds));
 		VERIFY0(dsl_dataset_snap_remove(origin_head,
 		    ds->ds_snapname, tx, B_TRUE));
 		VERIFY0(zap_add(dp->dp_meta_objset,
 		    dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
 		    8, 1, &ds->ds_object, tx));
 		dsl_fs_ss_count_adjust(hds->ds_dir, 1,
 		    DD_FIELD_SNAPSHOT_COUNT, tx);
 
 		/* change containing dsl_dir */
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
 		dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
 		ASSERT3P(ds->ds_dir, ==, odd);
 		dsl_dir_rele(ds->ds_dir, ds);
 		VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
 		    NULL, ds, &ds->ds_dir));
 
 		/* move any clone references */
 		if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
 		    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
 			zap_cursor_t zc;
 			zap_attribute_t za;
 
 			for (zap_cursor_init(&zc, dp->dp_meta_objset,
 			    dsl_dataset_phys(ds)->ds_next_clones_obj);
 			    zap_cursor_retrieve(&zc, &za) == 0;
 			    zap_cursor_advance(&zc)) {
 				dsl_dataset_t *cnds;
 				uint64_t o;
 
 				if (za.za_first_integer == oldnext_obj) {
 					/*
 					 * We've already moved the
 					 * origin's reference.
 					 */
 					continue;
 				}
 
 				VERIFY0(dsl_dataset_hold_obj(dp,
 				    za.za_first_integer, FTAG, &cnds));
 				o = dsl_dir_phys(cnds->ds_dir)->
 				    dd_head_dataset_obj;
 
 				VERIFY0(zap_remove_int(dp->dp_meta_objset,
 				    dsl_dir_phys(odd)->dd_clones, o, tx));
 				VERIFY0(zap_add_int(dp->dp_meta_objset,
 				    dsl_dir_phys(dd)->dd_clones, o, tx));
 				dsl_dataset_rele(cnds, FTAG);
 			}
 			zap_cursor_fini(&zc);
 		}
 
 		ASSERT(!dsl_prop_hascb(ds));
 	}
 
 	/*
 	 * Change space accounting.
 	 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
 	 * both be valid, or both be 0 (resulting in delta == 0).  This
 	 * is true for each of {clone,origin} independently.
 	 */
 
 	delta = ddpa->cloneusedsnap -
 	    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
 	ASSERT3S(delta, >=, 0);
 	ASSERT3U(ddpa->used, >=, delta);
 	dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
 	dsl_dir_diduse_space(dd, DD_USED_HEAD,
 	    ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
 
 	delta = ddpa->originusedsnap -
 	    dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
 	ASSERT3S(delta, <=, 0);
 	ASSERT3U(ddpa->used, >=, -delta);
 	dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
 	dsl_dir_diduse_space(odd, DD_USED_HEAD,
 	    -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
 
 	dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
 
 	/*
 	 * Since livelists are specific to a clone's origin txg, they
 	 * are no longer accurate. Destroy the livelist from the clone being
 	 * promoted. If the origin dataset is a clone, destroy its livelist
 	 * as well.
 	 */
 	dsl_dir_remove_livelist(dd, tx, B_TRUE);
 	dsl_dir_remove_livelist(odd, tx, B_TRUE);
 
 	/* log history record */
 	spa_history_log_internal_ds(hds, "promote", tx, " ");
 
 	dsl_dir_rele(odd, FTAG);
 	promote_rele(ddpa, FTAG);
 
 	/*
 	 * Transfer common error blocks from old head to new head.
 	 */
 	if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG)) {
 		uint64_t old_head = origin_head->ds_object;
 		uint64_t new_head = hds->ds_object;
 		spa_swap_errlog(dp->dp_spa, new_head, old_head, tx);
 	}
 }
 
 /*
  * Make a list of dsl_dataset_t's for the snapshots between first_obj
  * (exclusive) and last_obj (inclusive).  The list will be in reverse
  * order (last_obj will be the list_head()).  If first_obj == 0, do all
  * snapshots back to this dataset's origin.
  */
 static int
 snaplist_make(dsl_pool_t *dp,
     uint64_t first_obj, uint64_t last_obj, list_t *l, const void *tag)
 {
 	uint64_t obj = last_obj;
 
 	list_create(l, sizeof (struct promotenode),
 	    offsetof(struct promotenode, link));
 
 	while (obj != first_obj) {
 		dsl_dataset_t *ds;
 		struct promotenode *snap;
 		int err;
 
 		err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
 		ASSERT(err != ENOENT);
 		if (err != 0)
 			return (err);
 
 		if (first_obj == 0)
 			first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;
 
 		snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
 		snap->ds = ds;
 		list_insert_tail(l, snap);
 		obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	}
 
 	return (0);
 }
 
 static int
 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
 {
 	struct promotenode *snap;
 
 	*spacep = 0;
 	for (snap = list_head(l); snap; snap = list_next(l, snap)) {
 		uint64_t used, comp, uncomp;
 		dsl_deadlist_space_range(&snap->ds->ds_deadlist,
 		    mintxg, UINT64_MAX, &used, &comp, &uncomp);
 		*spacep += used;
 	}
 	return (0);
 }
 
 static void
 snaplist_destroy(list_t *l, const void *tag)
 {
 	struct promotenode *snap;
 
 	if (l == NULL || !list_link_active(&l->list_head))
 		return;
 
 	while ((snap = list_remove_tail(l)) != NULL) {
 		dsl_dataset_rele(snap->ds, tag);
 		kmem_free(snap, sizeof (*snap));
 	}
 	list_destroy(l);
 }
 
 static int
 promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, const void *tag)
 {
 	int error;
 	dsl_dir_t *dd;
 	struct promotenode *snap;
 
 	error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
 	    &ddpa->ddpa_clone);
 	if (error != 0)
 		return (error);
 	dd = ddpa->ddpa_clone->ds_dir;
 
 	if (ddpa->ddpa_clone->ds_is_snapshot ||
 	    !dsl_dir_is_clone(dd)) {
 		dsl_dataset_rele(ddpa->ddpa_clone, tag);
 		return (SET_ERROR(EINVAL));
 	}
 
 	error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,
 	    &ddpa->shared_snaps, tag);
 	if (error != 0)
 		goto out;
 
 	error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
 	    &ddpa->clone_snaps, tag);
 	if (error != 0)
 		goto out;
 
 	snap = list_head(&ddpa->shared_snaps);
 	ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);
 	error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,
 	    dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,
 	    &ddpa->origin_snaps, tag);
 	if (error != 0)
 		goto out;
 
 	if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {
 		error = dsl_dataset_hold_obj(dp,
 		    dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,
 		    tag, &ddpa->origin_origin);
 		if (error != 0)
 			goto out;
 	}
 out:
 	if (error != 0)
 		promote_rele(ddpa, tag);
 	return (error);
 }
 
 static void
 promote_rele(dsl_dataset_promote_arg_t *ddpa, const void *tag)
 {
 	snaplist_destroy(&ddpa->shared_snaps, tag);
 	snaplist_destroy(&ddpa->clone_snaps, tag);
 	snaplist_destroy(&ddpa->origin_snaps, tag);
 	if (ddpa->origin_origin != NULL)
 		dsl_dataset_rele(ddpa->origin_origin, tag);
 	dsl_dataset_rele(ddpa->ddpa_clone, tag);
 }
 
 /*
  * Promote a clone.
  *
  * If it fails due to a conflicting snapshot name, "conflsnap" will be filled
  * in with the name.  (It must be at least ZFS_MAX_DATASET_NAME_LEN bytes long.)
  */
 int
 dsl_dataset_promote(const char *name, char *conflsnap)
 {
 	dsl_dataset_promote_arg_t ddpa = { 0 };
 	uint64_t numsnaps;
 	int error;
 	nvpair_t *snap_pair;
 	objset_t *os;
 
 	/*
 	 * We will modify space proportional to the number of
 	 * snapshots.  Compute numsnaps.
 	 */
 	error = dmu_objset_hold(name, FTAG, &os);
 	if (error != 0)
 		return (error);
 	error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
 	    dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,
 	    &numsnaps);
 	dmu_objset_rele(os, FTAG);
 	if (error != 0)
 		return (error);
 
 	ddpa.ddpa_clonename = name;
 	ddpa.err_ds = fnvlist_alloc();
 	ddpa.cr = CRED();
 	ddpa.proc = curproc;
 
 	error = dsl_sync_task(name, dsl_dataset_promote_check,
 	    dsl_dataset_promote_sync, &ddpa,
 	    2 + numsnaps, ZFS_SPACE_CHECK_RESERVED);
 
 	/*
 	 * Return the first conflicting snapshot found.
 	 */
 	snap_pair = nvlist_next_nvpair(ddpa.err_ds, NULL);
 	if (snap_pair != NULL && conflsnap != NULL)
 		(void) strlcpy(conflsnap, nvpair_name(snap_pair),
 		    ZFS_MAX_DATASET_NAME_LEN);
 
 	fnvlist_free(ddpa.err_ds);
 	return (error);
 }
 
 int
 dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
     dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
 {
 	/*
 	 * "slack" factor for received datasets with refquota set on them.
 	 * See the bottom of this function for details on its use.
 	 */
 	uint64_t refquota_slack = (uint64_t)DMU_MAX_ACCESS *
 	    spa_asize_inflation;
 	int64_t unused_refres_delta;
 
 	/* they should both be heads */
 	if (clone->ds_is_snapshot ||
 	    origin_head->ds_is_snapshot)
 		return (SET_ERROR(EINVAL));
 
 	/* if we are not forcing, the branch point should be just before them */
 	if (!force && clone->ds_prev != origin_head->ds_prev)
 		return (SET_ERROR(EINVAL));
 
 	/* clone should be the clone (unless they are unrelated) */
 	if (clone->ds_prev != NULL &&
 	    clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
 	    origin_head->ds_dir != clone->ds_prev->ds_dir)
 		return (SET_ERROR(EINVAL));
 
 	/* the clone should be a child of the origin */
 	if (clone->ds_dir->dd_parent != origin_head->ds_dir)
 		return (SET_ERROR(EINVAL));
 
 	/* origin_head shouldn't be modified unless 'force' */
 	if (!force &&
 	    dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
 		return (SET_ERROR(ETXTBSY));
 
 	/* origin_head should have no long holds (e.g. is not mounted) */
 	if (dsl_dataset_handoff_check(origin_head, owner, tx))
 		return (SET_ERROR(EBUSY));
 
 	/* check amount of any unconsumed refreservation */
 	unused_refres_delta =
 	    (int64_t)MIN(origin_head->ds_reserved,
 	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
 	    (int64_t)MIN(origin_head->ds_reserved,
 	    dsl_dataset_phys(clone)->ds_unique_bytes);
 
 	if (unused_refres_delta > 0 &&
 	    unused_refres_delta >
 	    dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
 		return (SET_ERROR(ENOSPC));
 
 	/*
 	 * The clone can't be too much over the head's refquota.
 	 *
 	 * To ensure that the entire refquota can be used, we allow one
 	 * transaction to exceed the refquota.  Therefore, this check
 	 * needs to also allow for the space referenced to be more than the
 	 * refquota.  The maximum amount of space that one transaction can use
 	 * on disk is DMU_MAX_ACCESS * spa_asize_inflation.  Allowing this
 	 * overage ensures that we are able to receive a filesystem that
 	 * exceeds the refquota on the source system.
 	 *
 	 * So that overage is the refquota_slack we use below.
 	 */
 	if (origin_head->ds_quota != 0 &&
 	    dsl_dataset_phys(clone)->ds_referenced_bytes >
 	    origin_head->ds_quota + refquota_slack)
 		return (SET_ERROR(EDQUOT));
 
 	return (0);
 }
 
 static void
 dsl_dataset_swap_remap_deadlists(dsl_dataset_t *clone,
     dsl_dataset_t *origin, dmu_tx_t *tx)
 {
 	uint64_t clone_remap_dl_obj, origin_remap_dl_obj;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 
 	ASSERT(dsl_pool_sync_context(dp));
 
 	clone_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(clone);
 	origin_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(origin);
 
 	if (clone_remap_dl_obj != 0) {
 		dsl_deadlist_close(&clone->ds_remap_deadlist);
 		dsl_dataset_unset_remap_deadlist_object(clone, tx);
 	}
 	if (origin_remap_dl_obj != 0) {
 		dsl_deadlist_close(&origin->ds_remap_deadlist);
 		dsl_dataset_unset_remap_deadlist_object(origin, tx);
 	}
 
 	if (clone_remap_dl_obj != 0) {
 		dsl_dataset_set_remap_deadlist_object(origin,
 		    clone_remap_dl_obj, tx);
 		dsl_deadlist_open(&origin->ds_remap_deadlist,
 		    dp->dp_meta_objset, clone_remap_dl_obj);
 	}
 	if (origin_remap_dl_obj != 0) {
 		dsl_dataset_set_remap_deadlist_object(clone,
 		    origin_remap_dl_obj, tx);
 		dsl_deadlist_open(&clone->ds_remap_deadlist,
 		    dp->dp_meta_objset, origin_remap_dl_obj);
 	}
 }
 
 void
 dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
     dsl_dataset_t *origin_head, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	int64_t unused_refres_delta;
 
 	ASSERT(clone->ds_reserved == 0);
 	/*
 	 * NOTE: On DEBUG kernels there could be a race between this and
 	 * the check function if spa_asize_inflation is adjusted...
 	 */
 	ASSERT(origin_head->ds_quota == 0 ||
 	    dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota +
 	    DMU_MAX_ACCESS * spa_asize_inflation);
 	ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
 
 	dsl_dir_cancel_waiters(origin_head->ds_dir);
 
 	/*
 	 * Swap per-dataset feature flags.
 	 */
 	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
 		if (!(spa_feature_table[f].fi_flags &
 		    ZFEATURE_FLAG_PER_DATASET)) {
 			ASSERT(!dsl_dataset_feature_is_active(clone, f));
 			ASSERT(!dsl_dataset_feature_is_active(origin_head, f));
 			continue;
 		}
 
 		boolean_t clone_inuse = dsl_dataset_feature_is_active(clone, f);
 		void *clone_feature = clone->ds_feature[f];
 		boolean_t origin_head_inuse =
 		    dsl_dataset_feature_is_active(origin_head, f);
 		void *origin_head_feature = origin_head->ds_feature[f];
 
 		if (clone_inuse)
 			dsl_dataset_deactivate_feature_impl(clone, f, tx);
 		if (origin_head_inuse)
 			dsl_dataset_deactivate_feature_impl(origin_head, f, tx);
 
 		if (clone_inuse) {
 			dsl_dataset_activate_feature(origin_head->ds_object, f,
 			    clone_feature, tx);
 			origin_head->ds_feature[f] = clone_feature;
 		}
 		if (origin_head_inuse) {
 			dsl_dataset_activate_feature(clone->ds_object, f,
 			    origin_head_feature, tx);
 			clone->ds_feature[f] = origin_head_feature;
 		}
 	}
 
 	dmu_buf_will_dirty(clone->ds_dbuf, tx);
 	dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
 
 	if (clone->ds_objset != NULL) {
 		dmu_objset_evict(clone->ds_objset);
 		clone->ds_objset = NULL;
 	}
 
 	if (origin_head->ds_objset != NULL) {
 		dmu_objset_evict(origin_head->ds_objset);
 		origin_head->ds_objset = NULL;
 	}
 
 	unused_refres_delta =
 	    (int64_t)MIN(origin_head->ds_reserved,
 	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
 	    (int64_t)MIN(origin_head->ds_reserved,
 	    dsl_dataset_phys(clone)->ds_unique_bytes);
 
 	/*
 	 * Reset origin's unique bytes.
 	 */
 	{
 		dsl_dataset_t *origin = clone->ds_prev;
 		uint64_t comp, uncomp;
 
 		dmu_buf_will_dirty(origin->ds_dbuf, tx);
 		dsl_deadlist_space_range(&clone->ds_deadlist,
 		    dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
 		    &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
 	}
 
 	/* swap blkptrs */
 	{
 		rrw_enter(&clone->ds_bp_rwlock, RW_WRITER, FTAG);
 		rrw_enter(&origin_head->ds_bp_rwlock, RW_WRITER, FTAG);
 		blkptr_t tmp;
 		tmp = dsl_dataset_phys(origin_head)->ds_bp;
 		dsl_dataset_phys(origin_head)->ds_bp =
 		    dsl_dataset_phys(clone)->ds_bp;
 		dsl_dataset_phys(clone)->ds_bp = tmp;
 		rrw_exit(&origin_head->ds_bp_rwlock, FTAG);
 		rrw_exit(&clone->ds_bp_rwlock, FTAG);
 	}
 
 	/* set dd_*_bytes */
 	{
 		int64_t dused, dcomp, duncomp;
 		uint64_t cdl_used, cdl_comp, cdl_uncomp;
 		uint64_t odl_used, odl_comp, odl_uncomp;
 
 		ASSERT3U(dsl_dir_phys(clone->ds_dir)->
 		    dd_used_breakdown[DD_USED_SNAP], ==, 0);
 
 		dsl_deadlist_space(&clone->ds_deadlist,
 		    &cdl_used, &cdl_comp, &cdl_uncomp);
 		dsl_deadlist_space(&origin_head->ds_deadlist,
 		    &odl_used, &odl_comp, &odl_uncomp);
 
 		dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
 		    cdl_used -
 		    (dsl_dataset_phys(origin_head)->ds_referenced_bytes +
 		    odl_used);
 		dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
 		    cdl_comp -
 		    (dsl_dataset_phys(origin_head)->ds_compressed_bytes +
 		    odl_comp);
 		duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
 		    cdl_uncomp -
 		    (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
 		    odl_uncomp);
 
 		dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
 		    dused, dcomp, duncomp, tx);
 		dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
 		    -dused, -dcomp, -duncomp, tx);
 
 		/*
 		 * The difference in the space used by snapshots is the
 		 * difference in snapshot space due to the head's
 		 * deadlist (since that's the only thing that's
 		 * changing that affects the snapused).
 		 */
 		dsl_deadlist_space_range(&clone->ds_deadlist,
 		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
 		    &cdl_used, &cdl_comp, &cdl_uncomp);
 		dsl_deadlist_space_range(&origin_head->ds_deadlist,
 		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
 		    &odl_used, &odl_comp, &odl_uncomp);
 		dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
 		    DD_USED_HEAD, DD_USED_SNAP, tx);
 	}
 
 	/* swap ds_*_bytes */
 	SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
 	    dsl_dataset_phys(clone)->ds_referenced_bytes);
 	SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
 	    dsl_dataset_phys(clone)->ds_compressed_bytes);
 	SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
 	    dsl_dataset_phys(clone)->ds_uncompressed_bytes);
 	SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
 	    dsl_dataset_phys(clone)->ds_unique_bytes);
 
 	/* apply any parent delta for change in unconsumed refreservation */
 	dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
 	    unused_refres_delta, 0, 0, tx);
 
 	/*
 	 * Swap deadlists.
 	 */
 	dsl_deadlist_close(&clone->ds_deadlist);
 	dsl_deadlist_close(&origin_head->ds_deadlist);
 	SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
 	    dsl_dataset_phys(clone)->ds_deadlist_obj);
 	dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
 	    dsl_dataset_phys(clone)->ds_deadlist_obj);
 	dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
 	    dsl_dataset_phys(origin_head)->ds_deadlist_obj);
 	dsl_dataset_swap_remap_deadlists(clone, origin_head, tx);
 
 	/*
 	 * If there is a bookmark at the origin, its "next dataset" is
 	 * changing, so we need to reset its FBN.
 	 */
 	dsl_bookmark_next_changed(origin_head, origin_head->ds_prev, tx);
 
 	dsl_scan_ds_clone_swapped(origin_head, clone, tx);
 
 	/*
 	 * Destroy any livelists associated with the clone or the origin,
 	 * since after the swap the corresponding livelists are no longer
 	 * valid.
 	 */
 	dsl_dir_remove_livelist(clone->ds_dir, tx, B_TRUE);
 	dsl_dir_remove_livelist(origin_head->ds_dir, tx, B_TRUE);
 
 	spa_history_log_internal_ds(clone, "clone swap", tx,
 	    "parent=%s", origin_head->ds_dir->dd_myname);
 }
 
 /*
  * Given a pool name and a dataset object number in that pool,
  * return the name of that dataset.
  */
 int
 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	int error;
 
 	error = dsl_pool_hold(pname, FTAG, &dp);
 	if (error != 0)
 		return (error);
 
 	error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
 	if (error == 0) {
 		dsl_dataset_name(ds, buf);
 		dsl_dataset_rele(ds, FTAG);
 	}
 	dsl_pool_rele(dp, FTAG);
 
 	return (error);
 }
 
 int
 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
     uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
 {
 	int error = 0;
 
 	ASSERT3S(asize, >, 0);
 
 	/*
 	 * *ref_rsrv is the portion of asize that will come from any
 	 * unconsumed refreservation space.
 	 */
 	*ref_rsrv = 0;
 
 	mutex_enter(&ds->ds_lock);
 	/*
 	 * Make a space adjustment for reserved bytes.
 	 */
 	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
 		ASSERT3U(*used, >=,
 		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
 		*used -=
 		    (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
 		*ref_rsrv =
 		    asize - MIN(asize, parent_delta(ds, asize + inflight));
 	}
 
 	if (!check_quota || ds->ds_quota == 0) {
 		mutex_exit(&ds->ds_lock);
 		return (0);
 	}
 	/*
 	 * If they are requesting more space, and our current estimate
 	 * is over quota, they get to try again unless the actual
 	 * on-disk is over quota and there are no pending changes (which
 	 * may free up space for us).
 	 */
 	if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >=
 	    ds->ds_quota) {
 		if (inflight > 0 ||
 		    dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota)
 			error = SET_ERROR(ERESTART);
 		else
 			error = SET_ERROR(EDQUOT);
 	}
 	mutex_exit(&ds->ds_lock);
 
 	return (error);
 }
 
 typedef struct dsl_dataset_set_qr_arg {
 	const char *ddsqra_name;
 	zprop_source_t ddsqra_source;
 	uint64_t ddsqra_value;
 } dsl_dataset_set_qr_arg_t;
 
 
 static int
 dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_set_qr_arg_t *ddsqra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	int error;
 	uint64_t newval;
 
 	if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)
 		return (SET_ERROR(ENOTSUP));
 
 	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	if (ds->ds_is_snapshot) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	error = dsl_prop_predict(ds->ds_dir,
 	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
 	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
 	if (error != 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (error);
 	}
 
 	if (newval == 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (0);
 	}
 
 	if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes ||
 	    newval < ds->ds_reserved) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(ENOSPC));
 	}
 
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 static void
 dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_set_qr_arg_t *ddsqra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds = NULL;
 	uint64_t newval;
 
 	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
 
 	dsl_prop_set_sync_impl(ds,
 	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
 	    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
 	    &ddsqra->ddsqra_value, tx);
 
 	VERIFY0(dsl_prop_get_int_ds(ds,
 	    zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));
 
 	if (ds->ds_quota != newval) {
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		ds->ds_quota = newval;
 	}
 	dsl_dataset_rele(ds, FTAG);
 }
 
 int
 dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
     uint64_t refquota)
 {
 	dsl_dataset_set_qr_arg_t ddsqra;
 
 	ddsqra.ddsqra_name = dsname;
 	ddsqra.ddsqra_source = source;
 	ddsqra.ddsqra_value = refquota;
 
 	return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
 	    dsl_dataset_set_refquota_sync, &ddsqra, 0,
 	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
 static int
 dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_set_qr_arg_t *ddsqra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	int error;
 	uint64_t newval, unique;
 
 	if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
 		return (SET_ERROR(ENOTSUP));
 
 	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	if (ds->ds_is_snapshot) {
 		dsl_dataset_rele(ds, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	error = dsl_prop_predict(ds->ds_dir,
 	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
 	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
 	if (error != 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (error);
 	}
 
 	/*
 	 * If we are doing the preliminary check in open context, the
 	 * space estimates may be inaccurate.
 	 */
 	if (!dmu_tx_is_syncing(tx)) {
 		dsl_dataset_rele(ds, FTAG);
 		return (0);
 	}
 
 	mutex_enter(&ds->ds_lock);
 	if (!DS_UNIQUE_IS_ACCURATE(ds))
 		dsl_dataset_recalc_head_uniq(ds);
 	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
 	mutex_exit(&ds->ds_lock);
 
 	if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
 		uint64_t delta = MAX(unique, newval) -
 		    MAX(unique, ds->ds_reserved);
 
 		if (delta >
 		    dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
 		    (ds->ds_quota > 0 && newval > ds->ds_quota)) {
 			dsl_dataset_rele(ds, FTAG);
 			return (SET_ERROR(ENOSPC));
 		}
 	}
 
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 void
 dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
     zprop_source_t source, uint64_t value, dmu_tx_t *tx)
 {
 	uint64_t newval;
 	uint64_t unique;
 	int64_t delta;
 
 	dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
 	    source, sizeof (value), 1, &value, tx);
 
 	VERIFY0(dsl_prop_get_int_ds(ds,
 	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));
 
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	mutex_enter(&ds->ds_dir->dd_lock);
 	mutex_enter(&ds->ds_lock);
 	ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
 	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
 	delta = MAX(0, (int64_t)(newval - unique)) -
 	    MAX(0, (int64_t)(ds->ds_reserved - unique));
 	ds->ds_reserved = newval;
 	mutex_exit(&ds->ds_lock);
 
 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
 	mutex_exit(&ds->ds_dir->dd_lock);
 }
 
 static void
 dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_set_qr_arg_t *ddsqra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds = NULL;
 
 	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
 	dsl_dataset_set_refreservation_sync_impl(ds,
 	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
 	dsl_dataset_rele(ds, FTAG);
 }
 
 int
 dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
     uint64_t refreservation)
 {
 	dsl_dataset_set_qr_arg_t ddsqra;
 
 	ddsqra.ddsqra_name = dsname;
 	ddsqra.ddsqra_source = source;
 	ddsqra.ddsqra_value = refreservation;
 
 	return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
 	    dsl_dataset_set_refreservation_sync, &ddsqra, 0,
 	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
 typedef struct dsl_dataset_set_compression_arg {
 	const char *ddsca_name;
 	zprop_source_t ddsca_source;
 	uint64_t ddsca_value;
 } dsl_dataset_set_compression_arg_t;
 
 static int
 dsl_dataset_set_compression_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_set_compression_arg_t *ddsca = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 
 	uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value);
 	spa_feature_t f = zio_compress_to_feature(compval);
 
 	if (f == SPA_FEATURE_NONE)
 		return (SET_ERROR(EINVAL));
 
 	if (!spa_feature_is_enabled(dp->dp_spa, f))
 		return (SET_ERROR(ENOTSUP));
 
 	return (0);
 }
 
 static void
 dsl_dataset_set_compression_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dataset_set_compression_arg_t *ddsca = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds = NULL;
 
 	uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value);
 	spa_feature_t f = zio_compress_to_feature(compval);
 	ASSERT3S(f, !=, SPA_FEATURE_NONE);
 	ASSERT3S(spa_feature_table[f].fi_type, ==, ZFEATURE_TYPE_BOOLEAN);
 
 	VERIFY0(dsl_dataset_hold(dp, ddsca->ddsca_name, FTAG, &ds));
 	if (zfeature_active(f, ds->ds_feature[f]) != B_TRUE) {
 		ds->ds_feature_activation[f] = (void *)B_TRUE;
 		dsl_dataset_activate_feature(ds->ds_object, f,
 		    ds->ds_feature_activation[f], tx);
 		ds->ds_feature[f] = ds->ds_feature_activation[f];
 	}
 	dsl_dataset_rele(ds, FTAG);
 }
 
 int
 dsl_dataset_set_compression(const char *dsname, zprop_source_t source,
     uint64_t compression)
 {
 	dsl_dataset_set_compression_arg_t ddsca;
 
 	/*
 	 * The sync task is only required for zstd in order to activate
 	 * the feature flag when the property is first set.
 	 */
 	if (ZIO_COMPRESS_ALGO(compression) != ZIO_COMPRESS_ZSTD)
 		return (0);
 
 	ddsca.ddsca_name = dsname;
 	ddsca.ddsca_source = source;
 	ddsca.ddsca_value = compression;
 
 	return (dsl_sync_task(dsname, dsl_dataset_set_compression_check,
 	    dsl_dataset_set_compression_sync, &ddsca, 0,
 	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
 /*
  * Return (in *usedp) the amount of space referenced by "new" that was not
  * referenced at the time the bookmark corresponds to.  "New" may be a
  * snapshot or a head.  The bookmark must be before new, in
  * new's filesystem (or its origin) -- caller verifies this.
  *
  * The written space is calculated by considering two components:  First, we
  * ignore any freed space, and calculate the written as new's used space
  * minus old's used space.  Next, we add in the amount of space that was freed
  * between the two time points, thus reducing new's used space relative to
  * old's. Specifically, this is the space that was born before
  * zbm_creation_txg, and freed before new (ie. on new's deadlist or a
  * previous deadlist).
  *
  * space freed                         [---------------------]
  * snapshots                       ---O-------O--------O-------O------
  *                                         bookmark           new
  *
  * Note, the bookmark's zbm_*_bytes_refd must be valid, but if the HAS_FBN
  * flag is not set, we will calculate the freed_before_next based on the
  * next snapshot's deadlist, rather than using zbm_*_freed_before_next_snap.
  */
 static int
 dsl_dataset_space_written_impl(zfs_bookmark_phys_t *bmp,
     dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
 	int err = 0;
 	dsl_pool_t *dp = new->ds_dir->dd_pool;
 
 	ASSERT(dsl_pool_config_held(dp));
 	if (dsl_dataset_is_snapshot(new)) {
 		ASSERT3U(bmp->zbm_creation_txg, <,
 		    dsl_dataset_phys(new)->ds_creation_txg);
 	}
 
 	*usedp = 0;
 	*usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
 	*usedp -= bmp->zbm_referenced_bytes_refd;
 
 	*compp = 0;
 	*compp += dsl_dataset_phys(new)->ds_compressed_bytes;
 	*compp -= bmp->zbm_compressed_bytes_refd;
 
 	*uncompp = 0;
 	*uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
 	*uncompp -= bmp->zbm_uncompressed_bytes_refd;
 
 	dsl_dataset_t *snap = new;
 
 	while (dsl_dataset_phys(snap)->ds_prev_snap_txg >
 	    bmp->zbm_creation_txg) {
 		uint64_t used, comp, uncomp;
 
 		dsl_deadlist_space_range(&snap->ds_deadlist,
 		    0, bmp->zbm_creation_txg,
 		    &used, &comp, &uncomp);
 		*usedp += used;
 		*compp += comp;
 		*uncompp += uncomp;
 
 		uint64_t snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
 		if (snap != new)
 			dsl_dataset_rele(snap, FTAG);
 		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
 		if (err != 0)
 			break;
 	}
 
 	/*
 	 * We might not have the FBN if we are calculating written from
 	 * a snapshot (because we didn't know the correct "next" snapshot
 	 * until now).
 	 */
 	if (bmp->zbm_flags & ZBM_FLAG_HAS_FBN) {
 		*usedp += bmp->zbm_referenced_freed_before_next_snap;
 		*compp += bmp->zbm_compressed_freed_before_next_snap;
 		*uncompp += bmp->zbm_uncompressed_freed_before_next_snap;
 	} else {
 		ASSERT3U(dsl_dataset_phys(snap)->ds_prev_snap_txg, ==,
 		    bmp->zbm_creation_txg);
 		uint64_t used, comp, uncomp;
 		dsl_deadlist_space(&snap->ds_deadlist, &used, &comp, &uncomp);
 		*usedp += used;
 		*compp += comp;
 		*uncompp += uncomp;
 	}
 	if (snap != new)
 		dsl_dataset_rele(snap, FTAG);
 	return (err);
 }
 
 /*
  * Return (in *usedp) the amount of space written in new that was not
  * present at the time the bookmark corresponds to.  New may be a
  * snapshot or the head.  Old must be a bookmark before new, in
  * new's filesystem (or its origin) -- caller verifies this.
  */
 int
 dsl_dataset_space_written_bookmark(zfs_bookmark_phys_t *bmp,
     dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
 	if (!(bmp->zbm_flags & ZBM_FLAG_HAS_FBN))
 		return (SET_ERROR(ENOTSUP));
 	return (dsl_dataset_space_written_impl(bmp, new,
 	    usedp, compp, uncompp));
 }
 
 /*
  * Return (in *usedp) the amount of space written in new that is not
  * present in oldsnap.  New may be a snapshot or the head.  Old must be
  * a snapshot before new, in new's filesystem (or its origin).  If not then
  * fail and return EINVAL.
  */
 int
 dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
 	if (!dsl_dataset_is_before(new, oldsnap, 0))
 		return (SET_ERROR(EINVAL));
 
 	zfs_bookmark_phys_t zbm = { 0 };
 	dsl_dataset_phys_t *dsp = dsl_dataset_phys(oldsnap);
 	zbm.zbm_guid = dsp->ds_guid;
 	zbm.zbm_creation_txg = dsp->ds_creation_txg;
 	zbm.zbm_creation_time = dsp->ds_creation_time;
 	zbm.zbm_referenced_bytes_refd = dsp->ds_referenced_bytes;
 	zbm.zbm_compressed_bytes_refd = dsp->ds_compressed_bytes;
 	zbm.zbm_uncompressed_bytes_refd = dsp->ds_uncompressed_bytes;
 
 	/*
 	 * If oldsnap is the origin (or origin's origin, ...) of new,
 	 * we can't easily calculate the effective FBN.  Therefore,
 	 * we do not set ZBM_FLAG_HAS_FBN, so that the _impl will calculate
 	 * it relative to the correct "next": the next snapshot towards "new",
 	 * rather than the next snapshot in oldsnap's dsl_dir.
 	 */
 	return (dsl_dataset_space_written_impl(&zbm, new,
 	    usedp, compp, uncompp));
 }
 
 /*
  * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
  * lastsnap, and all snapshots in between are deleted.
  *
  * blocks that would be freed            [---------------------------]
  * snapshots                       ---O-------O--------O-------O--------O
  *                                        firstsnap        lastsnap
  *
  * This is the set of blocks that were born after the snap before firstsnap,
  * (birth > firstsnap->prev_snap_txg) and died before the snap after the
  * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
  * We calculate this by iterating over the relevant deadlists (from the snap
  * after lastsnap, backward to the snap after firstsnap), summing up the
  * space on the deadlist that was born after the snap before firstsnap.
  */
 int
 dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
     dsl_dataset_t *lastsnap,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
 	int err = 0;
 	uint64_t snapobj;
 	dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
 
 	ASSERT(firstsnap->ds_is_snapshot);
 	ASSERT(lastsnap->ds_is_snapshot);
 
 	/*
 	 * Check that the snapshots are in the same dsl_dir, and firstsnap
 	 * is before lastsnap.
 	 */
 	if (firstsnap->ds_dir != lastsnap->ds_dir ||
 	    dsl_dataset_phys(firstsnap)->ds_creation_txg >
 	    dsl_dataset_phys(lastsnap)->ds_creation_txg)
 		return (SET_ERROR(EINVAL));
 
 	*usedp = *compp = *uncompp = 0;
 
 	snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
 	while (snapobj != firstsnap->ds_object) {
 		dsl_dataset_t *ds;
 		uint64_t used, comp, uncomp;
 
 		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
 		if (err != 0)
 			break;
 
 		dsl_deadlist_space_range(&ds->ds_deadlist,
 		    dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
 		    &used, &comp, &uncomp);
 		*usedp += used;
 		*compp += comp;
 		*uncompp += uncomp;
 
 		snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 		ASSERT3U(snapobj, !=, 0);
 		dsl_dataset_rele(ds, FTAG);
 	}
 	return (err);
 }
 
 /*
  * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
  * For example, they could both be snapshots of the same filesystem, and
  * 'earlier' is before 'later'.  Or 'earlier' could be the origin of
  * 'later's filesystem.  Or 'earlier' could be an older snapshot in the origin's
  * filesystem.  Or 'earlier' could be the origin's origin.
  *
  * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg.
  */
 boolean_t
 dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
     uint64_t earlier_txg)
 {
 	dsl_pool_t *dp = later->ds_dir->dd_pool;
 	int error;
 	boolean_t ret;
 
 	ASSERT(dsl_pool_config_held(dp));
 	ASSERT(earlier->ds_is_snapshot || earlier_txg != 0);
 
 	if (earlier_txg == 0)
 		earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;
 
 	if (later->ds_is_snapshot &&
 	    earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)
 		return (B_FALSE);
 
 	if (later->ds_dir == earlier->ds_dir)
 		return (B_TRUE);
 
 	/*
 	 * We check dd_origin_obj explicitly here rather than using
 	 * dsl_dir_is_clone() so that we will return TRUE if "earlier"
 	 * is $ORIGIN@$ORIGIN.  dsl_dataset_space_written() depends on
 	 * this behavior.
 	 */
 	if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == 0)
 		return (B_FALSE);
 
 	dsl_dataset_t *origin;
 	error = dsl_dataset_hold_obj(dp,
 	    dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);
 	if (error != 0)
 		return (B_FALSE);
 	if (dsl_dataset_phys(origin)->ds_creation_txg == earlier_txg &&
 	    origin->ds_dir == earlier->ds_dir) {
 		dsl_dataset_rele(origin, FTAG);
 		return (B_TRUE);
 	}
 	ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
 	dsl_dataset_rele(origin, FTAG);
 	return (ret);
 }
 
 void
 dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
 }
 
 boolean_t
 dsl_dataset_is_zapified(dsl_dataset_t *ds)
 {
 	dmu_object_info_t doi;
 
 	dmu_object_info_from_db(ds->ds_dbuf, &doi);
 	return (doi.doi_type == DMU_OTN_ZAP_METADATA);
 }
 
 boolean_t
 dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds)
 {
 	return (dsl_dataset_is_zapified(ds) &&
 	    zap_contains(ds->ds_dir->dd_pool->dp_meta_objset,
 	    ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0);
 }
 
 uint64_t
 dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds)
 {
 	uint64_t remap_deadlist_obj;
 	int err;
 
 	if (!dsl_dataset_is_zapified(ds))
 		return (0);
 
 	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object,
 	    DS_FIELD_REMAP_DEADLIST, sizeof (remap_deadlist_obj), 1,
 	    &remap_deadlist_obj);
 
 	if (err != 0) {
 		VERIFY3S(err, ==, ENOENT);
 		return (0);
 	}
 
 	ASSERT(remap_deadlist_obj != 0);
 	return (remap_deadlist_obj);
 }
 
 boolean_t
 dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds)
 {
 	EQUIV(dsl_deadlist_is_open(&ds->ds_remap_deadlist),
 	    dsl_dataset_get_remap_deadlist_object(ds) != 0);
 	return (dsl_deadlist_is_open(&ds->ds_remap_deadlist));
 }
 
 static void
 dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, uint64_t obj,
     dmu_tx_t *tx)
 {
 	ASSERT(obj != 0);
 	dsl_dataset_zapify(ds, tx);
 	VERIFY0(zap_add(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object,
 	    DS_FIELD_REMAP_DEADLIST, sizeof (obj), 1, &obj, tx));
 }
 
 static void
 dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	VERIFY0(zap_remove(ds->ds_dir->dd_pool->dp_meta_objset,
 	    ds->ds_object, DS_FIELD_REMAP_DEADLIST, tx));
 }
 
 void
 dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	uint64_t remap_deadlist_object;
 	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(dsl_dataset_remap_deadlist_exists(ds));
 
 	remap_deadlist_object = ds->ds_remap_deadlist.dl_object;
 	dsl_deadlist_close(&ds->ds_remap_deadlist);
 	dsl_deadlist_free(spa_meta_objset(spa), remap_deadlist_object, tx);
 	dsl_dataset_unset_remap_deadlist_object(ds, tx);
 	spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
 }
 
 void
 dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	uint64_t remap_deadlist_obj;
 	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(MUTEX_HELD(&ds->ds_remap_deadlist_lock));
 	/*
 	 * Currently we only create remap deadlists when there are indirect
 	 * vdevs with referenced mappings.
 	 */
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
 
 	remap_deadlist_obj = dsl_deadlist_clone(
 	    &ds->ds_deadlist, UINT64_MAX,
 	    dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
 	dsl_dataset_set_remap_deadlist_object(ds,
 	    remap_deadlist_obj, tx);
 	dsl_deadlist_open(&ds->ds_remap_deadlist, spa_meta_objset(spa),
 	    remap_deadlist_obj);
 	spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
 }
 
 void
 dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps,
     uint64_t num_redact_snaps, dmu_tx_t *tx)
 {
 	uint64_t dsobj = ds->ds_object;
 	struct feature_type_uint64_array_arg *ftuaa =
 	    kmem_zalloc(sizeof (*ftuaa), KM_SLEEP);
 	ftuaa->length = (int64_t)num_redact_snaps;
 	if (num_redact_snaps > 0) {
 		ftuaa->array = kmem_alloc(num_redact_snaps * sizeof (uint64_t),
 		    KM_SLEEP);
 		memcpy(ftuaa->array, redact_snaps, num_redact_snaps *
 		    sizeof (uint64_t));
 	}
 	dsl_dataset_activate_feature(dsobj, SPA_FEATURE_REDACTED_DATASETS,
 	    ftuaa, tx);
 	ds->ds_feature[SPA_FEATURE_REDACTED_DATASETS] = ftuaa;
 }
 
 /*
  * Find and return (in *oldest_dsobj) the oldest snapshot of the dsobj
  * dataset whose birth time is >= min_txg.
  */
 int
 dsl_dataset_oldest_snapshot(spa_t *spa, uint64_t head_ds, uint64_t min_txg,
     uint64_t *oldest_dsobj)
 {
 	dsl_dataset_t *ds;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 
 	int error = dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 
 	while (prev_obj != 0 && min_txg < prev_obj_txg) {
 		dsl_dataset_rele(ds, FTAG);
 		if ((error = dsl_dataset_hold_obj(dp, prev_obj,
 		    FTAG, &ds)) != 0)
 			return (error);
 		prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 		prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	}
 	*oldest_dsobj = ds->ds_object;
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, UINT, ZMOD_RW,
 	"Max allowed record size");
 
 ZFS_MODULE_PARAM(zfs, zfs_, allow_redacted_dataset_mount, INT, ZMOD_RW,
 	"Allow mounting of redacted datasets");
 
 ZFS_MODULE_PARAM(zfs, zfs_, snapshot_history_enabled, INT, ZMOD_RW,
 	"Include snapshot events in pool history/events");
 
 EXPORT_SYMBOL(dsl_dataset_hold);
 EXPORT_SYMBOL(dsl_dataset_hold_flags);
 EXPORT_SYMBOL(dsl_dataset_hold_obj);
 EXPORT_SYMBOL(dsl_dataset_hold_obj_flags);
 EXPORT_SYMBOL(dsl_dataset_own);
 EXPORT_SYMBOL(dsl_dataset_own_obj);
 EXPORT_SYMBOL(dsl_dataset_name);
 EXPORT_SYMBOL(dsl_dataset_rele);
 EXPORT_SYMBOL(dsl_dataset_rele_flags);
 EXPORT_SYMBOL(dsl_dataset_disown);
 EXPORT_SYMBOL(dsl_dataset_tryown);
 EXPORT_SYMBOL(dsl_dataset_create_sync);
 EXPORT_SYMBOL(dsl_dataset_create_sync_dd);
 EXPORT_SYMBOL(dsl_dataset_snapshot_check);
 EXPORT_SYMBOL(dsl_dataset_snapshot_sync);
 EXPORT_SYMBOL(dsl_dataset_promote);
 EXPORT_SYMBOL(dsl_dataset_user_hold);
 EXPORT_SYMBOL(dsl_dataset_user_release);
 EXPORT_SYMBOL(dsl_dataset_get_holds);
 EXPORT_SYMBOL(dsl_dataset_get_blkptr);
 EXPORT_SYMBOL(dsl_dataset_get_spa);
 EXPORT_SYMBOL(dsl_dataset_modified_since_snap);
 EXPORT_SYMBOL(dsl_dataset_space_written);
 EXPORT_SYMBOL(dsl_dataset_space_wouldfree);
 EXPORT_SYMBOL(dsl_dataset_sync);
 EXPORT_SYMBOL(dsl_dataset_block_born);
 EXPORT_SYMBOL(dsl_dataset_block_kill);
 EXPORT_SYMBOL(dsl_dataset_dirty);
 EXPORT_SYMBOL(dsl_dataset_stats);
 EXPORT_SYMBOL(dsl_dataset_fast_stat);
 EXPORT_SYMBOL(dsl_dataset_space);
 EXPORT_SYMBOL(dsl_dataset_fsid_guid);
 EXPORT_SYMBOL(dsl_dsobj_to_dsname);
 EXPORT_SYMBOL(dsl_dataset_check_quota);
 EXPORT_SYMBOL(dsl_dataset_clone_swap_check_impl);
 EXPORT_SYMBOL(dsl_dataset_clone_swap_sync_impl);
diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c
index e6c8d4be13b4..eff1f7de7731 100644
--- a/module/zfs/dsl_deadlist.c
+++ b/module/zfs/dsl_deadlist.c
@@ -1,1113 +1,1112 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/zfs_context.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
 
 /*
  * Deadlist concurrency:
  *
  * Deadlists can only be modified from the syncing thread.
  *
  * Except for dsl_deadlist_insert(), it can only be modified with the
  * dp_config_rwlock held with RW_WRITER.
  *
  * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can
  * be called concurrently, from open context, with the dl_config_rwlock held
  * with RW_READER.
  *
  * Therefore, we only need to provide locking between dsl_deadlist_insert() and
  * the accessors, protecting:
  *     dl_phys->dl_used,comp,uncomp
  *     and protecting the dl_tree from being loaded.
  * The locking is provided by dl_lock.  Note that locking on the bpobj_t
  * provides its own locking, and dl_oldfmt is immutable.
  */
 
 /*
  * Livelist Overview
  * ================
  *
  * Livelists use the same 'deadlist_t' struct as deadlists and are also used
  * to track blkptrs over the lifetime of a dataset. Livelists however, belong
  * to clones and track the blkptrs that are clone-specific (were born after
  * the clone's creation). The exception is embedded block pointers which are
  * not included in livelists because they do not need to be freed.
  *
  * When it comes time to delete the clone, the livelist provides a quick
  * reference as to what needs to be freed. For this reason, livelists also track
  * when clone-specific blkptrs are freed before deletion to prevent double
  * frees. Each blkptr in a livelist is marked as a FREE or an ALLOC and the
  * deletion algorithm iterates backwards over the livelist, matching
  * FREE/ALLOC pairs and then freeing those ALLOCs which remain. livelists
  * are also updated in the case when blkptrs are remapped: the old version
  * of the blkptr is cancelled out with a FREE and the new version is tracked
  * with an ALLOC.
  *
  * To bound the amount of memory required for deletion, livelists over a
  * certain size are spread over multiple entries. Entries are grouped by
  * birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will
  * be in the same entry. This allows us to delete livelists incrementally
  * over multiple syncs, one entry at a time.
  *
  * During the lifetime of the clone, livelists can get extremely large.
  * Their size is managed by periodic condensing (preemptively cancelling out
  * FREE/ALLOC pairs). Livelists are disabled when a clone is promoted or when
  * the shared space between the clone and its origin is so small that it
  * doesn't make sense to use livelists anymore.
  */
 
 /*
  * The threshold sublist size at which we create a new sub-livelist for the
  * next txg. However, since blkptrs of the same transaction group must be in
  * the same sub-list, the actual sublist size may exceed this. When picking the
  * size we had to balance the fact that larger sublists mean fewer sublists
  * (decreasing the cost of insertion) against the consideration that sublists
  * will be loaded into memory and shouldn't take up an inordinate amount of
  * space. We settled on ~500000 entries, corresponding to roughly 128M.
  */
 uint64_t zfs_livelist_max_entries = 500000;
 
 /*
  * We can approximate how much of a performance gain a livelist will give us
  * based on the percentage of blocks shared between the clone and its origin.
  * 0 percent shared means that the clone has completely diverged and that the
  * old method is maximally effective: every read from the block tree will
  * result in lots of frees. Livelists give us gains when they track blocks
  * scattered across the tree, when one read in the old method might only
  * result in a few frees. Once the clone has been overwritten enough,
  * writes are no longer sparse and we'll no longer get much of a benefit from
  * tracking them with a livelist. We chose a lower limit of 75 percent shared
  * (25 percent overwritten). This means that 1/4 of all block pointers will be
  * freed (e.g. each read frees 256, out of a max of 1024) so we expect livelists
  * to make deletion 4x faster. Once the amount of shared space drops below this
  * threshold, the clone will revert to the old deletion method.
  */
 int zfs_livelist_min_percent_shared = 75;
 
 static int
 dsl_deadlist_compare(const void *arg1, const void *arg2)
 {
 	const dsl_deadlist_entry_t *dle1 = arg1;
 	const dsl_deadlist_entry_t *dle2 = arg2;
 
 	return (TREE_CMP(dle1->dle_mintxg, dle2->dle_mintxg));
 }
 
 static int
 dsl_deadlist_cache_compare(const void *arg1, const void *arg2)
 {
 	const dsl_deadlist_cache_entry_t *dlce1 = arg1;
 	const dsl_deadlist_cache_entry_t *dlce2 = arg2;
 
 	return (TREE_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg));
 }
 
 static void
 dsl_deadlist_load_tree(dsl_deadlist_t *dl)
 {
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	int error;
 
 	ASSERT(MUTEX_HELD(&dl->dl_lock));
 
 	ASSERT(!dl->dl_oldfmt);
 	if (dl->dl_havecache) {
 		/*
 		 * After loading the tree, the caller may modify the tree,
 		 * e.g. to add or remove nodes, or to make a node no longer
 		 * refer to the empty_bpobj.  These changes would make the
 		 * dl_cache incorrect.  Therefore we discard the cache here,
 		 * so that it can't become incorrect.
 		 */
 		dsl_deadlist_cache_entry_t *dlce;
 		void *cookie = NULL;
 		while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie))
 		    != NULL) {
 			kmem_free(dlce, sizeof (*dlce));
 		}
 		avl_destroy(&dl->dl_cache);
 		dl->dl_havecache = B_FALSE;
 	}
 	if (dl->dl_havetree)
 		return;
 
 	avl_create(&dl->dl_tree, dsl_deadlist_compare,
 	    sizeof (dsl_deadlist_entry_t),
 	    offsetof(dsl_deadlist_entry_t, dle_node));
 	for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
 	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
 	    zap_cursor_advance(&zc)) {
 		dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
 		dle->dle_mintxg = zfs_strtonum(za.za_name, NULL);
 
 		/*
 		 * Prefetch all the bpobj's so that we do that i/o
 		 * in parallel.  Then open them all in a second pass.
 		 */
 		dle->dle_bpobj.bpo_object = za.za_first_integer;
 		dmu_prefetch_dnode(dl->dl_os, dle->dle_bpobj.bpo_object,
 		    ZIO_PRIORITY_SYNC_READ);
 
 		avl_add(&dl->dl_tree, dle);
 	}
 	VERIFY3U(error, ==, ENOENT);
 	zap_cursor_fini(&zc);
 
 	for (dsl_deadlist_entry_t *dle = avl_first(&dl->dl_tree);
 	    dle != NULL; dle = AVL_NEXT(&dl->dl_tree, dle)) {
 		VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os,
 		    dle->dle_bpobj.bpo_object));
 	}
 	dl->dl_havetree = B_TRUE;
 }
 
 /*
  * Load only the non-empty bpobj's into the dl_cache.  The cache is an analog
  * of the dl_tree, but contains only non-empty_bpobj nodes from the ZAP. It
  * is used only for gathering space statistics.  The dl_cache has two
  * advantages over the dl_tree:
  *
  * 1. Loading the dl_cache is ~5x faster than loading the dl_tree (if it's
  * mostly empty_bpobj's), due to less CPU overhead to open the empty_bpobj
  * many times and to inquire about its (zero) space stats many times.
  *
  * 2. The dl_cache uses less memory than the dl_tree.  We only need to load
  * the dl_tree of snapshots when deleting a snapshot, after which we free the
  * dl_tree with dsl_deadlist_discard_tree
  */
 static void
 dsl_deadlist_load_cache(dsl_deadlist_t *dl)
 {
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	int error;
 
 	ASSERT(MUTEX_HELD(&dl->dl_lock));
 
 	ASSERT(!dl->dl_oldfmt);
 	if (dl->dl_havecache)
 		return;
 
 	uint64_t empty_bpobj = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj;
 
 	avl_create(&dl->dl_cache, dsl_deadlist_cache_compare,
 	    sizeof (dsl_deadlist_cache_entry_t),
 	    offsetof(dsl_deadlist_cache_entry_t, dlce_node));
 	for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
 	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
 	    zap_cursor_advance(&zc)) {
 		if (za.za_first_integer == empty_bpobj)
 			continue;
 		dsl_deadlist_cache_entry_t *dlce =
 		    kmem_zalloc(sizeof (*dlce), KM_SLEEP);
 		dlce->dlce_mintxg = zfs_strtonum(za.za_name, NULL);
 
 		/*
 		 * Prefetch all the bpobj's so that we do that i/o
 		 * in parallel.  Then open them all in a second pass.
 		 */
 		dlce->dlce_bpobj = za.za_first_integer;
 		dmu_prefetch_dnode(dl->dl_os, dlce->dlce_bpobj,
 		    ZIO_PRIORITY_SYNC_READ);
 		avl_add(&dl->dl_cache, dlce);
 	}
 	VERIFY3U(error, ==, ENOENT);
 	zap_cursor_fini(&zc);
 
 	for (dsl_deadlist_cache_entry_t *dlce = avl_first(&dl->dl_cache);
 	    dlce != NULL; dlce = AVL_NEXT(&dl->dl_cache, dlce)) {
 		bpobj_t bpo;
 		VERIFY0(bpobj_open(&bpo, dl->dl_os, dlce->dlce_bpobj));
 
 		VERIFY0(bpobj_space(&bpo,
 		    &dlce->dlce_bytes, &dlce->dlce_comp, &dlce->dlce_uncomp));
 		bpobj_close(&bpo);
 	}
 	dl->dl_havecache = B_TRUE;
 }
 
 /*
  * Discard the tree to save memory.
  */
 void
 dsl_deadlist_discard_tree(dsl_deadlist_t *dl)
 {
 	mutex_enter(&dl->dl_lock);
 
 	if (!dl->dl_havetree) {
 		mutex_exit(&dl->dl_lock);
 		return;
 	}
 	dsl_deadlist_entry_t *dle;
 	void *cookie = NULL;
 	while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) != NULL) {
 		bpobj_close(&dle->dle_bpobj);
 		kmem_free(dle, sizeof (*dle));
 	}
 	avl_destroy(&dl->dl_tree);
 
 	dl->dl_havetree = B_FALSE;
 	mutex_exit(&dl->dl_lock);
 }
 
 void
 dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args)
 {
 	dsl_deadlist_entry_t *dle;
 
 	ASSERT(dsl_deadlist_is_open(dl));
 
 	mutex_enter(&dl->dl_lock);
 	dsl_deadlist_load_tree(dl);
 	mutex_exit(&dl->dl_lock);
 	for (dle = avl_first(&dl->dl_tree); dle != NULL;
 	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
 		if (func(args, dle) != 0)
 			break;
 	}
 }
 
 void
 dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
 {
 	dmu_object_info_t doi;
 
 	ASSERT(!dsl_deadlist_is_open(dl));
 
 	mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
 	dl->dl_os = os;
 	dl->dl_object = object;
 	VERIFY0(dmu_bonus_hold(os, object, dl, &dl->dl_dbuf));
 	dmu_object_info_from_db(dl->dl_dbuf, &doi);
 	if (doi.doi_type == DMU_OT_BPOBJ) {
 		dmu_buf_rele(dl->dl_dbuf, dl);
 		dl->dl_dbuf = NULL;
 		dl->dl_oldfmt = B_TRUE;
 		VERIFY0(bpobj_open(&dl->dl_bpobj, os, object));
 		return;
 	}
 
 	dl->dl_oldfmt = B_FALSE;
 	dl->dl_phys = dl->dl_dbuf->db_data;
 	dl->dl_havetree = B_FALSE;
 	dl->dl_havecache = B_FALSE;
 }
 
 boolean_t
 dsl_deadlist_is_open(dsl_deadlist_t *dl)
 {
 	return (dl->dl_os != NULL);
 }
 
 void
 dsl_deadlist_close(dsl_deadlist_t *dl)
 {
 	ASSERT(dsl_deadlist_is_open(dl));
 	mutex_destroy(&dl->dl_lock);
 
 	if (dl->dl_oldfmt) {
 		dl->dl_oldfmt = B_FALSE;
 		bpobj_close(&dl->dl_bpobj);
 		dl->dl_os = NULL;
 		dl->dl_object = 0;
 		return;
 	}
 
 	if (dl->dl_havetree) {
 		dsl_deadlist_entry_t *dle;
 		void *cookie = NULL;
 		while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
 		    != NULL) {
 			bpobj_close(&dle->dle_bpobj);
 			kmem_free(dle, sizeof (*dle));
 		}
 		avl_destroy(&dl->dl_tree);
 	}
 	if (dl->dl_havecache) {
 		dsl_deadlist_cache_entry_t *dlce;
 		void *cookie = NULL;
 		while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie))
 		    != NULL) {
 			kmem_free(dlce, sizeof (*dlce));
 		}
 		avl_destroy(&dl->dl_cache);
 	}
 	dmu_buf_rele(dl->dl_dbuf, dl);
 	dl->dl_dbuf = NULL;
 	dl->dl_phys = NULL;
 	dl->dl_os = NULL;
 	dl->dl_object = 0;
 }
 
 uint64_t
 dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
 {
 	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
 		return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
 	return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
 	    sizeof (dsl_deadlist_phys_t), tx));
 }
 
 void
 dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
 {
 	dmu_object_info_t doi;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	int error;
 
 	VERIFY0(dmu_object_info(os, dlobj, &doi));
 	if (doi.doi_type == DMU_OT_BPOBJ) {
 		bpobj_free(os, dlobj, tx);
 		return;
 	}
 
 	for (zap_cursor_init(&zc, os, dlobj);
 	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
 	    zap_cursor_advance(&zc)) {
 		uint64_t obj = za.za_first_integer;
 		if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
 			bpobj_decr_empty(os, tx);
 		else
 			bpobj_free(os, obj, tx);
 	}
 	VERIFY3U(error, ==, ENOENT);
 	zap_cursor_fini(&zc);
 	VERIFY0(dmu_object_free(os, dlobj, tx));
 }
 
 static void
 dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
     const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
 {
 	ASSERT(MUTEX_HELD(&dl->dl_lock));
 	if (dle->dle_bpobj.bpo_object ==
 	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
 		uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
 		bpobj_close(&dle->dle_bpobj);
 		bpobj_decr_empty(dl->dl_os, tx);
 		VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
 		VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object,
 		    dle->dle_mintxg, obj, tx));
 	}
 	bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx);
 }
 
 static void
 dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
     uint64_t obj, dmu_tx_t *tx)
 {
 	ASSERT(MUTEX_HELD(&dl->dl_lock));
 	if (dle->dle_bpobj.bpo_object !=
 	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
 		bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
 	} else {
 		bpobj_close(&dle->dle_bpobj);
 		bpobj_decr_empty(dl->dl_os, tx);
 		VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
 		VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object,
 		    dle->dle_mintxg, obj, tx));
 	}
 }
 
 /*
  * Prefetch metadata required for dle_enqueue_subobj().
  */
 static void
 dle_prefetch_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
     uint64_t obj)
 {
 	if (dle->dle_bpobj.bpo_object !=
 	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj)
 		bpobj_prefetch_subobj(&dle->dle_bpobj, obj);
 }
 
 void
 dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	dsl_deadlist_entry_t dle_tofind;
 	dsl_deadlist_entry_t *dle;
 	avl_index_t where;
 
 	if (dl->dl_oldfmt) {
 		bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx);
 		return;
 	}
 
 	mutex_enter(&dl->dl_lock);
 	dsl_deadlist_load_tree(dl);
 
 	dmu_buf_will_dirty(dl->dl_dbuf, tx);
 
 	int sign = bp_freed ? -1 : +1;
 	dl->dl_phys->dl_used +=
 	    sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
 	dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp);
 	dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp);
 
-	dle_tofind.dle_mintxg = bp->blk_birth;
+	dle_tofind.dle_mintxg = BP_GET_LOGICAL_BIRTH(bp);
 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 	if (dle == NULL)
 		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
 	else
 		dle = AVL_PREV(&dl->dl_tree, dle);
 
 	if (dle == NULL) {
 		zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu",
-		    bp, (longlong_t)bp->blk_birth);
+		    bp, (longlong_t)BP_GET_LOGICAL_BIRTH(bp));
 		dle = avl_first(&dl->dl_tree);
 	}
 
 	ASSERT3P(dle, !=, NULL);
 	dle_enqueue(dl, dle, bp, bp_freed, tx);
 	mutex_exit(&dl->dl_lock);
 }
 
 int
 dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	dsl_deadlist_t *dl = arg;
 	dsl_deadlist_insert(dl, bp, B_FALSE, tx);
 	return (0);
 }
 
 int
 dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	dsl_deadlist_t *dl = arg;
 	dsl_deadlist_insert(dl, bp, B_TRUE, tx);
 	return (0);
 }
 
 /*
  * Insert new key in deadlist, which must be > all current entries.
  * mintxg is not inclusive.
  */
 void
 dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
 {
 	uint64_t obj;
 	dsl_deadlist_entry_t *dle;
 
 	if (dl->dl_oldfmt)
 		return;
 
 	dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
 	dle->dle_mintxg = mintxg;
 
 	mutex_enter(&dl->dl_lock);
 	dsl_deadlist_load_tree(dl);
 
 	obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
 	VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
 	avl_add(&dl->dl_tree, dle);
 
 	VERIFY0(zap_add_int_key(dl->dl_os, dl->dl_object,
 	    mintxg, obj, tx));
 	mutex_exit(&dl->dl_lock);
 }
 
 /*
  * Remove this key, merging its entries into the previous key.
  */
 void
 dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
 {
 	dsl_deadlist_entry_t dle_tofind;
 	dsl_deadlist_entry_t *dle, *dle_prev;
 
 	if (dl->dl_oldfmt)
 		return;
 	mutex_enter(&dl->dl_lock);
 	dsl_deadlist_load_tree(dl);
 
 	dle_tofind.dle_mintxg = mintxg;
 	dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
 	ASSERT3P(dle, !=, NULL);
 	dle_prev = AVL_PREV(&dl->dl_tree, dle);
 	ASSERT3P(dle_prev, !=, NULL);
 
 	dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);
 
 	avl_remove(&dl->dl_tree, dle);
 	bpobj_close(&dle->dle_bpobj);
 	kmem_free(dle, sizeof (*dle));
 
 	VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
 	mutex_exit(&dl->dl_lock);
 }
 
 /*
  * Remove a deadlist entry and all of its contents by removing the entry from
  * the deadlist's avl tree, freeing the entry's bpobj and adjusting the
  * deadlist's space accounting accordingly.
  */
 void
 dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
 {
 	uint64_t used, comp, uncomp;
 	dsl_deadlist_entry_t dle_tofind;
 	dsl_deadlist_entry_t *dle;
 	objset_t *os = dl->dl_os;
 
 	if (dl->dl_oldfmt)
 		return;
 
 	mutex_enter(&dl->dl_lock);
 	dsl_deadlist_load_tree(dl);
 
 	dle_tofind.dle_mintxg = mintxg;
 	dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
 	VERIFY3P(dle, !=, NULL);
 
 	avl_remove(&dl->dl_tree, dle);
 	VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx));
 	VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
 	dmu_buf_will_dirty(dl->dl_dbuf, tx);
 	dl->dl_phys->dl_used -= used;
 	dl->dl_phys->dl_comp -= comp;
 	dl->dl_phys->dl_uncomp -= uncomp;
 	if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) {
 		bpobj_decr_empty(os, tx);
 	} else {
 		bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
 	}
 	bpobj_close(&dle->dle_bpobj);
 	kmem_free(dle, sizeof (*dle));
 	mutex_exit(&dl->dl_lock);
 }
 
 /*
  * Clear out the contents of a deadlist_entry by freeing its bpobj,
  * replacing it with an empty bpobj and adjusting the deadlist's
  * space accounting
  */
 void
 dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl,
     dmu_tx_t *tx)
 {
 	uint64_t new_obj, used, comp, uncomp;
 	objset_t *os = dl->dl_os;
 
 	mutex_enter(&dl->dl_lock);
 	VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx));
 	VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
 	dmu_buf_will_dirty(dl->dl_dbuf, tx);
 	dl->dl_phys->dl_used -= used;
 	dl->dl_phys->dl_comp -= comp;
 	dl->dl_phys->dl_uncomp -= uncomp;
 	if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj)
 		bpobj_decr_empty(os, tx);
 	else
 		bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
 	bpobj_close(&dle->dle_bpobj);
 	new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx);
 	VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj));
 	VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg,
 	    new_obj, tx));
 	ASSERT(bpobj_is_empty(&dle->dle_bpobj));
 	mutex_exit(&dl->dl_lock);
 }
 
 /*
  * Return the first entry in deadlist's avl tree
  */
 dsl_deadlist_entry_t *
 dsl_deadlist_first(dsl_deadlist_t *dl)
 {
 	dsl_deadlist_entry_t *dle;
 
 	mutex_enter(&dl->dl_lock);
 	dsl_deadlist_load_tree(dl);
 	dle = avl_first(&dl->dl_tree);
 	mutex_exit(&dl->dl_lock);
 
 	return (dle);
 }
 
 /*
  * Return the last entry in deadlist's avl tree
  */
 dsl_deadlist_entry_t *
 dsl_deadlist_last(dsl_deadlist_t *dl)
 {
 	dsl_deadlist_entry_t *dle;
 
 	mutex_enter(&dl->dl_lock);
 	dsl_deadlist_load_tree(dl);
 	dle = avl_last(&dl->dl_tree);
 	mutex_exit(&dl->dl_lock);
 
 	return (dle);
 }
 
 /*
  * Walk ds's snapshots to regenerate generate ZAP & AVL.
  */
 static void
 dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
     uint64_t mrs_obj, dmu_tx_t *tx)
 {
 	dsl_deadlist_t dl = { 0 };
 	dsl_pool_t *dp = dmu_objset_pool(os);
 
 	dsl_deadlist_open(&dl, os, dlobj);
 	if (dl.dl_oldfmt) {
 		dsl_deadlist_close(&dl);
 		return;
 	}
 
 	while (mrs_obj != 0) {
 		dsl_dataset_t *ds;
 		VERIFY0(dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
 		dsl_deadlist_add_key(&dl,
 		    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
 		mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 		dsl_dataset_rele(ds, FTAG);
 	}
 	dsl_deadlist_close(&dl);
 }
 
 uint64_t
 dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
     uint64_t mrs_obj, dmu_tx_t *tx)
 {
 	dsl_deadlist_entry_t *dle;
 	uint64_t newobj;
 
 	newobj = dsl_deadlist_alloc(dl->dl_os, tx);
 
 	if (dl->dl_oldfmt) {
 		dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
 		return (newobj);
 	}
 
 	mutex_enter(&dl->dl_lock);
 	dsl_deadlist_load_tree(dl);
 
 	for (dle = avl_first(&dl->dl_tree); dle;
 	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
 		uint64_t obj;
 
 		if (dle->dle_mintxg >= maxtxg)
 			break;
 
 		obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
 		VERIFY0(zap_add_int_key(dl->dl_os, newobj,
 		    dle->dle_mintxg, obj, tx));
 	}
 	mutex_exit(&dl->dl_lock);
 	return (newobj);
 }
 
 void
 dsl_deadlist_space(dsl_deadlist_t *dl,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
 	ASSERT(dsl_deadlist_is_open(dl));
 	if (dl->dl_oldfmt) {
 		VERIFY0(bpobj_space(&dl->dl_bpobj,
 		    usedp, compp, uncompp));
 		return;
 	}
 
 	mutex_enter(&dl->dl_lock);
 	*usedp = dl->dl_phys->dl_used;
 	*compp = dl->dl_phys->dl_comp;
 	*uncompp = dl->dl_phys->dl_uncomp;
 	mutex_exit(&dl->dl_lock);
 }
 
 /*
  * return space used in the range (mintxg, maxtxg].
  * Includes maxtxg, does not include mintxg.
  * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
  * UINT64_MAX).
  */
 void
 dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
 	dsl_deadlist_cache_entry_t *dlce;
 	dsl_deadlist_cache_entry_t dlce_tofind;
 	avl_index_t where;
 
 	if (dl->dl_oldfmt) {
 		VERIFY0(bpobj_space_range(&dl->dl_bpobj,
 		    mintxg, maxtxg, usedp, compp, uncompp));
 		return;
 	}
 
 	*usedp = *compp = *uncompp = 0;
 
 	mutex_enter(&dl->dl_lock);
 	dsl_deadlist_load_cache(dl);
 	dlce_tofind.dlce_mintxg = mintxg;
 	dlce = avl_find(&dl->dl_cache, &dlce_tofind, &where);
 
 	/*
 	 * If this mintxg doesn't exist, it may be an empty_bpobj which
 	 * is omitted from the sparse tree.  Start at the next non-empty
 	 * entry.
 	 */
 	if (dlce == NULL)
 		dlce = avl_nearest(&dl->dl_cache, where, AVL_AFTER);
 
 	for (; dlce && dlce->dlce_mintxg < maxtxg;
 	    dlce = AVL_NEXT(&dl->dl_tree, dlce)) {
 		*usedp += dlce->dlce_bytes;
 		*compp += dlce->dlce_comp;
 		*uncompp += dlce->dlce_uncomp;
 	}
 
 	mutex_exit(&dl->dl_lock);
 }
 
 static void
 dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
     dmu_tx_t *tx)
 {
 	dsl_deadlist_entry_t dle_tofind;
 	dsl_deadlist_entry_t *dle;
 	avl_index_t where;
 	uint64_t used, comp, uncomp;
 	bpobj_t bpo;
 
 	ASSERT(MUTEX_HELD(&dl->dl_lock));
 
 	VERIFY0(bpobj_open(&bpo, dl->dl_os, obj));
 	VERIFY0(bpobj_space(&bpo, &used, &comp, &uncomp));
 	bpobj_close(&bpo);
 
 	dsl_deadlist_load_tree(dl);
 
 	dmu_buf_will_dirty(dl->dl_dbuf, tx);
 	dl->dl_phys->dl_used += used;
 	dl->dl_phys->dl_comp += comp;
 	dl->dl_phys->dl_uncomp += uncomp;
 
 	dle_tofind.dle_mintxg = birth;
 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 	if (dle == NULL)
 		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
 	dle_enqueue_subobj(dl, dle, obj, tx);
 }
 
 /*
  * Prefetch metadata required for dsl_deadlist_insert_bpobj().
  */
 static void
 dsl_deadlist_prefetch_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth)
 {
 	dsl_deadlist_entry_t dle_tofind;
 	dsl_deadlist_entry_t *dle;
 	avl_index_t where;
 
 	ASSERT(MUTEX_HELD(&dl->dl_lock));
 
 	dsl_deadlist_load_tree(dl);
 
 	dle_tofind.dle_mintxg = birth;
 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 	if (dle == NULL)
 		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
 	dle_prefetch_subobj(dl, dle, obj);
 }
 
 static int
 dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	dsl_deadlist_t *dl = arg;
 	dsl_deadlist_insert(dl, bp, bp_freed, tx);
 	return (0);
 }
 
 /*
  * Merge the deadlist pointed to by 'obj' into dl.  obj will be left as
  * an empty deadlist.
  */
 void
 dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
 {
 	zap_cursor_t zc, pzc;
 	zap_attribute_t *za, *pza;
 	dmu_buf_t *bonus;
 	dsl_deadlist_phys_t *dlp;
 	dmu_object_info_t doi;
 	int error, perror, i;
 
 	VERIFY0(dmu_object_info(dl->dl_os, obj, &doi));
 	if (doi.doi_type == DMU_OT_BPOBJ) {
 		bpobj_t bpo;
 		VERIFY0(bpobj_open(&bpo, dl->dl_os, obj));
 		VERIFY0(bpobj_iterate(&bpo, dsl_deadlist_insert_cb, dl, tx));
 		bpobj_close(&bpo);
 		return;
 	}
 
 	za = kmem_alloc(sizeof (*za), KM_SLEEP);
 	pza = kmem_alloc(sizeof (*pza), KM_SLEEP);
 
 	mutex_enter(&dl->dl_lock);
 	/*
 	 * Prefetch up to 128 deadlists first and then more as we progress.
 	 * The limit is a balance between ARC use and diminishing returns.
 	 */
 	for (zap_cursor_init(&pzc, dl->dl_os, obj), i = 0;
 	    (perror = zap_cursor_retrieve(&pzc, pza)) == 0 && i < 128;
 	    zap_cursor_advance(&pzc), i++) {
 		dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer,
 		    zfs_strtonum(pza->za_name, NULL));
 	}
 	for (zap_cursor_init(&zc, dl->dl_os, obj);
 	    (error = zap_cursor_retrieve(&zc, za)) == 0;
 	    zap_cursor_advance(&zc)) {
 		dsl_deadlist_insert_bpobj(dl, za->za_first_integer,
 		    zfs_strtonum(za->za_name, NULL), tx);
 		VERIFY0(zap_remove(dl->dl_os, obj, za->za_name, tx));
 		if (perror == 0) {
 			dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer,
 			    zfs_strtonum(pza->za_name, NULL));
 			zap_cursor_advance(&pzc);
 			perror = zap_cursor_retrieve(&pzc, pza);
 		}
 	}
 	VERIFY3U(error, ==, ENOENT);
 	zap_cursor_fini(&zc);
 	zap_cursor_fini(&pzc);
 
 	VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
 	dlp = bonus->db_data;
 	dmu_buf_will_dirty(bonus, tx);
 	memset(dlp, 0, sizeof (*dlp));
 	dmu_buf_rele(bonus, FTAG);
 	mutex_exit(&dl->dl_lock);
 
 	kmem_free(za, sizeof (*za));
 	kmem_free(pza, sizeof (*pza));
 }
 
 /*
  * Remove entries on dl that are born > mintxg, and put them on the bpobj.
  */
 void
 dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
     dmu_tx_t *tx)
 {
 	dsl_deadlist_entry_t dle_tofind;
 	dsl_deadlist_entry_t *dle, *pdle;
 	avl_index_t where;
 	int i;
 
 	ASSERT(!dl->dl_oldfmt);
 
 	mutex_enter(&dl->dl_lock);
 	dmu_buf_will_dirty(dl->dl_dbuf, tx);
 	dsl_deadlist_load_tree(dl);
 
 	dle_tofind.dle_mintxg = mintxg;
 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 	if (dle == NULL)
 		dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
 	/*
 	 * Prefetch up to 128 deadlists first and then more as we progress.
 	 * The limit is a balance between ARC use and diminishing returns.
 	 */
 	for (pdle = dle, i = 0; pdle && i < 128; i++) {
 		bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
 		pdle = AVL_NEXT(&dl->dl_tree, pdle);
 	}
 	while (dle) {
 		uint64_t used, comp, uncomp;
 		dsl_deadlist_entry_t *dle_next;
 
 		bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
 		if (pdle) {
 			bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
 			pdle = AVL_NEXT(&dl->dl_tree, pdle);
 		}
 
 		VERIFY0(bpobj_space(&dle->dle_bpobj,
 		    &used, &comp, &uncomp));
 		ASSERT3U(dl->dl_phys->dl_used, >=, used);
 		ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
 		ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
 		dl->dl_phys->dl_used -= used;
 		dl->dl_phys->dl_comp -= comp;
 		dl->dl_phys->dl_uncomp -= uncomp;
 
 		VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object,
 		    dle->dle_mintxg, tx));
 
 		dle_next = AVL_NEXT(&dl->dl_tree, dle);
 		avl_remove(&dl->dl_tree, dle);
 		bpobj_close(&dle->dle_bpobj);
 		kmem_free(dle, sizeof (*dle));
 		dle = dle_next;
 	}
 	mutex_exit(&dl->dl_lock);
 }
 
 typedef struct livelist_entry {
 	blkptr_t le_bp;
 	uint32_t le_refcnt;
 	avl_node_t le_node;
 } livelist_entry_t;
 
 static int
 livelist_compare(const void *larg, const void *rarg)
 {
 	const blkptr_t *l = &((livelist_entry_t *)larg)->le_bp;
 	const blkptr_t *r = &((livelist_entry_t *)rarg)->le_bp;
 
 	/* Sort them according to dva[0] */
 	uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
 	uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);
 
 	if (l_dva0_vdev != r_dva0_vdev)
 		return (TREE_CMP(l_dva0_vdev, r_dva0_vdev));
 
 	/* if vdevs are equal, sort by offsets. */
 	uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
 	uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
 	return (TREE_CMP(l_dva0_offset, r_dva0_offset));
 }
 
 struct livelist_iter_arg {
 	avl_tree_t *avl;
 	bplist_t *to_free;
 	zthr_t *t;
 };
 
 /*
  * Expects an AVL tree which is incrementally filled will FREE blkptrs
  * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a
  * corresponding FREE are stored in the supplied bplist.
  *
  * Note that multiple FREE and ALLOC entries for the same blkptr may be
  * encountered when dedup or block cloning is involved.  For this reason we
  * keep a refcount for all the FREE entries of each blkptr and ensure that
  * each of those FREE entries has a corresponding ALLOC preceding it.
  */
 static int
 dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	struct livelist_iter_arg *lia = arg;
 	avl_tree_t *avl = lia->avl;
 	bplist_t *to_free = lia->to_free;
 	zthr_t *t = lia->t;
 	ASSERT(tx == NULL);
 
 	if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t)))
 		return (SET_ERROR(EINTR));
 
 	livelist_entry_t node;
 	node.le_bp = *bp;
 	livelist_entry_t *found = avl_find(avl, &node, NULL);
 	if (found) {
 		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(&found->le_bp));
 		ASSERT3U(BP_GET_CHECKSUM(bp), ==,
 		    BP_GET_CHECKSUM(&found->le_bp));
-		ASSERT3U(BP_PHYSICAL_BIRTH(bp), ==,
-		    BP_PHYSICAL_BIRTH(&found->le_bp));
+		ASSERT3U(BP_GET_BIRTH(bp), ==, BP_GET_BIRTH(&found->le_bp));
 	}
 	if (bp_freed) {
 		if (found == NULL) {
 			/* first free entry for this blkptr */
 			livelist_entry_t *e =
 			    kmem_alloc(sizeof (livelist_entry_t), KM_SLEEP);
 			e->le_bp = *bp;
 			e->le_refcnt = 1;
 			avl_add(avl, e);
 		} else {
 			/*
 			 * Deduped or cloned block free.  We could assert D bit
 			 * for dedup, but there is no such one for cloning.
 			 */
 			ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt);
 			found->le_refcnt++;
 		}
 	} else {
 		if (found == NULL) {
 			/* block is currently marked as allocated */
 			bplist_append(to_free, bp);
 		} else {
 			/* alloc matches a free entry */
 			ASSERT3U(found->le_refcnt, !=, 0);
 			found->le_refcnt--;
 			if (found->le_refcnt == 0) {
 				/* all tracked free pairs have been matched */
 				avl_remove(avl, found);
 				kmem_free(found, sizeof (livelist_entry_t));
 			}
 		}
 	}
 	return (0);
 }
 
 /*
  * Accepts a bpobj and a bplist. Will insert into the bplist the blkptrs
  * which have an ALLOC entry but no matching FREE
  */
 int
 dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t,
     uint64_t *size)
 {
 	avl_tree_t avl;
 	avl_create(&avl, livelist_compare, sizeof (livelist_entry_t),
 	    offsetof(livelist_entry_t, le_node));
 
 	/* process the sublist */
 	struct livelist_iter_arg arg = {
 	    .avl = &avl,
 	    .to_free = to_free,
 	    .t = t
 	};
 	int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size);
 	VERIFY(err != 0 || avl_numnodes(&avl) == 0);
 
 	void *cookie = NULL;
 	livelist_entry_t *le = NULL;
 	while ((le = avl_destroy_nodes(&avl, &cookie)) != NULL) {
 		kmem_free(le, sizeof (livelist_entry_t));
 	}
 	avl_destroy(&avl);
 	return (err);
 }
 
 ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, U64, ZMOD_RW,
 	"Size to start the next sub-livelist in a livelist");
 
 ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, min_percent_shared, INT, ZMOD_RW,
 	"Threshold at which livelist is disabled");
diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c
index d9d88a981e05..d4a6e5b6e9fd 100644
--- a/module/zfs/dsl_destroy.c
+++ b/module/zfs/dsl_destroy.c
@@ -1,1301 +1,1304 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright (c) 2013 by Joyent, Inc. All rights reserved.
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dsl_userhold.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_destroy.h>
 #include <sys/dsl_bookmark.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dir.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_scan.h>
 #include <sys/dmu_objset.h>
 #include <sys/zap.h>
 #include <sys/zfeature.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/dsl_deleg.h>
 #include <sys/dmu_impl.h>
 #include <sys/zvol.h>
 #include <sys/zcp.h>
 #include <sys/dsl_deadlist.h>
 #include <sys/zthr.h>
 #include <sys/spa_impl.h>
 
 extern int zfs_snapshot_history_enabled;
 
 int
 dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
 {
 	if (!ds->ds_is_snapshot)
 		return (SET_ERROR(EINVAL));
 
 	if (dsl_dataset_long_held(ds))
 		return (SET_ERROR(EBUSY));
 
 	/*
 	 * Only allow deferred destroy on pools that support it.
 	 * NOTE: deferred destroy is only supported on snapshots.
 	 */
 	if (defer) {
 		if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
 		    SPA_VERSION_USERREFS)
 			return (SET_ERROR(ENOTSUP));
 		return (0);
 	}
 
 	/*
 	 * If this snapshot has an elevated user reference count,
 	 * we can't destroy it yet.
 	 */
 	if (ds->ds_userrefs > 0)
 		return (SET_ERROR(EBUSY));
 
 	/*
 	 * Can't delete a branch point.
 	 */
 	if (dsl_dataset_phys(ds)->ds_num_children > 1)
 		return (SET_ERROR(EEXIST));
 
 	return (0);
 }
 
 int
 dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_destroy_snapshot_arg_t *ddsa = arg;
 	const char *dsname = ddsa->ddsa_name;
 	boolean_t defer = ddsa->ddsa_defer;
 
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	int error = 0;
 	dsl_dataset_t *ds;
 
 	error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
 
 	/*
 	 * If the snapshot does not exist, silently ignore it, and
 	 * dsl_destroy_snapshot_sync() will be a no-op
 	 * (it's "already destroyed").
 	 */
 	if (error == ENOENT)
 		return (0);
 
 	if (error == 0) {
 		error = dsl_destroy_snapshot_check_impl(ds, defer);
 		dsl_dataset_rele(ds, FTAG);
 	}
 
 	return (error);
 }
 
 struct process_old_arg {
 	dsl_dataset_t *ds;
 	dsl_dataset_t *ds_prev;
 	boolean_t after_branch_point;
 	zio_t *pio;
 	uint64_t used, comp, uncomp;
 };
 
 static int
 process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
 {
 	struct process_old_arg *poa = arg;
 	dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
 
 	ASSERT(!BP_IS_HOLE(bp));
 
-	if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
+	if (BP_GET_LOGICAL_BIRTH(bp) <=
+	    dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
 		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx);
 		if (poa->ds_prev && !poa->after_branch_point &&
-		    bp->blk_birth >
+		    BP_GET_LOGICAL_BIRTH(bp) >
 		    dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
 			dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
 			    bp_get_dsize_sync(dp->dp_spa, bp);
 		}
 	} else {
 		poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
 		poa->comp += BP_GET_PSIZE(bp);
 		poa->uncomp += BP_GET_UCSIZE(bp);
 		dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
 	}
 	return (0);
 }
 
 static void
 process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
     dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
 {
 	struct process_old_arg poa = { 0 };
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t deadlist_obj;
 
 	ASSERT(ds->ds_deadlist.dl_oldfmt);
 	ASSERT(ds_next->ds_deadlist.dl_oldfmt);
 
 	poa.ds = ds;
 	poa.ds_prev = ds_prev;
 	poa.after_branch_point = after_branch_point;
 	poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
 	    process_old_cb, &poa, tx));
 	VERIFY0(zio_wait(poa.pio));
 	ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes);
 
 	/* change snapused */
 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
 	    -poa.used, -poa.comp, -poa.uncomp, tx);
 
 	/* swap next's deadlist to our deadlist */
 	dsl_deadlist_close(&ds->ds_deadlist);
 	dsl_deadlist_close(&ds_next->ds_deadlist);
 	deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
 	dsl_dataset_phys(ds)->ds_deadlist_obj =
 	    dsl_dataset_phys(ds_next)->ds_deadlist_obj;
 	dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj;
 	dsl_deadlist_open(&ds->ds_deadlist, mos,
 	    dsl_dataset_phys(ds)->ds_deadlist_obj);
 	dsl_deadlist_open(&ds_next->ds_deadlist, mos,
 	    dsl_dataset_phys(ds_next)->ds_deadlist_obj);
 }
 
 typedef struct remaining_clones_key {
 	dsl_dataset_t *rck_clone;
 	list_node_t rck_node;
 } remaining_clones_key_t;
 
 static remaining_clones_key_t *
 rck_alloc(dsl_dataset_t *clone)
 {
 	remaining_clones_key_t *rck = kmem_alloc(sizeof (*rck), KM_SLEEP);
 	rck->rck_clone = clone;
 	return (rck);
 }
 
 static void
 dsl_dir_remove_clones_key_impl(dsl_dir_t *dd, uint64_t mintxg, dmu_tx_t *tx,
     list_t *stack, const void *tag)
 {
 	objset_t *mos = dd->dd_pool->dp_meta_objset;
 
 	/*
 	 * If it is the old version, dd_clones doesn't exist so we can't
 	 * find the clones, but dsl_deadlist_remove_key() is a no-op so it
 	 * doesn't matter.
 	 */
 	if (dsl_dir_phys(dd)->dd_clones == 0)
 		return;
 
 	zap_cursor_t *zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
 	zap_attribute_t *za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 
 	for (zap_cursor_init(zc, mos, dsl_dir_phys(dd)->dd_clones);
 	    zap_cursor_retrieve(zc, za) == 0;
 	    zap_cursor_advance(zc)) {
 		dsl_dataset_t *clone;
 
 		VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
 		    za->za_first_integer, tag, &clone));
 
 		if (clone->ds_dir->dd_origin_txg > mintxg) {
 			dsl_deadlist_remove_key(&clone->ds_deadlist,
 			    mintxg, tx);
 
 			if (dsl_dataset_remap_deadlist_exists(clone)) {
 				dsl_deadlist_remove_key(
 				    &clone->ds_remap_deadlist, mintxg, tx);
 			}
 
 			list_insert_head(stack, rck_alloc(clone));
 		} else {
 			dsl_dataset_rele(clone, tag);
 		}
 	}
 	zap_cursor_fini(zc);
 
 	kmem_free(za, sizeof (zap_attribute_t));
 	kmem_free(zc, sizeof (zap_cursor_t));
 }
 
 void
 dsl_dir_remove_clones_key(dsl_dir_t *top_dd, uint64_t mintxg, dmu_tx_t *tx)
 {
 	list_t stack;
 
 	list_create(&stack, sizeof (remaining_clones_key_t),
 	    offsetof(remaining_clones_key_t, rck_node));
 
 	dsl_dir_remove_clones_key_impl(top_dd, mintxg, tx, &stack, FTAG);
 	for (remaining_clones_key_t *rck = list_remove_head(&stack);
 	    rck != NULL; rck = list_remove_head(&stack)) {
 		dsl_dataset_t *clone = rck->rck_clone;
 		dsl_dir_t *clone_dir = clone->ds_dir;
 
 		kmem_free(rck, sizeof (*rck));
 
 		dsl_dir_remove_clones_key_impl(clone_dir, mintxg, tx,
 		    &stack, FTAG);
 		dsl_dataset_rele(clone, FTAG);
 	}
 
 	list_destroy(&stack);
 }
 
 static void
 dsl_destroy_snapshot_handle_remaps(dsl_dataset_t *ds, dsl_dataset_t *ds_next,
     dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 
 	/* Move blocks to be obsoleted to pool's obsolete list. */
 	if (dsl_dataset_remap_deadlist_exists(ds_next)) {
 		if (!bpobj_is_open(&dp->dp_obsolete_bpobj))
 			dsl_pool_create_obsolete_bpobj(dp, tx);
 
 		dsl_deadlist_move_bpobj(&ds_next->ds_remap_deadlist,
 		    &dp->dp_obsolete_bpobj,
 		    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
 	}
 
 	/* Merge our deadlist into next's and free it. */
 	if (dsl_dataset_remap_deadlist_exists(ds)) {
 		uint64_t remap_deadlist_object =
 		    dsl_dataset_get_remap_deadlist_object(ds);
 		ASSERT(remap_deadlist_object != 0);
 
 		mutex_enter(&ds_next->ds_remap_deadlist_lock);
 		if (!dsl_dataset_remap_deadlist_exists(ds_next))
 			dsl_dataset_create_remap_deadlist(ds_next, tx);
 		mutex_exit(&ds_next->ds_remap_deadlist_lock);
 
 		dsl_deadlist_merge(&ds_next->ds_remap_deadlist,
 		    remap_deadlist_object, tx);
 		dsl_dataset_destroy_remap_deadlist(ds, tx);
 	}
 }
 
 void
 dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
 {
 	int after_branch_point = FALSE;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 	dsl_dataset_t *ds_prev = NULL;
 	uint64_t obj;
 
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
-	ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
+	ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=,
+	    tx->tx_txg);
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 	ASSERT(zfs_refcount_is_zero(&ds->ds_longholds));
 
 	if (defer &&
 	    (ds->ds_userrefs > 0 ||
 	    dsl_dataset_phys(ds)->ds_num_children > 1)) {
 		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY;
 		if (zfs_snapshot_history_enabled) {
 			spa_history_log_internal_ds(ds, "defer_destroy", tx,
 			    " ");
 		}
 		return;
 	}
 
 	ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
 
 	if (zfs_snapshot_history_enabled) {
 		/* We need to log before removing it from the namespace. */
 		spa_history_log_internal_ds(ds, "destroy", tx, " ");
 	}
 
 	dsl_scan_ds_destroyed(ds, tx);
 
 	obj = ds->ds_object;
 
 	boolean_t book_exists = dsl_bookmark_ds_destroyed(ds, tx);
 
 	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
 		if (dsl_dataset_feature_is_active(ds, f))
 			dsl_dataset_deactivate_feature(ds, f, tx);
 	}
 	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 		ASSERT3P(ds->ds_prev, ==, NULL);
 		VERIFY0(dsl_dataset_hold_obj(dp,
 		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev));
 		after_branch_point =
 		    (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj);
 
 		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
 		if (after_branch_point &&
 		    dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) {
 			dsl_dataset_remove_from_next_clones(ds_prev, obj, tx);
 			if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
 				VERIFY0(zap_add_int(mos,
 				    dsl_dataset_phys(ds_prev)->
 				    ds_next_clones_obj,
 				    dsl_dataset_phys(ds)->ds_next_snap_obj,
 				    tx));
 			}
 		}
 		if (!after_branch_point) {
 			dsl_dataset_phys(ds_prev)->ds_next_snap_obj =
 			    dsl_dataset_phys(ds)->ds_next_snap_obj;
 		}
 	}
 
 	dsl_dataset_t *ds_next;
 	uint64_t old_unique;
 	uint64_t used = 0, comp = 0, uncomp = 0;
 
 	VERIFY0(dsl_dataset_hold_obj(dp,
 	    dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next));
 	ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj);
 
 	old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes;
 
 	dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
 	dsl_dataset_phys(ds_next)->ds_prev_snap_obj =
 	    dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	dsl_dataset_phys(ds_next)->ds_prev_snap_txg =
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg;
 	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
 	    ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0);
 
 	if (ds_next->ds_deadlist.dl_oldfmt) {
 		process_old_deadlist(ds, ds_prev, ds_next,
 		    after_branch_point, tx);
 	} else {
 		/* Adjust prev's unique space. */
 		if (ds_prev && !after_branch_point) {
 			dsl_deadlist_space_range(&ds_next->ds_deadlist,
 			    dsl_dataset_phys(ds_prev)->ds_prev_snap_txg,
 			    dsl_dataset_phys(ds)->ds_prev_snap_txg,
 			    &used, &comp, &uncomp);
 			dsl_dataset_phys(ds_prev)->ds_unique_bytes += used;
 		}
 
 		/* Adjust snapused. */
 		dsl_deadlist_space_range(&ds_next->ds_deadlist,
 		    dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX,
 		    &used, &comp, &uncomp);
 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
 		    -used, -comp, -uncomp, tx);
 
 		/* Move blocks to be freed to pool's free list. */
 		dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
 		    &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg,
 		    tx);
 		dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
 		    DD_USED_HEAD, used, comp, uncomp, tx);
 
 		/* Merge our deadlist into next's and free it. */
 		dsl_deadlist_merge(&ds_next->ds_deadlist,
 		    dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
 
 		/*
 		 * We are done with the deadlist tree (generated/used
 		 * by dsl_deadlist_move_bpobj() and dsl_deadlist_merge()).
 		 * Discard it to save memory.
 		 */
 		dsl_deadlist_discard_tree(&ds_next->ds_deadlist);
 	}
 
 	dsl_deadlist_close(&ds->ds_deadlist);
 	dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
 
 	dsl_destroy_snapshot_handle_remaps(ds, ds_next, tx);
 
 	if (!book_exists) {
 		/* Collapse range in clone heads */
 		dsl_dir_remove_clones_key(ds->ds_dir,
 		    dsl_dataset_phys(ds)->ds_creation_txg, tx);
 	}
 
 	if (ds_next->ds_is_snapshot) {
 		dsl_dataset_t *ds_nextnext;
 
 		/*
 		 * Update next's unique to include blocks which
 		 * were previously shared by only this snapshot
 		 * and it.  Those blocks will be born after the
 		 * prev snap and before this snap, and will have
 		 * died after the next snap and before the one
 		 * after that (ie. be on the snap after next's
 		 * deadlist).
 		 */
 		VERIFY0(dsl_dataset_hold_obj(dp,
 		    dsl_dataset_phys(ds_next)->ds_next_snap_obj,
 		    FTAG, &ds_nextnext));
 		dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
 		    dsl_dataset_phys(ds)->ds_prev_snap_txg,
 		    dsl_dataset_phys(ds)->ds_creation_txg,
 		    &used, &comp, &uncomp);
 		dsl_dataset_phys(ds_next)->ds_unique_bytes += used;
 		dsl_dataset_rele(ds_nextnext, FTAG);
 		ASSERT3P(ds_next->ds_prev, ==, NULL);
 
 		/* Collapse range in this head. */
 		dsl_dataset_t *hds;
 		VERIFY0(dsl_dataset_hold_obj(dp,
 		    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
 		    FTAG, &hds));
 		if (!book_exists) {
 			/* Collapse range in this head. */
 			dsl_deadlist_remove_key(&hds->ds_deadlist,
 			    dsl_dataset_phys(ds)->ds_creation_txg, tx);
 		}
 		if (dsl_dataset_remap_deadlist_exists(hds)) {
 			dsl_deadlist_remove_key(&hds->ds_remap_deadlist,
 			    dsl_dataset_phys(ds)->ds_creation_txg, tx);
 		}
 		dsl_dataset_rele(hds, FTAG);
 
 	} else {
 		ASSERT3P(ds_next->ds_prev, ==, ds);
 		dsl_dataset_rele(ds_next->ds_prev, ds_next);
 		ds_next->ds_prev = NULL;
 		if (ds_prev) {
 			VERIFY0(dsl_dataset_hold_obj(dp,
 			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
 			    ds_next, &ds_next->ds_prev));
 		}
 
 		dsl_dataset_recalc_head_uniq(ds_next);
 
 		/*
 		 * Reduce the amount of our unconsumed refreservation
 		 * being charged to our parent by the amount of
 		 * new unique data we have gained.
 		 */
 		if (old_unique < ds_next->ds_reserved) {
 			int64_t mrsdelta;
 			uint64_t new_unique =
 			    dsl_dataset_phys(ds_next)->ds_unique_bytes;
 
 			ASSERT(old_unique <= new_unique);
 			mrsdelta = MIN(new_unique - old_unique,
 			    ds_next->ds_reserved - old_unique);
 			dsl_dir_diduse_space(ds->ds_dir,
 			    DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
 		}
 	}
 	dsl_dataset_rele(ds_next, FTAG);
 
 	/*
 	 * This must be done after the dsl_traverse(), because it will
 	 * re-open the objset.
 	 */
 	if (ds->ds_objset) {
 		dmu_objset_evict(ds->ds_objset);
 		ds->ds_objset = NULL;
 	}
 
 	/* remove from snapshot namespace */
 	dsl_dataset_t *ds_head;
 	ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0);
 	VERIFY0(dsl_dataset_hold_obj(dp,
 	    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head));
 	VERIFY0(dsl_dataset_get_snapname(ds));
 #ifdef ZFS_DEBUG
 	{
 		uint64_t val;
 		int err;
 
 		err = dsl_dataset_snap_lookup(ds_head,
 		    ds->ds_snapname, &val);
 		ASSERT0(err);
 		ASSERT3U(val, ==, obj);
 	}
 #endif
 	VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE));
 	dsl_dataset_rele(ds_head, FTAG);
 
 	if (ds_prev != NULL)
 		dsl_dataset_rele(ds_prev, FTAG);
 
 	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
 
 	if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
 		uint64_t count __maybe_unused;
 		ASSERT0(zap_count(mos,
 		    dsl_dataset_phys(ds)->ds_next_clones_obj, &count) &&
 		    count == 0);
 		VERIFY0(dmu_object_free(mos,
 		    dsl_dataset_phys(ds)->ds_next_clones_obj, tx));
 	}
 	if (dsl_dataset_phys(ds)->ds_props_obj != 0)
 		VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj,
 		    tx));
 	if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0)
 		VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
 		    tx));
 	dsl_dir_rele(ds->ds_dir, ds);
 	ds->ds_dir = NULL;
 	dmu_object_free_zapified(mos, obj, tx);
 }
 
 void
 dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_destroy_snapshot_arg_t *ddsa = arg;
 	const char *dsname = ddsa->ddsa_name;
 	boolean_t defer = ddsa->ddsa_defer;
 
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 
 	int error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
 	if (error == ENOENT)
 		return;
 	ASSERT0(error);
 	dsl_destroy_snapshot_sync_impl(ds, defer, tx);
 	zvol_remove_minors(dp->dp_spa, dsname, B_TRUE);
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /*
  * The semantics of this function are described in the comment above
  * lzc_destroy_snaps().  To summarize:
  *
  * The snapshots must all be in the same pool.
  *
  * Snapshots that don't exist will be silently ignored (considered to be
  * "already deleted").
  *
  * On success, all snaps will be destroyed and this will return 0.
  * On failure, no snaps will be destroyed, the errlist will be filled in,
  * and this will return an errno.
  */
 int
 dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer,
     nvlist_t *errlist)
 {
 	if (nvlist_next_nvpair(snaps, NULL) == NULL)
 		return (0);
 
 	/*
 	 * lzc_destroy_snaps() is documented to take an nvlist whose
 	 * values "don't matter".  We need to convert that nvlist to
 	 * one that we know can be converted to LUA.
 	 */
 	nvlist_t *snaps_normalized = fnvlist_alloc();
 	for (nvpair_t *pair = nvlist_next_nvpair(snaps, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) {
 		fnvlist_add_boolean_value(snaps_normalized,
 		    nvpair_name(pair), B_TRUE);
 	}
 
 	nvlist_t *arg = fnvlist_alloc();
 	fnvlist_add_nvlist(arg, "snaps", snaps_normalized);
 	fnvlist_free(snaps_normalized);
 	fnvlist_add_boolean_value(arg, "defer", defer);
 
 	nvlist_t *wrapper = fnvlist_alloc();
 	fnvlist_add_nvlist(wrapper, ZCP_ARG_ARGLIST, arg);
 	fnvlist_free(arg);
 
 	const char *program =
 	    "arg = ...\n"
 	    "snaps = arg['snaps']\n"
 	    "defer = arg['defer']\n"
 	    "errors = { }\n"
 	    "has_errors = false\n"
 	    "for snap, v in pairs(snaps) do\n"
 	    "    errno = zfs.check.destroy{snap, defer=defer}\n"
 	    "    zfs.debug('snap: ' .. snap .. ' errno: ' .. errno)\n"
 	    "    if errno == ENOENT then\n"
 	    "        snaps[snap] = nil\n"
 	    "    elseif errno ~= 0 then\n"
 	    "        errors[snap] = errno\n"
 	    "        has_errors = true\n"
 	    "    end\n"
 	    "end\n"
 	    "if has_errors then\n"
 	    "    return errors\n"
 	    "end\n"
 	    "for snap, v in pairs(snaps) do\n"
 	    "    errno = zfs.sync.destroy{snap, defer=defer}\n"
 	    "    assert(errno == 0)\n"
 	    "end\n"
 	    "return { }\n";
 
 	nvlist_t *result = fnvlist_alloc();
 	int error = zcp_eval(nvpair_name(nvlist_next_nvpair(snaps, NULL)),
 	    program,
 	    B_TRUE,
 	    0,
 	    zfs_lua_max_memlimit,
 	    fnvlist_lookup_nvpair(wrapper, ZCP_ARG_ARGLIST), result);
 	if (error != 0) {
 		const char *errorstr = NULL;
 		(void) nvlist_lookup_string(result, ZCP_RET_ERROR, &errorstr);
 		if (errorstr != NULL) {
 			zfs_dbgmsg("%s", errorstr);
 		}
 		fnvlist_free(wrapper);
 		fnvlist_free(result);
 		return (error);
 	}
 	fnvlist_free(wrapper);
 
 	/*
 	 * lzc_destroy_snaps() is documented to fill the errlist with
 	 * int32 values, so we need to convert the int64 values that are
 	 * returned from LUA.
 	 */
 	int rv = 0;
 	nvlist_t *errlist_raw = fnvlist_lookup_nvlist(result, ZCP_RET_RETURN);
 	for (nvpair_t *pair = nvlist_next_nvpair(errlist_raw, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(errlist_raw, pair)) {
 		int32_t val = (int32_t)fnvpair_value_int64(pair);
 		if (rv == 0)
 			rv = val;
 		fnvlist_add_int32(errlist, nvpair_name(pair), val);
 	}
 	fnvlist_free(result);
 	return (rv);
 }
 
 int
 dsl_destroy_snapshot(const char *name, boolean_t defer)
 {
 	int error;
 	nvlist_t *nvl = fnvlist_alloc();
 	nvlist_t *errlist = fnvlist_alloc();
 
 	fnvlist_add_boolean(nvl, name);
 	error = dsl_destroy_snapshots_nvl(nvl, defer, errlist);
 	fnvlist_free(errlist);
 	fnvlist_free(nvl);
 	return (error);
 }
 
 struct killarg {
 	dsl_dataset_t *ds;
 	dmu_tx_t *tx;
 };
 
 static int
 kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	(void) spa, (void) dnp;
 	struct killarg *ka = arg;
 	dmu_tx_t *tx = ka->tx;
 
 	if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
 	    BP_IS_EMBEDDED(bp))
 		return (0);
 
 	if (zb->zb_level == ZB_ZIL_LEVEL) {
 		ASSERT(zilog != NULL);
 		/*
 		 * It's a block in the intent log.  It has no
 		 * accounting, so just free it.
 		 */
 		dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
 	} else {
 		ASSERT(zilog == NULL);
-		ASSERT3U(bp->blk_birth, >,
+		ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >,
 		    dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
 		(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
 	}
 
 	return (0);
 }
 
 static void
 old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	struct killarg ka;
 
 	spa_history_log_internal_ds(ds, "destroy", tx,
 	    "(synchronous, mintxg=%llu)",
 	    (long long)dsl_dataset_phys(ds)->ds_prev_snap_txg);
 
 	/*
 	 * Free everything that we point to (that's born after
 	 * the previous snapshot, if we are a clone)
 	 *
 	 * NB: this should be very quick, because we already
 	 * freed all the objects in open context.
 	 */
 	ka.ds = ds;
 	ka.tx = tx;
 	VERIFY0(traverse_dataset(ds,
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST |
 	    TRAVERSE_NO_DECRYPT, kill_blkptr, &ka));
 	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
 	    dsl_dataset_phys(ds)->ds_unique_bytes == 0);
 }
 
 int
 dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
 {
 	int error;
 	uint64_t count;
 	objset_t *mos;
 
 	ASSERT(!ds->ds_is_snapshot);
 	if (ds->ds_is_snapshot)
 		return (SET_ERROR(EINVAL));
 
 	if (zfs_refcount_count(&ds->ds_longholds) != expected_holds)
 		return (SET_ERROR(EBUSY));
 
 	ASSERT0(ds->ds_dir->dd_activity_waiters);
 
 	mos = ds->ds_dir->dd_pool->dp_meta_objset;
 
 	/*
 	 * Can't delete a head dataset if there are snapshots of it.
 	 * (Except if the only snapshots are from the branch we cloned
 	 * from.)
 	 */
 	if (ds->ds_prev != NULL &&
 	    dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object)
 		return (SET_ERROR(EBUSY));
 
 	/*
 	 * Can't delete if there are children of this fs.
 	 */
 	error = zap_count(mos,
 	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count);
 	if (error != 0)
 		return (error);
 	if (count != 0)
 		return (SET_ERROR(EEXIST));
 
 	if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) &&
 	    dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
 	    ds->ds_prev->ds_userrefs == 0) {
 		/* We need to remove the origin snapshot as well. */
 		if (!zfs_refcount_is_zero(&ds->ds_prev->ds_longholds))
 			return (SET_ERROR(EBUSY));
 	}
 	return (0);
 }
 
 int
 dsl_destroy_head_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_destroy_head_arg_t *ddha = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	int error;
 
 	error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	error = dsl_destroy_head_check_impl(ds, 0);
 	dsl_dataset_rele(ds, FTAG);
 	return (error);
 }
 
 static void
 dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	objset_t *mos = dp->dp_meta_objset;
 	dd_used_t t;
 
 	ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock));
 
 	VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));
 
 	ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj);
 
 	/* Decrement the filesystem count for all parent filesystems. */
 	if (dd->dd_parent != NULL)
 		dsl_fs_ss_count_adjust(dd->dd_parent, -1,
 		    DD_FIELD_FILESYSTEM_COUNT, tx);
 
 	/*
 	 * Remove our reservation. The impl() routine avoids setting the
 	 * actual property, which would require the (already destroyed) ds.
 	 */
 	dsl_dir_set_reservation_sync_impl(dd, 0, tx);
 
 	ASSERT0(dsl_dir_phys(dd)->dd_used_bytes);
 	ASSERT0(dsl_dir_phys(dd)->dd_reserved);
 	for (t = 0; t < DD_USED_NUM; t++)
 		ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]);
 
 	if (dd->dd_crypto_obj != 0) {
 		dsl_crypto_key_destroy_sync(dd->dd_crypto_obj, tx);
 		(void) spa_keystore_unload_wkey_impl(dp->dp_spa, dd->dd_object);
 	}
 
 	VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx));
 	VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx));
 	if (dsl_dir_phys(dd)->dd_clones != 0)
 		VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_clones, tx));
 	VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx));
 	VERIFY0(zap_remove(mos,
 	    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
 	    dd->dd_myname, tx));
 
 	dsl_dir_rele(dd, FTAG);
 	dmu_object_free_zapified(mos, ddobj, tx);
 }
 
 static void
 dsl_clone_destroy_assert(dsl_dir_t *dd)
 {
 	uint64_t used, comp, uncomp;
 
 	ASSERT(dsl_dir_is_clone(dd));
 	dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp);
 
 	ASSERT3U(dsl_dir_phys(dd)->dd_used_bytes, ==, used);
 	ASSERT3U(dsl_dir_phys(dd)->dd_compressed_bytes, ==, comp);
 	/*
 	 * Greater than because we do not track embedded block pointers in
 	 * the livelist
 	 */
 	ASSERT3U(dsl_dir_phys(dd)->dd_uncompressed_bytes, >=, uncomp);
 
 	ASSERT(list_is_empty(&dd->dd_pending_allocs.bpl_list));
 	ASSERT(list_is_empty(&dd->dd_pending_frees.bpl_list));
 }
 
 /*
  * Start the delete process for a clone. Free its zil, verify the space usage
  * and queue the blkptrs for deletion by adding the livelist to the pool-wide
  * delete queue.
  */
 static void
 dsl_async_clone_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	uint64_t zap_obj, to_delete, used, comp, uncomp;
 	objset_t *os;
 	dsl_dir_t *dd = ds->ds_dir;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	objset_t *mos = dp->dp_meta_objset;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	VERIFY0(dmu_objset_from_ds(ds, &os));
 
 	uint64_t mintxg = 0;
 	dsl_deadlist_entry_t *dle = dsl_deadlist_first(&dd->dd_livelist);
 	if (dle != NULL)
 		mintxg = dle->dle_mintxg;
 
 	spa_history_log_internal_ds(ds, "destroy", tx,
 	    "(livelist, mintxg=%llu)", (long long)mintxg);
 
 	/* Check that the clone is in a correct state to be deleted */
 	dsl_clone_destroy_assert(dd);
 
 	/* Destroy the zil */
 	zil_destroy_sync(dmu_objset_zil(os), tx);
 
 	VERIFY0(zap_lookup(mos, dd->dd_object,
 	    DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &to_delete));
 	/* Initialize deleted_clones entry to track livelists to cleanup */
 	int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
 	if (error == ENOENT) {
 		zap_obj = zap_create(mos, DMU_OTN_ZAP_METADATA,
 		    DMU_OT_NONE, 0, tx);
 		VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1,
 		    &(zap_obj), tx));
 		spa->spa_livelists_to_delete = zap_obj;
 	} else if (error != 0) {
 		zfs_panic_recover("zfs: error %d was returned while looking "
 		    "up DMU_POOL_DELETED_CLONES in the zap", error);
 		return;
 	}
 	VERIFY0(zap_add_int(mos, zap_obj, to_delete, tx));
 
 	/* Clone is no longer using space, now tracked by dp_free_dir */
 	dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp);
 	dsl_dir_diduse_space(dd, DD_USED_HEAD,
 	    -used, -comp, -dsl_dir_phys(dd)->dd_uncompressed_bytes,
 	    tx);
 	dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
 	    used, comp, uncomp, tx);
 	dsl_dir_remove_livelist(dd, tx, B_FALSE);
 	zthr_wakeup(spa->spa_livelist_delete_zthr);
 }
 
 /*
  * Move the bptree into the pool's list of trees to clean up, update space
  * accounting information and destroy the zil.
  */
 static void
 dsl_async_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	uint64_t used, comp, uncomp;
 	objset_t *os;
 
 	VERIFY0(dmu_objset_from_ds(ds, &os));
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	objset_t *mos = dp->dp_meta_objset;
 
 	spa_history_log_internal_ds(ds, "destroy", tx,
 	    "(bptree, mintxg=%llu)",
 	    (long long)dsl_dataset_phys(ds)->ds_prev_snap_txg);
 
 	zil_destroy_sync(dmu_objset_zil(os), tx);
 
 	if (!spa_feature_is_active(dp->dp_spa,
 	    SPA_FEATURE_ASYNC_DESTROY)) {
 		dsl_scan_t *scn = dp->dp_scan;
 		spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
 		    tx);
 		dp->dp_bptree_obj = bptree_alloc(mos, tx);
 		VERIFY0(zap_add(mos,
 		    DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
 		    &dp->dp_bptree_obj, tx));
 		ASSERT(!scn->scn_async_destroying);
 		scn->scn_async_destroying = B_TRUE;
 	}
 
 	used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
 	comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
 	uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;
 
 	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
 	    dsl_dataset_phys(ds)->ds_unique_bytes == used);
 
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 	bptree_add(mos, dp->dp_bptree_obj,
 	    &dsl_dataset_phys(ds)->ds_bp,
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg,
 	    used, comp, uncomp, tx);
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
 	    -used, -comp, -uncomp, tx);
 	dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
 	    used, comp, uncomp, tx);
 }
 
 void
 dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t obj, ddobj, prevobj = 0;
 	boolean_t rmorigin;
 
 	ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
 	ASSERT(ds->ds_prev == NULL ||
 	    dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
-	ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
+	ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=,
+	    tx->tx_txg);
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 
 	dsl_dir_cancel_waiters(ds->ds_dir);
 
 	rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
 	    DS_IS_DEFER_DESTROY(ds->ds_prev) &&
 	    dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
 	    ds->ds_prev->ds_userrefs == 0);
 
 	/* Remove our reservation. */
 	if (ds->ds_reserved != 0) {
 		dsl_dataset_set_refreservation_sync_impl(ds,
 		    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
 		    0, tx);
 		ASSERT0(ds->ds_reserved);
 	}
 
 	obj = ds->ds_object;
 
 	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
 		if (dsl_dataset_feature_is_active(ds, f))
 			dsl_dataset_deactivate_feature(ds, f, tx);
 	}
 
 	dsl_scan_ds_destroyed(ds, tx);
 
 	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 		/* This is a clone */
 		ASSERT(ds->ds_prev != NULL);
 		ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=,
 		    obj);
 		ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj);
 
 		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 		if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) {
 			dsl_dataset_remove_from_next_clones(ds->ds_prev,
 			    obj, tx);
 		}
 
 		ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1);
 		dsl_dataset_phys(ds->ds_prev)->ds_num_children--;
 	}
 
 	/*
 	 * Destroy the deadlist. Unless it's a clone, the
 	 * deadlist should be empty since the dataset has no snapshots.
 	 * (If it's a clone, it's safe to ignore the deadlist contents
 	 * since they are still referenced by the origin snapshot.)
 	 */
 	dsl_deadlist_close(&ds->ds_deadlist);
 	dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
 
 	if (dsl_dataset_remap_deadlist_exists(ds))
 		dsl_dataset_destroy_remap_deadlist(ds, tx);
 
 	/*
 	 * Each destroy is responsible for both destroying (enqueuing
 	 * to be destroyed) the blkptrs comprising the dataset as well as
 	 * those belonging to the zil.
 	 */
 	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) {
 		dsl_async_clone_destroy(ds, tx);
 	} else if (spa_feature_is_enabled(dp->dp_spa,
 	    SPA_FEATURE_ASYNC_DESTROY)) {
 		dsl_async_dataset_destroy(ds, tx);
 	} else {
 		old_synchronous_dataset_destroy(ds, tx);
 	}
 
 	if (ds->ds_prev != NULL) {
 		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
 			VERIFY0(zap_remove_int(mos,
 			    dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones,
 			    ds->ds_object, tx));
 		}
 		prevobj = ds->ds_prev->ds_object;
 		dsl_dataset_rele(ds->ds_prev, ds);
 		ds->ds_prev = NULL;
 	}
 
 	/*
 	 * This must be done after the dsl_traverse(), because it will
 	 * re-open the objset.
 	 */
 	if (ds->ds_objset) {
 		dmu_objset_evict(ds->ds_objset);
 		ds->ds_objset = NULL;
 	}
 
 	/* Erase the link in the dir */
 	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
 	dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0;
 	ddobj = ds->ds_dir->dd_object;
 	ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0);
 	VERIFY0(zap_destroy(mos,
 	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx));
 
 	if (ds->ds_bookmarks_obj != 0) {
 		void *cookie = NULL;
 		dsl_bookmark_node_t *dbn;
 
 		while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) !=
 		    NULL) {
 			if (dbn->dbn_phys.zbm_redaction_obj != 0) {
 				dnode_t *rl;
 				VERIFY0(dnode_hold(mos,
 				    dbn->dbn_phys.zbm_redaction_obj, FTAG,
 				    &rl));
 				if (rl->dn_have_spill) {
 					spa_feature_decr(dmu_objset_spa(mos),
 					    SPA_FEATURE_REDACTION_LIST_SPILL,
 					    tx);
 				}
 				dnode_rele(rl, FTAG);
 				VERIFY0(dmu_object_free(mos,
 				    dbn->dbn_phys.zbm_redaction_obj, tx));
 				spa_feature_decr(dmu_objset_spa(mos),
 				    SPA_FEATURE_REDACTION_BOOKMARKS, tx);
 			}
 			if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) {
 				spa_feature_decr(dmu_objset_spa(mos),
 				    SPA_FEATURE_BOOKMARK_WRITTEN, tx);
 			}
 			spa_strfree(dbn->dbn_name);
 			mutex_destroy(&dbn->dbn_lock);
 			kmem_free(dbn, sizeof (*dbn));
 		}
 		avl_destroy(&ds->ds_bookmarks);
 		VERIFY0(zap_destroy(mos, ds->ds_bookmarks_obj, tx));
 		spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
 	}
 
 	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
 
 	ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj);
 	ASSERT0(dsl_dataset_phys(ds)->ds_props_obj);
 	ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj);
 	dsl_dir_rele(ds->ds_dir, ds);
 	ds->ds_dir = NULL;
 	dmu_object_free_zapified(mos, obj, tx);
 
 	dsl_dir_destroy_sync(ddobj, tx);
 
 	if (rmorigin) {
 		dsl_dataset_t *prev;
 		VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev));
 		dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx);
 		dsl_dataset_rele(prev, FTAG);
 	}
 	/* Delete errlog. */
 	if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG))
 		spa_delete_dataset_errlog(dp->dp_spa, ds->ds_object, tx);
 }
 
 void
 dsl_destroy_head_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_destroy_head_arg_t *ddha = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 
 	VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
 	dsl_destroy_head_sync_impl(ds, tx);
 	zvol_remove_minors(dp->dp_spa, ddha->ddha_name, B_TRUE);
 	dsl_dataset_rele(ds, FTAG);
 }
 
 static void
 dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_destroy_head_arg_t *ddha = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 
 	VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
 
 	/* Mark it as inconsistent on-disk, in case we crash */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
 
 	spa_history_log_internal_ds(ds, "destroy begin", tx, " ");
 	dsl_dataset_rele(ds, FTAG);
 }
 
 int
 dsl_destroy_head(const char *name)
 {
 	dsl_destroy_head_arg_t ddha;
 	int error;
 	spa_t *spa;
 	boolean_t isenabled;
 
 #ifdef _KERNEL
 	zfs_destroy_unmount_origin(name);
 #endif
 
 	error = spa_open(name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 	isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY);
 	spa_close(spa, FTAG);
 
 	ddha.ddha_name = name;
 
 	if (!isenabled) {
 		objset_t *os;
 
 		error = dsl_sync_task(name, dsl_destroy_head_check,
 		    dsl_destroy_head_begin_sync, &ddha,
 		    0, ZFS_SPACE_CHECK_DESTROY);
 		if (error != 0)
 			return (error);
 
 		/*
 		 * Head deletion is processed in one txg on old pools;
 		 * remove the objects from open context so that the txg sync
 		 * is not too long. This optimization can only work for
 		 * encrypted datasets if the wrapping key is loaded.
 		 */
 		error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, B_TRUE,
 		    FTAG, &os);
 		if (error == 0) {
 			uint64_t prev_snap_txg =
 			    dsl_dataset_phys(dmu_objset_ds(os))->
 			    ds_prev_snap_txg;
 			for (uint64_t obj = 0; error == 0;
 			    error = dmu_object_next(os, &obj, FALSE,
 			    prev_snap_txg))
 				(void) dmu_free_long_object(os, obj);
 			/* sync out all frees */
 			txg_wait_synced(dmu_objset_pool(os), 0);
 			dmu_objset_disown(os, B_TRUE, FTAG);
 		}
 	}
 
 	return (dsl_sync_task(name, dsl_destroy_head_check,
 	    dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_DESTROY));
 }
 
 /*
  * Note, this function is used as the callback for dmu_objset_find().  We
  * always return 0 so that we will continue to find and process
  * inconsistent datasets, even if we encounter an error trying to
  * process one of them.
  */
 int
 dsl_destroy_inconsistent(const char *dsname, void *arg)
 {
 	(void) arg;
 	objset_t *os;
 
 	if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
 		boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os));
 
 		/*
 		 * If the dataset is inconsistent because a resumable receive
 		 * has failed, then do not destroy it.
 		 */
 		if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os)))
 			need_destroy = B_FALSE;
 
 		dmu_objset_rele(os, FTAG);
 		if (need_destroy)
 			(void) dsl_destroy_head(dsname);
 	}
 	return (0);
 }
 
 
 #if defined(_KERNEL)
 EXPORT_SYMBOL(dsl_destroy_head);
 EXPORT_SYMBOL(dsl_destroy_head_sync_impl);
 EXPORT_SYMBOL(dsl_dataset_user_hold_check_one);
 EXPORT_SYMBOL(dsl_destroy_snapshot_sync_impl);
 EXPORT_SYMBOL(dsl_destroy_inconsistent);
 EXPORT_SYMBOL(dsl_dataset_user_release_tmp);
 EXPORT_SYMBOL(dsl_destroy_head_check_impl);
 #endif
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 370c6a010dca..342ec5c15c79 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -1,1485 +1,1485 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  */
 
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_scan.h>
 #include <sys/dnode.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
 #include <sys/zap.h>
 #include <sys/zio.h>
 #include <sys/zfs_context.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab_impl.h>
 #include <sys/bptree.h>
 #include <sys/zfeature.h>
 #include <sys/zil_impl.h>
 #include <sys/dsl_userhold.h>
 #include <sys/trace_zfs.h>
 #include <sys/mmp.h>
 
 /*
  * ZFS Write Throttle
  * ------------------
  *
  * ZFS must limit the rate of incoming writes to the rate at which it is able
  * to sync data modifications to the backend storage. Throttling by too much
  * creates an artificial limit; throttling by too little can only be sustained
  * for short periods and would lead to highly lumpy performance. On a per-pool
  * basis, ZFS tracks the amount of modified (dirty) data. As operations change
  * data, the amount of dirty data increases; as ZFS syncs out data, the amount
  * of dirty data decreases. When the amount of dirty data exceeds a
  * predetermined threshold further modifications are blocked until the amount
  * of dirty data decreases (as data is synced out).
  *
  * The limit on dirty data is tunable, and should be adjusted according to
  * both the IO capacity and available memory of the system. The larger the
  * window, the more ZFS is able to aggregate and amortize metadata (and data)
  * changes. However, memory is a limited resource, and allowing for more dirty
  * data comes at the cost of keeping other useful data in memory (for example
  * ZFS data cached by the ARC).
  *
  * Implementation
  *
  * As buffers are modified dsl_pool_willuse_space() increments both the per-
  * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
  * dirty space used; dsl_pool_dirty_space() decrements those values as data
  * is synced out from dsl_pool_sync(). While only the poolwide value is
  * relevant, the per-txg value is useful for debugging. The tunable
  * zfs_dirty_data_max determines the dirty space limit. Once that value is
  * exceeded, new writes are halted until space frees up.
  *
  * The zfs_dirty_data_sync_percent tunable dictates the threshold at which we
  * ensure that there is a txg syncing (see the comment in txg.c for a full
  * description of transaction group stages).
  *
  * The IO scheduler uses both the dirty space limit and current amount of
  * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
  * issues. See the comment in vdev_queue.c for details of the IO scheduler.
  *
  * The delay is also calculated based on the amount of dirty data.  See the
  * comment above dmu_tx_delay() for details.
  */
 
 /*
  * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
  * capped at zfs_dirty_data_max_max.  It can also be overridden with a module
  * parameter.
  */
 uint64_t zfs_dirty_data_max = 0;
 uint64_t zfs_dirty_data_max_max = 0;
 uint_t zfs_dirty_data_max_percent = 10;
 uint_t zfs_dirty_data_max_max_percent = 25;
 
 /*
  * The upper limit of TX_WRITE log data.  Write operations are throttled
  * when approaching the limit until log data is cleared out after txg sync.
  * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY.
  */
 uint64_t zfs_wrlog_data_max = 0;
 
 /*
  * If there's at least this much dirty data (as a percentage of
  * zfs_dirty_data_max), push out a txg.  This should be less than
  * zfs_vdev_async_write_active_min_dirty_percent.
  */
 static uint_t zfs_dirty_data_sync_percent = 20;
 
 /*
  * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
  * and delay each transaction.
  * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
  */
 uint_t zfs_delay_min_dirty_percent = 60;
 
 /*
  * This controls how quickly the delay approaches infinity.
  * Larger values cause it to delay more for a given amount of dirty data.
  * Therefore larger values will cause there to be less dirty data for a
  * given throughput.
  *
  * For the smoothest delay, this value should be about 1 billion divided
  * by the maximum number of operations per second.  This will smoothly
  * handle between 10x and 1/10th this number.
  *
  * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
  * multiply in dmu_tx_delay().
  */
 uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
 
 /*
  * These tunables determine the behavior of how zil_itxg_clean() is
  * called via zil_clean() in the context of spa_sync(). When an itxg
  * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching.
  * If the dispatch fails, the call to zil_itxg_clean() will occur
  * synchronously in the context of spa_sync(), which can negatively
  * impact the performance of spa_sync() (e.g. in the case of the itxg
  * list having a large number of itxs that needs to be cleaned).
  *
  * Thus, these tunables can be used to manipulate the behavior of the
  * taskq used by zil_clean(); they determine the number of taskq entries
  * that are pre-populated when the taskq is first created (via the
  * "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of
  * taskq entries that are cached after an on-demand allocation (via the
  * "zfs_zil_clean_taskq_maxalloc").
  *
  * The idea being, we want to try reasonably hard to ensure there will
  * already be a taskq entry pre-allocated by the time that it is needed
  * by zil_clean(). This way, we can avoid the possibility of an
  * on-demand allocation of a new taskq entry from failing, which would
  * result in zil_itxg_clean() being called synchronously from zil_clean()
  * (which can adversely affect performance of spa_sync()).
  *
  * Additionally, the number of threads used by the taskq can be
  * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable.
  */
 static int zfs_zil_clean_taskq_nthr_pct = 100;
 static int zfs_zil_clean_taskq_minalloc = 1024;
 static int zfs_zil_clean_taskq_maxalloc = 1024 * 1024;
 
 int
 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
 {
 	uint64_t obj;
 	int err;
 
 	err = zap_lookup(dp->dp_meta_objset,
 	    dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
 	    name, sizeof (obj), 1, &obj);
 	if (err)
 		return (err);
 
 	return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
 }
 
 static dsl_pool_t *
 dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 {
 	dsl_pool_t *dp;
 	blkptr_t *bp = spa_get_rootblkptr(spa);
 
 	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
 	dp->dp_spa = spa;
 	dp->dp_meta_rootbp = *bp;
 	rrw_init(&dp->dp_config_rwlock, B_TRUE);
 	txg_init(dp, txg);
 	mmp_init(spa);
 
 	txg_list_create(&dp->dp_dirty_datasets, spa,
 	    offsetof(dsl_dataset_t, ds_dirty_link));
 	txg_list_create(&dp->dp_dirty_zilogs, spa,
 	    offsetof(zilog_t, zl_dirty_link));
 	txg_list_create(&dp->dp_dirty_dirs, spa,
 	    offsetof(dsl_dir_t, dd_dirty_link));
 	txg_list_create(&dp->dp_sync_tasks, spa,
 	    offsetof(dsl_sync_task_t, dst_node));
 	txg_list_create(&dp->dp_early_sync_tasks, spa,
 	    offsetof(dsl_sync_task_t, dst_node));
 
 	dp->dp_sync_taskq = spa_sync_tq_create(spa, "dp_sync_taskq");
 
 	dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
 	    zfs_zil_clean_taskq_nthr_pct, minclsyspri,
 	    zfs_zil_clean_taskq_minalloc,
 	    zfs_zil_clean_taskq_maxalloc,
 	    TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
 
 	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
 
 	aggsum_init(&dp->dp_wrlog_total, 0);
 	for (int i = 0; i < TXG_SIZE; i++) {
 		aggsum_init(&dp->dp_wrlog_pertxg[i], 0);
 	}
 
 	dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri,
 	    boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
 	    TASKQ_THREADS_CPU_PCT);
 	dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain",
 	    100, defclsyspri, boot_ncpus, INT_MAX,
 	    TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
 
 	return (dp);
 }
 
 int
 dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 {
 	int err;
 	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 
 	/*
 	 * Initialize the caller's dsl_pool_t structure before we actually open
 	 * the meta objset.  This is done because a self-healing write zio may
 	 * be issued as part of dmu_objset_open_impl() and the spa needs its
 	 * dsl_pool_t initialized in order to handle the write.
 	 */
 	*dpp = dp;
 
 	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
 	    &dp->dp_meta_objset);
 	if (err != 0) {
 		dsl_pool_close(dp);
 		*dpp = NULL;
 	}
 
 	return (err);
 }
 
 int
 dsl_pool_open(dsl_pool_t *dp)
 {
 	int err;
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	uint64_t obj;
 
 	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
 	    &dp->dp_root_dir_obj);
 	if (err)
 		goto out;
 
 	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 	    NULL, dp, &dp->dp_root_dir);
 	if (err)
 		goto out;
 
 	err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
 	if (err)
 		goto out;
 
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
 		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
 		if (err)
 			goto out;
 		err = dsl_dataset_hold_obj(dp,
 		    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
 		if (err == 0) {
 			err = dsl_dataset_hold_obj(dp,
 			    dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
 			    &dp->dp_origin_snap);
 			dsl_dataset_rele(ds, FTAG);
 		}
 		dsl_dir_rele(dd, dp);
 		if (err)
 			goto out;
 	}
 
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 		err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
 		    &dp->dp_free_dir);
 		if (err)
 			goto out;
 
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
 		if (err)
 			goto out;
 		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 		    dp->dp_meta_objset, obj));
 	}
 
 	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj);
 		if (err == 0) {
 			VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj,
 			    dp->dp_meta_objset, obj));
 		} else if (err == ENOENT) {
 			/*
 			 * We might not have created the remap bpobj yet.
 			 */
 		} else {
 			goto out;
 		}
 	}
 
 	/*
 	 * Note: errors ignored, because the these special dirs, used for
 	 * space accounting, are only created on demand.
 	 */
 	(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
 	    &dp->dp_leak_dir);
 
 	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
 		    &dp->dp_bptree_obj);
 		if (err != 0)
 			goto out;
 	}
 
 	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
 		    &dp->dp_empty_bpobj);
 		if (err != 0)
 			goto out;
 	}
 
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
 	    &dp->dp_tmp_userrefs_obj);
 	if (err == ENOENT)
 		err = 0;
 	if (err)
 		goto out;
 
 	err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
 
 out:
 	rrw_exit(&dp->dp_config_rwlock, FTAG);
 	return (err);
 }
 
 void
 dsl_pool_close(dsl_pool_t *dp)
 {
 	/*
 	 * Drop our references from dsl_pool_open().
 	 *
 	 * Since we held the origin_snap from "syncing" context (which
 	 * includes pool-opening context), it actually only got a "ref"
 	 * and not a hold, so just drop that here.
 	 */
 	if (dp->dp_origin_snap != NULL)
 		dsl_dataset_rele(dp->dp_origin_snap, dp);
 	if (dp->dp_mos_dir != NULL)
 		dsl_dir_rele(dp->dp_mos_dir, dp);
 	if (dp->dp_free_dir != NULL)
 		dsl_dir_rele(dp->dp_free_dir, dp);
 	if (dp->dp_leak_dir != NULL)
 		dsl_dir_rele(dp->dp_leak_dir, dp);
 	if (dp->dp_root_dir != NULL)
 		dsl_dir_rele(dp->dp_root_dir, dp);
 
 	bpobj_close(&dp->dp_free_bpobj);
 	bpobj_close(&dp->dp_obsolete_bpobj);
 
 	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
 	if (dp->dp_meta_objset != NULL)
 		dmu_objset_evict(dp->dp_meta_objset);
 
 	txg_list_destroy(&dp->dp_dirty_datasets);
 	txg_list_destroy(&dp->dp_dirty_zilogs);
 	txg_list_destroy(&dp->dp_sync_tasks);
 	txg_list_destroy(&dp->dp_early_sync_tasks);
 	txg_list_destroy(&dp->dp_dirty_dirs);
 
 	taskq_destroy(dp->dp_zil_clean_taskq);
 	spa_sync_tq_destroy(dp->dp_spa);
 
 	/*
 	 * We can't set retry to TRUE since we're explicitly specifying
 	 * a spa to flush. This is good enough; any missed buffers for
 	 * this spa won't cause trouble, and they'll eventually fall
 	 * out of the ARC just like any other unused buffer.
 	 */
 	arc_flush(dp->dp_spa, FALSE);
 
 	mmp_fini(dp->dp_spa);
 	txg_fini(dp);
 	dsl_scan_fini(dp);
 	dmu_buf_user_evict_wait();
 
 	rrw_destroy(&dp->dp_config_rwlock);
 	mutex_destroy(&dp->dp_lock);
 	cv_destroy(&dp->dp_spaceavail_cv);
 
 	ASSERT0(aggsum_value(&dp->dp_wrlog_total));
 	aggsum_fini(&dp->dp_wrlog_total);
 	for (int i = 0; i < TXG_SIZE; i++) {
 		ASSERT0(aggsum_value(&dp->dp_wrlog_pertxg[i]));
 		aggsum_fini(&dp->dp_wrlog_pertxg[i]);
 	}
 
 	taskq_destroy(dp->dp_unlinked_drain_taskq);
 	taskq_destroy(dp->dp_zrele_taskq);
 	if (dp->dp_blkstats != NULL)
 		vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
 	kmem_free(dp, sizeof (dsl_pool_t));
 }
 
 void
 dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	uint64_t obj;
 	/*
 	 * Currently, we only create the obsolete_bpobj where there are
 	 * indirect vdevs with referenced mappings.
 	 */
 	ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_DEVICE_REMOVAL));
 	/* create and open the obsolete_bpobj */
 	obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
 	VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj));
 	VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
 	spa_feature_incr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
 }
 
 void
 dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	spa_feature_decr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
 	VERIFY0(zap_remove(dp->dp_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_OBSOLETE_BPOBJ, tx));
 	bpobj_free(dp->dp_meta_objset,
 	    dp->dp_obsolete_bpobj.bpo_object, tx);
 	bpobj_close(&dp->dp_obsolete_bpobj);
 }
 
 dsl_pool_t *
 dsl_pool_create(spa_t *spa, nvlist_t *zplprops __attribute__((unused)),
     dsl_crypto_params_t *dcp, uint64_t txg)
 {
 	int err;
 	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
 #ifdef _KERNEL
 	objset_t *os;
 #else
 	objset_t *os __attribute__((unused));
 #endif
 	dsl_dataset_t *ds;
 	uint64_t obj;
 
 	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 
 	/* create and open the MOS (meta-objset) */
 	dp->dp_meta_objset = dmu_objset_create_impl(spa,
 	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
 	spa->spa_meta_objset = dp->dp_meta_objset;
 
 	/* create the pool directory */
 	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
 	ASSERT0(err);
 
 	/* Initialize scan structures */
 	VERIFY0(dsl_scan_init(dp, txg));
 
 	/* create and open the root dir */
 	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
 	VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 	    NULL, dp, &dp->dp_root_dir));
 
 	/* create and open the meta-objset dir */
 	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
 	VERIFY0(dsl_pool_open_special_dir(dp,
 	    MOS_DIR_NAME, &dp->dp_mos_dir));
 
 	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 		/* create and open the free dir */
 		(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
 		    FREE_DIR_NAME, tx);
 		VERIFY0(dsl_pool_open_special_dir(dp,
 		    FREE_DIR_NAME, &dp->dp_free_dir));
 
 		/* create and open the free_bplist */
 		obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
 		VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
 		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 		    dp->dp_meta_objset, obj));
 	}
 
 	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
 		dsl_pool_create_origin(dp, tx);
 
 	/*
 	 * Some features may be needed when creating the root dataset, so we
 	 * create the feature objects here.
 	 */
 	if (spa_version(spa) >= SPA_VERSION_FEATURES)
 		spa_feature_create_zap_objects(spa, tx);
 
 	if (dcp != NULL && dcp->cp_crypt != ZIO_CRYPT_OFF &&
 	    dcp->cp_crypt != ZIO_CRYPT_INHERIT)
 		spa_feature_enable(spa, SPA_FEATURE_ENCRYPTION, tx);
 
 	/* create the root dataset */
 	obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, dcp, 0, tx);
 
 	/* create the root objset */
 	VERIFY0(dsl_dataset_hold_obj_flags(dp, obj,
 	    DS_HOLD_FLAG_DECRYPT, FTAG, &ds));
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 	os = dmu_objset_create_impl(dp->dp_spa, ds,
 	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 #ifdef _KERNEL
 	zfs_create_fs(os, kcred, zplprops, tx);
 #endif
 	dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 
 	dmu_tx_commit(tx);
 
 	rrw_exit(&dp->dp_config_rwlock, FTAG);
 
 	return (dp);
 }
 
 /*
  * Account for the meta-objset space in its placeholder dsl_dir.
  */
 void
 dsl_pool_mos_diduse_space(dsl_pool_t *dp,
     int64_t used, int64_t comp, int64_t uncomp)
 {
 	ASSERT3U(comp, ==, uncomp); /* it's all metadata */
 	mutex_enter(&dp->dp_lock);
 	dp->dp_mos_used_delta += used;
 	dp->dp_mos_compressed_delta += comp;
 	dp->dp_mos_uncompressed_delta += uncomp;
 	mutex_exit(&dp->dp_lock);
 }
 
 static void
 dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	dmu_objset_sync(dp->dp_meta_objset, zio, tx);
 	VERIFY0(zio_wait(zio));
 	dmu_objset_sync_done(dp->dp_meta_objset, tx);
 	taskq_wait(dp->dp_sync_taskq);
 	multilist_destroy(&dp->dp_meta_objset->os_synced_dnodes);
 
 	dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
 	spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
 }
 
 static void
 dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
 {
 	ASSERT(MUTEX_HELD(&dp->dp_lock));
 
 	if (delta < 0)
 		ASSERT3U(-delta, <=, dp->dp_dirty_total);
 
 	dp->dp_dirty_total += delta;
 
 	/*
 	 * Note: we signal even when increasing dp_dirty_total.
 	 * This ensures forward progress -- each thread wakes the next waiter.
 	 */
 	if (dp->dp_dirty_total < zfs_dirty_data_max)
 		cv_signal(&dp->dp_spaceavail_cv);
 }
 
 void
 dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg)
 {
 	ASSERT3S(size, >=, 0);
 
 	aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], size);
 	aggsum_add(&dp->dp_wrlog_total, size);
 
 	/* Choose a value slightly bigger than min dirty sync bytes */
 	uint64_t sync_min =
 	    zfs_wrlog_data_max * (zfs_dirty_data_sync_percent + 10) / 200;
 	if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0)
 		txg_kick(dp, txg);
 }
 
 boolean_t
 dsl_pool_need_wrlog_delay(dsl_pool_t *dp)
 {
 	uint64_t delay_min_bytes =
 	    zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100;
 
 	return (aggsum_compare(&dp->dp_wrlog_total, delay_min_bytes) > 0);
 }
 
 static void
 dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg)
 {
 	int64_t delta;
 	delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]);
 	aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta);
 	aggsum_add(&dp->dp_wrlog_total, delta);
 	/* Compact per-CPU sums after the big change. */
 	(void) aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]);
 	(void) aggsum_value(&dp->dp_wrlog_total);
 }
 
 #ifdef ZFS_DEBUG
 static boolean_t
 dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
 {
 	spa_t *spa = dp->dp_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
 		txg_list_t *tl = &vd->vdev_ms_list;
 		metaslab_t *ms;
 
 		for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms;
 		    ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) {
 			VERIFY(range_tree_is_empty(ms->ms_freeing));
 			VERIFY(range_tree_is_empty(ms->ms_checkpointing));
 		}
 	}
 
 	return (B_TRUE);
 }
 #else
 #define	dsl_early_sync_task_verify(dp, txg) \
 	((void) sizeof (dp), (void) sizeof (txg), B_TRUE)
 #endif
 
 void
 dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 {
 	zio_t *rio;	/* root zio for all dirty dataset syncs */
 	dmu_tx_t *tx;
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	objset_t *mos = dp->dp_meta_objset;
 	list_t synced_datasets;
 
 	list_create(&synced_datasets, sizeof (dsl_dataset_t),
 	    offsetof(dsl_dataset_t, ds_synced_link));
 
 	tx = dmu_tx_create_assigned(dp, txg);
 
 	/*
 	 * Run all early sync tasks before writing out any dirty blocks.
 	 * For more info on early sync tasks see block comment in
 	 * dsl_early_sync_task().
 	 */
 	if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) {
 		dsl_sync_task_t *dst;
 
 		ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
 		while ((dst =
 		    txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) {
 			ASSERT(dsl_early_sync_task_verify(dp, txg));
 			dsl_sync_task_sync(dst, tx);
 		}
 		ASSERT(dsl_early_sync_task_verify(dp, txg));
 	}
 
 	/*
 	 * Write out all dirty blocks of dirty datasets. Note, this could
 	 * create a very large (+10k) zio tree.
 	 */
 	rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 		/*
 		 * We must not sync any non-MOS datasets twice, because
 		 * we may have taken a snapshot of them.  However, we
 		 * may sync newly-created datasets on pass 2.
 		 */
 		ASSERT(!list_link_active(&ds->ds_synced_link));
 		list_insert_tail(&synced_datasets, ds);
 		dsl_dataset_sync(ds, rio, tx);
 	}
 	VERIFY0(zio_wait(rio));
 
 	/*
 	 * Update the long range free counter after
 	 * we're done syncing user data
 	 */
 	mutex_enter(&dp->dp_lock);
 	ASSERT(spa_sync_pass(dp->dp_spa) == 1 ||
 	    dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0);
 	dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0;
 	mutex_exit(&dp->dp_lock);
 
 	/*
 	 * After the data blocks have been written (ensured by the zio_wait()
 	 * above), update the user/group/project space accounting.  This happens
 	 * in tasks dispatched to dp_sync_taskq, so wait for them before
 	 * continuing.
 	 */
 	for (ds = list_head(&synced_datasets); ds != NULL;
 	    ds = list_next(&synced_datasets, ds)) {
 		dmu_objset_sync_done(ds->ds_objset, tx);
 	}
 	taskq_wait(dp->dp_sync_taskq);
 
 	/*
 	 * Sync the datasets again to push out the changes due to
 	 * userspace updates.  This must be done before we process the
 	 * sync tasks, so that any snapshots will have the correct
 	 * user accounting information (and we won't get confused
 	 * about which blocks are part of the snapshot).
 	 */
 	rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 		objset_t *os = ds->ds_objset;
 
 		ASSERT(list_link_active(&ds->ds_synced_link));
 		dmu_buf_rele(ds->ds_dbuf, ds);
 		dsl_dataset_sync(ds, rio, tx);
 
 		/*
 		 * Release any key mappings created by calls to
 		 * dsl_dataset_dirty() from the userquota accounting
 		 * code paths.
 		 */
 		if (os->os_encrypted && !os->os_raw_receive &&
 		    !os->os_next_write_raw[txg & TXG_MASK]) {
 			ASSERT3P(ds->ds_key_mapping, !=, NULL);
 			key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds);
 		}
 	}
 	VERIFY0(zio_wait(rio));
 
 	/*
 	 * Now that the datasets have been completely synced, we can
 	 * clean up our in-memory structures accumulated while syncing:
 	 *
 	 *  - move dead blocks from the pending deadlist and livelists
 	 *    to the on-disk versions
 	 *  - release hold from dsl_dataset_dirty()
 	 *  - release key mapping hold from dsl_dataset_dirty()
 	 */
 	while ((ds = list_remove_head(&synced_datasets)) != NULL) {
 		objset_t *os = ds->ds_objset;
 
 		if (os->os_encrypted && !os->os_raw_receive &&
 		    !os->os_next_write_raw[txg & TXG_MASK]) {
 			ASSERT3P(ds->ds_key_mapping, !=, NULL);
 			key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds);
 		}
 
 		dsl_dataset_sync_done(ds, tx);
 		dmu_buf_rele(ds->ds_dbuf, ds);
 	}
 
 	while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
 		dsl_dir_sync(dd, tx);
 	}
 
 	/*
 	 * The MOS's space is accounted for in the pool/$MOS
 	 * (dp_mos_dir).  We can't modify the mos while we're syncing
 	 * it, so we remember the deltas and apply them here.
 	 */
 	if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
 	    dp->dp_mos_uncompressed_delta != 0) {
 		dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
 		    dp->dp_mos_used_delta,
 		    dp->dp_mos_compressed_delta,
 		    dp->dp_mos_uncompressed_delta, tx);
 		dp->dp_mos_used_delta = 0;
 		dp->dp_mos_compressed_delta = 0;
 		dp->dp_mos_uncompressed_delta = 0;
 	}
 
 	if (dmu_objset_is_dirty(mos, txg)) {
 		dsl_pool_sync_mos(dp, tx);
 	}
 
 	/*
 	 * We have written all of the accounted dirty data, so our
 	 * dp_space_towrite should now be zero. However, some seldom-used
 	 * code paths do not adhere to this (e.g. dbuf_undirty()). Shore up
 	 * the accounting of any dirtied space now.
 	 *
 	 * Note that, besides any dirty data from datasets, the amount of
 	 * dirty data in the MOS is also accounted by the pool. Therefore,
 	 * we want to do this cleanup after dsl_pool_sync_mos() so we don't
 	 * attempt to update the accounting for the same dirty data twice.
 	 * (i.e. at this point we only update the accounting for the space
 	 * that we know that we "leaked").
 	 */
 	dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
 
 	/*
 	 * If we modify a dataset in the same txg that we want to destroy it,
 	 * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
 	 * dsl_dir_destroy_check() will fail if there are unexpected holds.
 	 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
 	 * and clearing the hold on it) before we process the sync_tasks.
 	 * The MOS data dirtied by the sync_tasks will be synced on the next
 	 * pass.
 	 */
 	if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
 		dsl_sync_task_t *dst;
 		/*
 		 * No more sync tasks should have been added while we
 		 * were syncing.
 		 */
 		ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
 		while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
 			dsl_sync_task_sync(dst, tx);
 	}
 
 	dmu_tx_commit(tx);
 
 	DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
 }
 
 void
 dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 {
 	zilog_t *zilog;
 
 	while ((zilog = txg_list_head(&dp->dp_dirty_zilogs, txg))) {
 		dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 		/*
 		 * We don't remove the zilog from the dp_dirty_zilogs
 		 * list until after we've cleaned it. This ensures that
 		 * callers of zilog_is_dirty() receive an accurate
 		 * answer when they are racing with the spa sync thread.
 		 */
 		zil_clean(zilog, txg);
 		(void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg);
 		ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
 		dmu_buf_rele(ds->ds_dbuf, zilog);
 	}
 
 	dsl_pool_wrlog_clear(dp, txg);
 
 	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 }
 
 /*
  * TRUE if the current thread is the tx_sync_thread or if we
  * are being called from SPA context during pool initialization.
  */
 int
 dsl_pool_sync_context(dsl_pool_t *dp)
 {
 	return (curthread == dp->dp_tx.tx_sync_thread ||
 	    spa_is_initializing(dp->dp_spa) ||
 	    taskq_member(dp->dp_sync_taskq, curthread));
 }
 
 /*
  * This function returns the amount of allocatable space in the pool
  * minus whatever space is currently reserved by ZFS for specific
  * purposes. Specifically:
  *
  * 1] Any reserved SLOP space
  * 2] Any space used by the checkpoint
  * 3] Any space used for deferred frees
  *
  * The latter 2 are especially important because they are needed to
  * rectify the SPA's and DMU's different understanding of how much space
  * is used. Now the DMU is aware of that extra space tracked by the SPA
  * without having to maintain a separate special dir (e.g similar to
  * $MOS, $FREEING, and $LEAKED).
  *
  * Note: By deferred frees here, we mean the frees that were deferred
  * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the
  * segments placed in ms_defer trees during metaslab_sync_done().
  */
 uint64_t
 dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy)
 {
 	spa_t *spa = dp->dp_spa;
 	uint64_t space, resv, adjustedsize;
 	uint64_t spa_deferred_frees =
 	    spa->spa_deferred_bpobj.bpo_phys->bpo_bytes;
 
 	space = spa_get_dspace(spa)
 	    - spa_get_checkpoint_space(spa) - spa_deferred_frees;
 	resv = spa_get_slop_space(spa);
 
 	switch (slop_policy) {
 	case ZFS_SPACE_CHECK_NORMAL:
 		break;
 	case ZFS_SPACE_CHECK_RESERVED:
 		resv >>= 1;
 		break;
 	case ZFS_SPACE_CHECK_EXTRA_RESERVED:
 		resv >>= 2;
 		break;
 	case ZFS_SPACE_CHECK_NONE:
 		resv = 0;
 		break;
 	default:
 		panic("invalid slop policy value: %d", slop_policy);
 		break;
 	}
 	adjustedsize = (space >= resv) ? (space - resv) : 0;
 
 	return (adjustedsize);
 }
 
 uint64_t
 dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy)
 {
 	uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy);
 	uint64_t deferred =
 	    metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
 	uint64_t quota = (poolsize >= deferred) ? (poolsize - deferred) : 0;
 	return (quota);
 }
 
 uint64_t
 dsl_pool_deferred_space(dsl_pool_t *dp)
 {
 	return (metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)));
 }
 
 boolean_t
 dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 {
 	uint64_t delay_min_bytes =
 	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
 
 	/*
 	 * We are not taking the dp_lock here and few other places, since torn
 	 * reads are unlikely: on 64-bit systems due to register size and on
 	 * 32-bit due to memory constraints.  Pool-wide locks in hot path may
 	 * be too expensive, while we do not need a precise result here.
 	 */
 	return (dp->dp_dirty_total > delay_min_bytes);
 }
 
 static boolean_t
 dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg)
 {
 	uint64_t dirty_min_bytes =
 	    zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
 	uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK];
 
 	return (dirty > dirty_min_bytes);
 }
 
 void
 dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 {
 	if (space > 0) {
 		mutex_enter(&dp->dp_lock);
 		dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
 		dsl_pool_dirty_delta(dp, space);
 		boolean_t needsync = !dmu_tx_is_syncing(tx) &&
 		    dsl_pool_need_dirty_sync(dp, tx->tx_txg);
 		mutex_exit(&dp->dp_lock);
 
 		if (needsync)
 			txg_kick(dp, tx->tx_txg);
 	}
 }
 
 void
 dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
 {
 	ASSERT3S(space, >=, 0);
 	if (space == 0)
 		return;
 
 	mutex_enter(&dp->dp_lock);
 	if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
 		/* XXX writing something we didn't dirty? */
 		space = dp->dp_dirty_pertxg[txg & TXG_MASK];
 	}
 	ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
 	dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
 	ASSERT3U(dp->dp_dirty_total, >=, space);
 	dsl_pool_dirty_delta(dp, -space);
 	mutex_exit(&dp->dp_lock);
 }
 
 static int
 upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 {
 	dmu_tx_t *tx = arg;
 	dsl_dataset_t *ds, *prev = NULL;
 	int err;
 
 	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
 	if (err)
 		return (err);
 
 	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 		err = dsl_dataset_hold_obj(dp,
 		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
 		if (err) {
 			dsl_dataset_rele(ds, FTAG);
 			return (err);
 		}
 
 		if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
 			break;
 		dsl_dataset_rele(ds, FTAG);
 		ds = prev;
 		prev = NULL;
 	}
 
 	if (prev == NULL) {
 		prev = dp->dp_origin_snap;
 
 		/*
 		 * The $ORIGIN can't have any data, or the accounting
 		 * will be wrong.
 		 */
 		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
-		ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
+		ASSERT0(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(prev)->ds_bp));
 		rrw_exit(&ds->ds_bp_rwlock, FTAG);
 
 		/* The origin doesn't get attached to itself */
 		if (ds->ds_object == prev->ds_object) {
 			dsl_dataset_rele(ds, FTAG);
 			return (0);
 		}
 
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
 		dsl_dataset_phys(ds)->ds_prev_snap_txg =
 		    dsl_dataset_phys(prev)->ds_creation_txg;
 
 		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
 		dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;
 
 		dmu_buf_will_dirty(prev->ds_dbuf, tx);
 		dsl_dataset_phys(prev)->ds_num_children++;
 
 		if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
 			ASSERT(ds->ds_prev == NULL);
 			VERIFY0(dsl_dataset_hold_obj(dp,
 			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
 			    ds, &ds->ds_prev));
 		}
 	}
 
 	ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
 	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);
 
 	if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
 		dmu_buf_will_dirty(prev->ds_dbuf, tx);
 		dsl_dataset_phys(prev)->ds_next_clones_obj =
 		    zap_create(dp->dp_meta_objset,
 		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 	}
 	VERIFY0(zap_add_int(dp->dp_meta_objset,
 	    dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));
 
 	dsl_dataset_rele(ds, FTAG);
 	if (prev != dp->dp_origin_snap)
 		dsl_dataset_rele(prev, FTAG);
 	return (0);
 }
 
 void
 dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(dp->dp_origin_snap != NULL);
 
 	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
 	    tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
 }
 
 static int
 upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 {
 	dmu_tx_t *tx = arg;
 	objset_t *mos = dp->dp_meta_objset;
 
 	if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
 		dsl_dataset_t *origin;
 
 		VERIFY0(dsl_dataset_hold_obj(dp,
 		    dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));
 
 		if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
 			dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
 			dsl_dir_phys(origin->ds_dir)->dd_clones =
 			    zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
 			    0, tx);
 		}
 
 		VERIFY0(zap_add_int(dp->dp_meta_objset,
 		    dsl_dir_phys(origin->ds_dir)->dd_clones,
 		    ds->ds_object, tx));
 
 		dsl_dataset_rele(origin, FTAG);
 	}
 	return (0);
 }
 
 void
 dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	uint64_t obj;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
 	VERIFY0(dsl_pool_open_special_dir(dp,
 	    FREE_DIR_NAME, &dp->dp_free_dir));
 
 	/*
 	 * We can't use bpobj_alloc(), because spa_version() still
 	 * returns the old version, and we need a new-version bpobj with
 	 * subobj support.  So call dmu_object_alloc() directly.
 	 */
 	obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
 	    SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
 	VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
 	VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
 
 	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
 }
 
 void
 dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	uint64_t dsobj;
 	dsl_dataset_t *ds;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(dp->dp_origin_snap == NULL);
 	ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));
 
 	/* create the origin dir, ds, & snap-ds */
 	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
 	    NULL, 0, kcred, NULL, tx);
 	VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 	dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
 	VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
 	    dp, &dp->dp_origin_snap));
 	dsl_dataset_rele(ds, FTAG);
 }
 
 taskq_t *
 dsl_pool_zrele_taskq(dsl_pool_t *dp)
 {
 	return (dp->dp_zrele_taskq);
 }
 
 taskq_t *
 dsl_pool_unlinked_drain_taskq(dsl_pool_t *dp)
 {
 	return (dp->dp_unlinked_drain_taskq);
 }
 
 /*
  * Walk through the pool-wide zap object of temporary snapshot user holds
  * and release them.
  */
 void
 dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
 {
 	zap_attribute_t za;
 	zap_cursor_t zc;
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 	nvlist_t *holds;
 
 	if (zapobj == 0)
 		return;
 	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 
 	holds = fnvlist_alloc();
 
 	for (zap_cursor_init(&zc, mos, zapobj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		char *htag;
 		nvlist_t *tags;
 
 		htag = strchr(za.za_name, '-');
 		*htag = '\0';
 		++htag;
 		if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
 			tags = fnvlist_alloc();
 			fnvlist_add_boolean(tags, htag);
 			fnvlist_add_nvlist(holds, za.za_name, tags);
 			fnvlist_free(tags);
 		} else {
 			fnvlist_add_boolean(tags, htag);
 		}
 	}
 	dsl_dataset_user_release_tmp(dp, holds);
 	fnvlist_free(holds);
 	zap_cursor_fini(&zc);
 }
 
 /*
  * Create the pool-wide zap object for storing temporary snapshot holds.
  */
 static void
 dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	objset_t *mos = dp->dp_meta_objset;
 
 	ASSERT(dp->dp_tmp_userrefs_obj == 0);
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
 }
 
 static int
 dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
     const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
 {
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 	char *name;
 	int error;
 
 	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	/*
 	 * If the pool was created prior to SPA_VERSION_USERREFS, the
 	 * zap object for temporary holds might not exist yet.
 	 */
 	if (zapobj == 0) {
 		if (holding) {
 			dsl_pool_user_hold_create_obj(dp, tx);
 			zapobj = dp->dp_tmp_userrefs_obj;
 		} else {
 			return (SET_ERROR(ENOENT));
 		}
 	}
 
 	name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
 	if (holding)
 		error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
 	else
 		error = zap_remove(mos, zapobj, name, tx);
 	kmem_strfree(name);
 
 	return (error);
 }
 
 /*
  * Add a temporary hold for the given dataset object and tag.
  */
 int
 dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
     uint64_t now, dmu_tx_t *tx)
 {
 	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
 }
 
 /*
  * Release a temporary hold for the given dataset object and tag.
  */
 int
 dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
     dmu_tx_t *tx)
 {
 	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0,
 	    tx, B_FALSE));
 }
 
 /*
  * DSL Pool Configuration Lock
  *
  * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
  * creation / destruction / rename / property setting).  It must be held for
  * read to hold a dataset or dsl_dir.  I.e. you must call
  * dsl_pool_config_enter() or dsl_pool_hold() before calling
  * dsl_{dataset,dir}_hold{_obj}.  In most circumstances, the dp_config_rwlock
  * must be held continuously until all datasets and dsl_dirs are released.
  *
  * The only exception to this rule is that if a "long hold" is placed on
  * a dataset, then the dp_config_rwlock may be dropped while the dataset
  * is still held.  The long hold will prevent the dataset from being
  * destroyed -- the destroy will fail with EBUSY.  A long hold can be
  * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
  * (by calling dsl_{dataset,objset}_{try}own{_obj}).
  *
  * Legitimate long-holders (including owners) should be long-running, cancelable
  * tasks that should cause "zfs destroy" to fail.  This includes DMU
  * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
  * "zfs send", and "zfs diff".  There are several other long-holders whose
  * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
  *
  * The usual formula for long-holding would be:
  * dsl_pool_hold()
  * dsl_dataset_hold()
  * ... perform checks ...
  * dsl_dataset_long_hold()
  * dsl_pool_rele()
  * ... perform long-running task ...
  * dsl_dataset_long_rele()
  * dsl_dataset_rele()
  *
  * Note that when the long hold is released, the dataset is still held but
  * the pool is not held.  The dataset may change arbitrarily during this time
  * (e.g. it could be destroyed).  Therefore you shouldn't do anything to the
  * dataset except release it.
  *
  * Operations generally fall somewhere into the following taxonomy:
  *
  *                              Read-Only             Modifying
  *
  *    Dataset Layer / MOS        zfs get             zfs destroy
  *
  *     Individual Dataset         read()                write()
  *
  *
  * Dataset Layer Operations
  *
  * Modifying operations should generally use dsl_sync_task().  The synctask
  * infrastructure enforces proper locking strategy with respect to the
  * dp_config_rwlock.  See the comment above dsl_sync_task() for details.
  *
  * Read-only operations will manually hold the pool, then the dataset, obtain
  * information from the dataset, then release the pool and dataset.
  * dmu_objset_{hold,rele}() are convenience routines that also do the pool
  * hold/rele.
  *
  *
  * Operations On Individual Datasets
  *
  * Objects _within_ an objset should only be modified by the current 'owner'
  * of the objset to prevent incorrect concurrent modification. Thus, use
  * {dmu_objset,dsl_dataset}_own to mark some entity as the current owner,
  * and fail with EBUSY if there is already an owner. The owner can then
  * implement its own locking strategy, independent of the dataset layer's
  * locking infrastructure.
  * (E.g., the ZPL has its own set of locks to control concurrency. A regular
  *  vnop will not reach into the dataset layer).
  *
  * Ideally, objects would also only be read by the objset’s owner, so that we
  * don’t observe state mid-modification.
  * (E.g. the ZPL is creating a new object and linking it into a directory; if
  * you don’t coordinate with the ZPL to hold ZPL-level locks, you could see an
  * intermediate state.  The ioctl level violates this but in pretty benign
  * ways, e.g. reading the zpl props object.)
  */
 
 int
 dsl_pool_hold(const char *name, const void *tag, dsl_pool_t **dp)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(name, &spa, tag);
 	if (error == 0) {
 		*dp = spa_get_dsl(spa);
 		dsl_pool_config_enter(*dp, tag);
 	}
 	return (error);
 }
 
 void
 dsl_pool_rele(dsl_pool_t *dp, const void *tag)
 {
 	dsl_pool_config_exit(dp, tag);
 	spa_close(dp->dp_spa, tag);
 }
 
 void
 dsl_pool_config_enter(dsl_pool_t *dp, const void *tag)
 {
 	/*
 	 * We use a "reentrant" reader-writer lock, but not reentrantly.
 	 *
 	 * The rrwlock can (with the track_all flag) track all reading threads,
 	 * which is very useful for debugging which code path failed to release
 	 * the lock, and for verifying that the *current* thread does hold
 	 * the lock.
 	 *
 	 * (Unlike a rwlock, which knows that N threads hold it for
 	 * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
 	 * if any thread holds it for read, even if this thread doesn't).
 	 */
 	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
 	rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
 }
 
 void
 dsl_pool_config_enter_prio(dsl_pool_t *dp, const void *tag)
 {
 	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
 	rrw_enter_read_prio(&dp->dp_config_rwlock, tag);
 }
 
 void
 dsl_pool_config_exit(dsl_pool_t *dp, const void *tag)
 {
 	rrw_exit(&dp->dp_config_rwlock, tag);
 }
 
 boolean_t
 dsl_pool_config_held(dsl_pool_t *dp)
 {
 	return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
 }
 
 boolean_t
 dsl_pool_config_held_writer(dsl_pool_t *dp)
 {
 	return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
 }
 
 EXPORT_SYMBOL(dsl_pool_config_enter);
 EXPORT_SYMBOL(dsl_pool_config_exit);
 
 /* zfs_dirty_data_max_percent only applied at module load in arc_init(). */
 ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_percent, UINT, ZMOD_RD,
 	"Max percent of RAM allowed to be dirty");
 
 /* zfs_dirty_data_max_max_percent only applied at module load in arc_init(). */
 ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max_percent, UINT, ZMOD_RD,
 	"zfs_dirty_data_max upper bound as % of RAM");
 
 ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, UINT, ZMOD_RW,
 	"Transaction delay threshold");
 
 ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, U64, ZMOD_RW,
 	"Determines the dirty space limit");
 
 ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, U64, ZMOD_RW,
 	"The size limit of write-transaction zil log data");
 
 /* zfs_dirty_data_max_max only applied at module load in arc_init(). */
 ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, U64, ZMOD_RD,
 	"zfs_dirty_data_max upper bound in bytes");
 
 ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, UINT, ZMOD_RW,
 	"Dirty data txg sync threshold as a percentage of zfs_dirty_data_max");
 
 ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, U64, ZMOD_RW,
 	"How quickly delay approaches infinity");
 
 ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW,
 	"Max percent of CPUs that are used per dp_sync_taskq");
 
 ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_minalloc, INT, ZMOD_RW,
 	"Number of taskq entries that are pre-populated");
 
 ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_maxalloc, INT, ZMOD_RW,
 	"Max number of taskq entries that are cached");
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 060a5cc36d70..55e89b89f06a 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -1,5263 +1,5266 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2021 by Delphix. All rights reserved.
  * Copyright 2016 Gary Mills
  * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
  * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
  * Copyright 2019 Joyent, Inc.
  */
 
 #include <sys/dsl_scan.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dnode.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
 #include <sys/arc_impl.h>
 #include <sys/zap.h>
 #include <sys/zio.h>
 #include <sys/zfs_context.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/zil_impl.h>
 #include <sys/zio_checksum.h>
 #include <sys/brt.h>
 #include <sys/ddt.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/zfeature.h>
 #include <sys/abd.h>
 #include <sys/range_tree.h>
 #include <sys/dbuf.h>
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
 
 /*
  * Grand theory statement on scan queue sorting
  *
  * Scanning is implemented by recursively traversing all indirection levels
  * in an object and reading all blocks referenced from said objects. This
  * results in us approximately traversing the object from lowest logical
  * offset to the highest. For best performance, we would want the logical
  * blocks to be physically contiguous. However, this is frequently not the
  * case with pools given the allocation patterns of copy-on-write filesystems.
  * So instead, we put the I/Os into a reordering queue and issue them in a
  * way that will most benefit physical disks (LBA-order).
  *
  * Queue management:
  *
  * Ideally, we would want to scan all metadata and queue up all block I/O
  * prior to starting to issue it, because that allows us to do an optimal
  * sorting job. This can however consume large amounts of memory. Therefore
  * we continuously monitor the size of the queues and constrain them to 5%
  * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this
  * limit, we clear out a few of the largest extents at the head of the queues
  * to make room for more scanning. Hopefully, these extents will be fairly
  * large and contiguous, allowing us to approach sequential I/O throughput
  * even without a fully sorted tree.
  *
  * Metadata scanning takes place in dsl_scan_visit(), which is called from
  * dsl_scan_sync() every spa_sync(). If we have either fully scanned all
  * metadata on the pool, or we need to make room in memory because our
  * queues are too large, dsl_scan_visit() is postponed and
  * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies
  * that metadata scanning and queued I/O issuing are mutually exclusive. This
  * allows us to provide maximum sequential I/O throughput for the majority of
  * I/O's issued since sequential I/O performance is significantly negatively
  * impacted if it is interleaved with random I/O.
  *
  * Implementation Notes
  *
  * One side effect of the queued scanning algorithm is that the scanning code
  * needs to be notified whenever a block is freed. This is needed to allow
  * the scanning code to remove these I/Os from the issuing queue. Additionally,
  * we do not attempt to queue gang blocks to be issued sequentially since this
  * is very hard to do and would have an extremely limited performance benefit.
  * Instead, we simply issue gang I/Os as soon as we find them using the legacy
  * algorithm.
  *
  * Backwards compatibility
  *
  * This new algorithm is backwards compatible with the legacy on-disk data
  * structures (and therefore does not require a new feature flag).
  * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan
  * will stop scanning metadata (in logical order) and wait for all outstanding
  * sorted I/O to complete. Once this is done, we write out a checkpoint
  * bookmark, indicating that we have scanned everything logically before it.
  * If the pool is imported on a machine without the new sorting algorithm,
  * the scan simply resumes from the last checkpoint using the legacy algorithm.
  */
 
 typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
     const zbookmark_phys_t *);
 
 static scan_cb_t dsl_scan_scrub_cb;
 
 static int scan_ds_queue_compare(const void *a, const void *b);
 static int scan_prefetch_queue_compare(const void *a, const void *b);
 static void scan_ds_queue_clear(dsl_scan_t *scn);
 static void scan_ds_prefetch_queue_clear(dsl_scan_t *scn);
 static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
     uint64_t *txg);
 static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
 static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
 static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
 static uint64_t dsl_scan_count_data_disks(spa_t *spa);
 static void read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb);
 
 extern uint_t zfs_vdev_async_write_active_min_dirty_percent;
 static int zfs_scan_blkstats = 0;
 
 /*
  * 'zpool status' uses bytes processed per pass to report throughput and
  * estimate time remaining.  We define a pass to start when the scanning
  * phase completes for a sequential resilver.  Optionally, this value
  * may be used to reset the pass statistics every N txgs to provide an
  * estimated completion time based on currently observed performance.
  */
 static uint_t zfs_scan_report_txgs = 0;
 
 /*
  * By default zfs will check to ensure it is not over the hard memory
  * limit before each txg. If finer-grained control of this is needed
  * this value can be set to 1 to enable checking before scanning each
  * block.
  */
 static int zfs_scan_strict_mem_lim = B_FALSE;
 
 /*
  * Maximum number of parallelly executed bytes per leaf vdev. We attempt
  * to strike a balance here between keeping the vdev queues full of I/Os
  * at all times and not overflowing the queues to cause long latency,
  * which would cause long txg sync times. No matter what, we will not
  * overload the drives with I/O, since that is protected by
  * zfs_vdev_scrub_max_active.
  */
 static uint64_t zfs_scan_vdev_limit = 16 << 20;
 
 static uint_t zfs_scan_issue_strategy = 0;
 
 /* don't queue & sort zios, go direct */
 static int zfs_scan_legacy = B_FALSE;
 static uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */
 
 /*
  * fill_weight is non-tunable at runtime, so we copy it at module init from
  * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would
  * break queue sorting.
  */
 static uint_t zfs_scan_fill_weight = 3;
 static uint64_t fill_weight;
 
 /* See dsl_scan_should_clear() for details on the memory limit tunables */
 static const uint64_t zfs_scan_mem_lim_min = 16 << 20;	/* bytes */
 static const uint64_t zfs_scan_mem_lim_soft_max = 128 << 20;	/* bytes */
 
 
 /* fraction of physmem */
 static uint_t zfs_scan_mem_lim_fact = 20;
 
 /* fraction of mem lim above */
 static uint_t zfs_scan_mem_lim_soft_fact = 20;
 
 /* minimum milliseconds to scrub per txg */
 static uint_t zfs_scrub_min_time_ms = 1000;
 
 /* minimum milliseconds to obsolete per txg */
 static uint_t zfs_obsolete_min_time_ms = 500;
 
 /* minimum milliseconds to free per txg */
 static uint_t zfs_free_min_time_ms = 1000;
 
 /* minimum milliseconds to resilver per txg */
 static uint_t zfs_resilver_min_time_ms = 3000;
 
 static uint_t zfs_scan_checkpoint_intval = 7200; /* in seconds */
 int zfs_scan_suspend_progress = 0; /* set to prevent scans from progressing */
 static int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
 static int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
 static const ddt_class_t zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
 /* max number of blocks to free in a single TXG */
 static uint64_t zfs_async_block_max_blocks = UINT64_MAX;
 /* max number of dedup blocks to free in a single TXG */
 static uint64_t zfs_max_async_dedup_frees = 100000;
 
 /* set to disable resilver deferring */
 static int zfs_resilver_disable_defer = B_FALSE;
 
 /*
  * We wait a few txgs after importing a pool to begin scanning so that
  * the import / mounting code isn't held up by scrub / resilver IO.
  * Unfortunately, it is a bit difficult to determine exactly how long
  * this will take since userspace will trigger fs mounts asynchronously
  * and the kernel will create zvol minors asynchronously. As a result,
  * the value provided here is a bit arbitrary, but represents a
  * reasonable estimate of how many txgs it will take to finish fully
  * importing a pool
  */
 #define	SCAN_IMPORT_WAIT_TXGS 		5
 
 #define	DSL_SCAN_IS_SCRUB_RESILVER(scn) \
 	((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
 	(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
 
 /*
  * Enable/disable the processing of the free_bpobj object.
  */
 static int zfs_free_bpobj_enabled = 1;
 
 /* Error blocks to be scrubbed in one txg. */
 static uint_t zfs_scrub_error_blocks_per_txg = 1 << 12;
 
 /* the order has to match pool_scan_type */
 static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
 	NULL,
 	dsl_scan_scrub_cb,	/* POOL_SCAN_SCRUB */
 	dsl_scan_scrub_cb,	/* POOL_SCAN_RESILVER */
 };
 
 /* In core node for the scn->scn_queue. Represents a dataset to be scanned */
 typedef struct {
 	uint64_t	sds_dsobj;
 	uint64_t	sds_txg;
 	avl_node_t	sds_node;
 } scan_ds_t;
 
 /*
  * This controls what conditions are placed on dsl_scan_sync_state():
  * SYNC_OPTIONAL) write out scn_phys iff scn_queues_pending == 0
  * SYNC_MANDATORY) write out scn_phys always. scn_queues_pending must be 0.
  * SYNC_CACHED) if scn_queues_pending == 0, write out scn_phys. Otherwise
  *	write out the scn_phys_cached version.
  * See dsl_scan_sync_state for details.
  */
 typedef enum {
 	SYNC_OPTIONAL,
 	SYNC_MANDATORY,
 	SYNC_CACHED
 } state_sync_type_t;
 
 /*
  * This struct represents the minimum information needed to reconstruct a
  * zio for sequential scanning. This is useful because many of these will
  * accumulate in the sequential IO queues before being issued, so saving
  * memory matters here.
  */
 typedef struct scan_io {
 	/* fields from blkptr_t */
 	uint64_t		sio_blk_prop;
 	uint64_t		sio_phys_birth;
 	uint64_t		sio_birth;
 	zio_cksum_t		sio_cksum;
 	uint32_t		sio_nr_dvas;
 
 	/* fields from zio_t */
 	uint32_t		sio_flags;
 	zbookmark_phys_t	sio_zb;
 
 	/* members for queue sorting */
 	union {
 		avl_node_t	sio_addr_node; /* link into issuing queue */
 		list_node_t	sio_list_node; /* link for issuing to disk */
 	} sio_nodes;
 
 	/*
 	 * There may be up to SPA_DVAS_PER_BP DVAs here from the bp,
 	 * depending on how many were in the original bp. Only the
 	 * first DVA is really used for sorting and issuing purposes.
 	 * The other DVAs (if provided) simply exist so that the zio
 	 * layer can find additional copies to repair from in the
 	 * event of an error. This array must go at the end of the
 	 * struct to allow this for the variable number of elements.
 	 */
 	dva_t			sio_dva[];
 } scan_io_t;
 
 #define	SIO_SET_OFFSET(sio, x)		DVA_SET_OFFSET(&(sio)->sio_dva[0], x)
 #define	SIO_SET_ASIZE(sio, x)		DVA_SET_ASIZE(&(sio)->sio_dva[0], x)
 #define	SIO_GET_OFFSET(sio)		DVA_GET_OFFSET(&(sio)->sio_dva[0])
 #define	SIO_GET_ASIZE(sio)		DVA_GET_ASIZE(&(sio)->sio_dva[0])
 #define	SIO_GET_END_OFFSET(sio)		\
 	(SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio))
 #define	SIO_GET_MUSED(sio)		\
 	(sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t)))
 
 struct dsl_scan_io_queue {
 	dsl_scan_t	*q_scn; /* associated dsl_scan_t */
 	vdev_t		*q_vd; /* top-level vdev that this queue represents */
 	zio_t		*q_zio; /* scn_zio_root child for waiting on IO */
 
 	/* trees used for sorting I/Os and extents of I/Os */
 	range_tree_t	*q_exts_by_addr;
 	zfs_btree_t	q_exts_by_size;
 	avl_tree_t	q_sios_by_addr;
 	uint64_t	q_sio_memused;
 	uint64_t	q_last_ext_addr;
 
 	/* members for zio rate limiting */
 	uint64_t	q_maxinflight_bytes;
 	uint64_t	q_inflight_bytes;
 	kcondvar_t	q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */
 
 	/* per txg statistics */
 	uint64_t	q_total_seg_size_this_txg;
 	uint64_t	q_segs_this_txg;
 	uint64_t	q_total_zio_size_this_txg;
 	uint64_t	q_zios_this_txg;
 };
 
 /* private data for dsl_scan_prefetch_cb() */
 typedef struct scan_prefetch_ctx {
 	zfs_refcount_t spc_refcnt;	/* refcount for memory management */
 	dsl_scan_t *spc_scn;		/* dsl_scan_t for the pool */
 	boolean_t spc_root;		/* is this prefetch for an objset? */
 	uint8_t spc_indblkshift;	/* dn_indblkshift of current dnode */
 	uint16_t spc_datablkszsec;	/* dn_idatablkszsec of current dnode */
 } scan_prefetch_ctx_t;
 
 /* private data for dsl_scan_prefetch() */
 typedef struct scan_prefetch_issue_ctx {
 	avl_node_t spic_avl_node;	/* link into scn->scn_prefetch_queue */
 	scan_prefetch_ctx_t *spic_spc;	/* spc for the callback */
 	blkptr_t spic_bp;		/* bp to prefetch */
 	zbookmark_phys_t spic_zb;	/* bookmark to prefetch */
 } scan_prefetch_issue_ctx_t;
 
 static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
     const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue);
 static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue,
     scan_io_t *sio);
 
 static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
 static void scan_io_queues_destroy(dsl_scan_t *scn);
 
 static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP];
 
 /* sio->sio_nr_dvas must be set so we know which cache to free from */
 static void
 sio_free(scan_io_t *sio)
 {
 	ASSERT3U(sio->sio_nr_dvas, >, 0);
 	ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
 
 	kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio);
 }
 
 /* It is up to the caller to set sio->sio_nr_dvas for freeing */
 static scan_io_t *
 sio_alloc(unsigned short nr_dvas)
 {
 	ASSERT3U(nr_dvas, >, 0);
 	ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP);
 
 	return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP));
 }
 
 void
 scan_init(void)
 {
 	/*
 	 * This is used in ext_size_compare() to weight segments
 	 * based on how sparse they are. This cannot be changed
 	 * mid-scan and the tree comparison functions don't currently
 	 * have a mechanism for passing additional context to the
 	 * compare functions. Thus we store this value globally and
 	 * we only allow it to be set at module initialization time
 	 */
 	fill_weight = zfs_scan_fill_weight;
 
 	for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
 		char name[36];
 
 		(void) snprintf(name, sizeof (name), "sio_cache_%d", i);
 		sio_cache[i] = kmem_cache_create(name,
 		    (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))),
 		    0, NULL, NULL, NULL, NULL, NULL, 0);
 	}
 }
 
 void
 scan_fini(void)
 {
 	for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
 		kmem_cache_destroy(sio_cache[i]);
 	}
 }
 
 static inline boolean_t
 dsl_scan_is_running(const dsl_scan_t *scn)
 {
 	return (scn->scn_phys.scn_state == DSS_SCANNING);
 }
 
 boolean_t
 dsl_scan_resilvering(dsl_pool_t *dp)
 {
 	return (dsl_scan_is_running(dp->dp_scan) &&
 	    dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
 }
 
 static inline void
 sio2bp(const scan_io_t *sio, blkptr_t *bp)
 {
 	memset(bp, 0, sizeof (*bp));
 	bp->blk_prop = sio->sio_blk_prop;
-	bp->blk_phys_birth = sio->sio_phys_birth;
-	bp->blk_birth = sio->sio_birth;
+	BP_SET_PHYSICAL_BIRTH(bp, sio->sio_phys_birth);
+	BP_SET_LOGICAL_BIRTH(bp, sio->sio_birth);
 	bp->blk_fill = 1;	/* we always only work with data pointers */
 	bp->blk_cksum = sio->sio_cksum;
 
 	ASSERT3U(sio->sio_nr_dvas, >, 0);
 	ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
 
 	memcpy(bp->blk_dva, sio->sio_dva, sio->sio_nr_dvas * sizeof (dva_t));
 }
 
 static inline void
 bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
 {
 	sio->sio_blk_prop = bp->blk_prop;
-	sio->sio_phys_birth = bp->blk_phys_birth;
-	sio->sio_birth = bp->blk_birth;
+	sio->sio_phys_birth = BP_GET_PHYSICAL_BIRTH(bp);
+	sio->sio_birth = BP_GET_LOGICAL_BIRTH(bp);
 	sio->sio_cksum = bp->blk_cksum;
 	sio->sio_nr_dvas = BP_GET_NDVAS(bp);
 
 	/*
 	 * Copy the DVAs to the sio. We need all copies of the block so
 	 * that the self healing code can use the alternate copies if the
 	 * first is corrupted. We want the DVA at index dva_i to be first
 	 * in the sio since this is the primary one that we want to issue.
 	 */
 	for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) {
 		sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas];
 	}
 }
 
 int
 dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 {
 	int err;
 	dsl_scan_t *scn;
 	spa_t *spa = dp->dp_spa;
 	uint64_t f;
 
 	scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
 	scn->scn_dp = dp;
 
 	/*
 	 * It's possible that we're resuming a scan after a reboot so
 	 * make sure that the scan_async_destroying flag is initialized
 	 * appropriately.
 	 */
 	ASSERT(!scn->scn_async_destroying);
 	scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
 	    SPA_FEATURE_ASYNC_DESTROY);
 
 	/*
 	 * Calculate the max number of in-flight bytes for pool-wide
 	 * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max).
 	 * Limits for the issuing phase are done per top-level vdev and
 	 * are handled separately.
 	 */
 	scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20,
 	    zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));
 
 	avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
 	    offsetof(scan_ds_t, sds_node));
 	avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
 	    sizeof (scan_prefetch_issue_ctx_t),
 	    offsetof(scan_prefetch_issue_ctx_t, spic_avl_node));
 
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    "scrub_func", sizeof (uint64_t), 1, &f);
 	if (err == 0) {
 		/*
 		 * There was an old-style scrub in progress.  Restart a
 		 * new-style scrub from the beginning.
 		 */
 		scn->scn_restart_txg = txg;
 		zfs_dbgmsg("old-style scrub was in progress for %s; "
 		    "restarting new-style scrub in txg %llu",
 		    spa->spa_name,
 		    (longlong_t)scn->scn_restart_txg);
 
 		/*
 		 * Load the queue obj from the old location so that it
 		 * can be freed by dsl_scan_done().
 		 */
 		(void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    "scrub_queue", sizeof (uint64_t), 1,
 		    &scn->scn_phys.scn_queue_obj);
 	} else {
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_ERRORSCRUB, sizeof (uint64_t),
 		    ERRORSCRUB_PHYS_NUMINTS, &scn->errorscrub_phys);
 
 		if (err != 0 && err != ENOENT)
 			return (err);
 
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
 		    &scn->scn_phys);
 
 		/*
 		 * Detect if the pool contains the signature of #2094.  If it
 		 * does properly update the scn->scn_phys structure and notify
 		 * the administrator by setting an errata for the pool.
 		 */
 		if (err == EOVERFLOW) {
 			uint64_t zaptmp[SCAN_PHYS_NUMINTS + 1];
 			VERIFY3S(SCAN_PHYS_NUMINTS, ==, 24);
 			VERIFY3S(offsetof(dsl_scan_phys_t, scn_flags), ==,
 			    (23 * sizeof (uint64_t)));
 
 			err = zap_lookup(dp->dp_meta_objset,
 			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN,
 			    sizeof (uint64_t), SCAN_PHYS_NUMINTS + 1, &zaptmp);
 			if (err == 0) {
 				uint64_t overflow = zaptmp[SCAN_PHYS_NUMINTS];
 
 				if (overflow & ~DSL_SCAN_FLAGS_MASK ||
 				    scn->scn_async_destroying) {
 					spa->spa_errata =
 					    ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY;
 					return (EOVERFLOW);
 				}
 
 				memcpy(&scn->scn_phys, zaptmp,
 				    SCAN_PHYS_NUMINTS * sizeof (uint64_t));
 				scn->scn_phys.scn_flags = overflow;
 
 				/* Required scrub already in progress. */
 				if (scn->scn_phys.scn_state == DSS_FINISHED ||
 				    scn->scn_phys.scn_state == DSS_CANCELED)
 					spa->spa_errata =
 					    ZPOOL_ERRATA_ZOL_2094_SCRUB;
 			}
 		}
 
 		if (err == ENOENT)
 			return (0);
 		else if (err)
 			return (err);
 
 		/*
 		 * We might be restarting after a reboot, so jump the issued
 		 * counter to how far we've scanned. We know we're consistent
 		 * up to here.
 		 */
 		scn->scn_issued_before_pass = scn->scn_phys.scn_examined -
 		    scn->scn_phys.scn_skipped;
 
 		if (dsl_scan_is_running(scn) &&
 		    spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
 			/*
 			 * A new-type scrub was in progress on an old
 			 * pool, and the pool was accessed by old
 			 * software.  Restart from the beginning, since
 			 * the old software may have changed the pool in
 			 * the meantime.
 			 */
 			scn->scn_restart_txg = txg;
 			zfs_dbgmsg("new-style scrub for %s was modified "
 			    "by old software; restarting in txg %llu",
 			    spa->spa_name,
 			    (longlong_t)scn->scn_restart_txg);
 		} else if (dsl_scan_resilvering(dp)) {
 			/*
 			 * If a resilver is in progress and there are already
 			 * errors, restart it instead of finishing this scan and
 			 * then restarting it. If there haven't been any errors
 			 * then remember that the incore DTL is valid.
 			 */
 			if (scn->scn_phys.scn_errors > 0) {
 				scn->scn_restart_txg = txg;
 				zfs_dbgmsg("resilver can't excise DTL_MISSING "
 				    "when finished; restarting on %s in txg "
 				    "%llu",
 				    spa->spa_name,
 				    (u_longlong_t)scn->scn_restart_txg);
 			} else {
 				/* it's safe to excise DTL when finished */
 				spa->spa_scrub_started = B_TRUE;
 			}
 		}
 	}
 
 	memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys));
 
 	/* reload the queue into the in-core state */
 	if (scn->scn_phys.scn_queue_obj != 0) {
 		zap_cursor_t zc;
 		zap_attribute_t za;
 
 		for (zap_cursor_init(&zc, dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj);
 		    zap_cursor_retrieve(&zc, &za) == 0;
 		    (void) zap_cursor_advance(&zc)) {
 			scan_ds_queue_insert(scn,
 			    zfs_strtonum(za.za_name, NULL),
 			    za.za_first_integer);
 		}
 		zap_cursor_fini(&zc);
 	}
 
 	spa_scan_stat_init(spa);
 	vdev_scan_stat_init(spa->spa_root_vdev);
 
 	return (0);
 }
 
 void
 dsl_scan_fini(dsl_pool_t *dp)
 {
 	if (dp->dp_scan != NULL) {
 		dsl_scan_t *scn = dp->dp_scan;
 
 		if (scn->scn_taskq != NULL)
 			taskq_destroy(scn->scn_taskq);
 
 		scan_ds_queue_clear(scn);
 		avl_destroy(&scn->scn_queue);
 		scan_ds_prefetch_queue_clear(scn);
 		avl_destroy(&scn->scn_prefetch_queue);
 
 		kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
 		dp->dp_scan = NULL;
 	}
 }
 
 static boolean_t
 dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	return (scn->scn_restart_txg != 0 &&
 	    scn->scn_restart_txg <= tx->tx_txg);
 }
 
 boolean_t
 dsl_scan_resilver_scheduled(dsl_pool_t *dp)
 {
 	return ((dp->dp_scan && dp->dp_scan->scn_restart_txg != 0) ||
 	    (spa_async_tasks(dp->dp_spa) & SPA_ASYNC_RESILVER));
 }
 
 boolean_t
 dsl_scan_scrubbing(const dsl_pool_t *dp)
 {
 	dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys;
 
 	return (scn_phys->scn_state == DSS_SCANNING &&
 	    scn_phys->scn_func == POOL_SCAN_SCRUB);
 }
 
 boolean_t
 dsl_errorscrubbing(const dsl_pool_t *dp)
 {
 	dsl_errorscrub_phys_t *errorscrub_phys = &dp->dp_scan->errorscrub_phys;
 
 	return (errorscrub_phys->dep_state == DSS_ERRORSCRUBBING &&
 	    errorscrub_phys->dep_func == POOL_SCAN_ERRORSCRUB);
 }
 
 boolean_t
 dsl_errorscrub_is_paused(const dsl_scan_t *scn)
 {
 	return (dsl_errorscrubbing(scn->scn_dp) &&
 	    scn->errorscrub_phys.dep_paused_flags);
 }
 
 boolean_t
 dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
 {
 	return (dsl_scan_scrubbing(scn->scn_dp) &&
 	    scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED);
 }
 
 static void
 dsl_errorscrub_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	scn->errorscrub_phys.dep_cursor =
 	    zap_cursor_serialize(&scn->errorscrub_cursor);
 
 	VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ERRORSCRUB, sizeof (uint64_t), ERRORSCRUB_PHYS_NUMINTS,
 	    &scn->errorscrub_phys, tx));
 }
 
 static void
 dsl_errorscrub_setup_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 	pool_scan_func_t *funcp = arg;
 	dsl_pool_t *dp = scn->scn_dp;
 	spa_t *spa = dp->dp_spa;
 
 	ASSERT(!dsl_scan_is_running(scn));
 	ASSERT(!dsl_errorscrubbing(scn->scn_dp));
 	ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
 
 	memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys));
 	scn->errorscrub_phys.dep_func = *funcp;
 	scn->errorscrub_phys.dep_state = DSS_ERRORSCRUBBING;
 	scn->errorscrub_phys.dep_start_time = gethrestime_sec();
 	scn->errorscrub_phys.dep_to_examine = spa_get_last_errlog_size(spa);
 	scn->errorscrub_phys.dep_examined = 0;
 	scn->errorscrub_phys.dep_errors = 0;
 	scn->errorscrub_phys.dep_cursor = 0;
 	zap_cursor_init_serialized(&scn->errorscrub_cursor,
 	    spa->spa_meta_objset, spa->spa_errlog_last,
 	    scn->errorscrub_phys.dep_cursor);
 
 	vdev_config_dirty(spa->spa_root_vdev);
 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_START);
 
 	dsl_errorscrub_sync_state(scn, tx);
 
 	spa_history_log_internal(spa, "error scrub setup", tx,
 	    "func=%u mintxg=%u maxtxg=%llu",
 	    *funcp, 0, (u_longlong_t)tx->tx_txg);
 }
 
 static int
 dsl_errorscrub_setup_check(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 
 	if (dsl_scan_is_running(scn) || (dsl_errorscrubbing(scn->scn_dp))) {
 		return (SET_ERROR(EBUSY));
 	}
 
 	if (spa_get_last_errlog_size(scn->scn_dp->dp_spa) == 0) {
 		return (ECANCELED);
 	}
 	return (0);
 }
 
 /*
  * Writes out a persistent dsl_scan_phys_t record to the pool directory.
  * Because we can be running in the block sorting algorithm, we do not always
  * want to write out the record, only when it is "safe" to do so. This safety
  * condition is achieved by making sure that the sorting queues are empty
  * (scn_queues_pending == 0). When this condition is not true, the sync'd state
  * is inconsistent with how much actual scanning progress has been made. The
  * kind of sync to be performed is specified by the sync_type argument. If the
  * sync is optional, we only sync if the queues are empty. If the sync is
  * mandatory, we do a hard ASSERT to make sure that the queues are empty. The
  * third possible state is a "cached" sync. This is done in response to:
  * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been
  *	destroyed, so we wouldn't be able to restart scanning from it.
  * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been
  *	superseded by a newer snapshot.
  * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been
  *	swapped with its clone.
  * In all cases, a cached sync simply rewrites the last record we've written,
  * just slightly modified. For the modifications that are performed to the
  * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed,
  * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped.
  */
 static void
 dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type)
 {
 	int i;
 	spa_t *spa = scn->scn_dp->dp_spa;
 
 	ASSERT(sync_type != SYNC_MANDATORY || scn->scn_queues_pending == 0);
 	if (scn->scn_queues_pending == 0) {
 		for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
 			vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
 			dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue;
 
 			if (q == NULL)
 				continue;
 
 			mutex_enter(&vd->vdev_scan_io_queue_lock);
 			ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL);
 			ASSERT3P(zfs_btree_first(&q->q_exts_by_size, NULL), ==,
 			    NULL);
 			ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL);
 			mutex_exit(&vd->vdev_scan_io_queue_lock);
 		}
 
 		if (scn->scn_phys.scn_queue_obj != 0)
 			scan_ds_queue_sync(scn, tx);
 		VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
 		    &scn->scn_phys, tx));
 		memcpy(&scn->scn_phys_cached, &scn->scn_phys,
 		    sizeof (scn->scn_phys));
 
 		if (scn->scn_checkpointing)
 			zfs_dbgmsg("finish scan checkpoint for %s",
 			    spa->spa_name);
 
 		scn->scn_checkpointing = B_FALSE;
 		scn->scn_last_checkpoint = ddi_get_lbolt();
 	} else if (sync_type == SYNC_CACHED) {
 		VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
 		    &scn->scn_phys_cached, tx));
 	}
 }
 
 int
 dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 	vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
 
 	if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd) ||
 	    dsl_errorscrubbing(scn->scn_dp))
 		return (SET_ERROR(EBUSY));
 
 	return (0);
 }
 
 void
 dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 	pool_scan_func_t *funcp = arg;
 	dmu_object_type_t ot = 0;
 	dsl_pool_t *dp = scn->scn_dp;
 	spa_t *spa = dp->dp_spa;
 
 	ASSERT(!dsl_scan_is_running(scn));
 	ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
 	memset(&scn->scn_phys, 0, sizeof (scn->scn_phys));
 
 	/*
 	 * If we are starting a fresh scrub, we erase the error scrub
 	 * information from disk.
 	 */
 	memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys));
 	dsl_errorscrub_sync_state(scn, tx);
 
 	scn->scn_phys.scn_func = *funcp;
 	scn->scn_phys.scn_state = DSS_SCANNING;
 	scn->scn_phys.scn_min_txg = 0;
 	scn->scn_phys.scn_max_txg = tx->tx_txg;
 	scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
 	scn->scn_phys.scn_start_time = gethrestime_sec();
 	scn->scn_phys.scn_errors = 0;
 	scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
 	scn->scn_issued_before_pass = 0;
 	scn->scn_restart_txg = 0;
 	scn->scn_done_txg = 0;
 	scn->scn_last_checkpoint = 0;
 	scn->scn_checkpointing = B_FALSE;
 	spa_scan_stat_init(spa);
 	vdev_scan_stat_init(spa->spa_root_vdev);
 
 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
 		scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
 
 		/* rewrite all disk labels */
 		vdev_config_dirty(spa->spa_root_vdev);
 
 		if (vdev_resilver_needed(spa->spa_root_vdev,
 		    &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
 			nvlist_t *aux = fnvlist_alloc();
 			fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
 			    "healing");
 			spa_event_notify(spa, NULL, aux,
 			    ESC_ZFS_RESILVER_START);
 			nvlist_free(aux);
 		} else {
 			spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START);
 		}
 
 		spa->spa_scrub_started = B_TRUE;
 		/*
 		 * If this is an incremental scrub, limit the DDT scrub phase
 		 * to just the auto-ditto class (for correctness); the rest
 		 * of the scrub should go faster using top-down pruning.
 		 */
 		if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
 			scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
 
 		/*
 		 * When starting a resilver clear any existing rebuild state.
 		 * This is required to prevent stale rebuild status from
 		 * being reported when a rebuild is run, then a resilver and
 		 * finally a scrub.  In which case only the scrub status
 		 * should be reported by 'zpool status'.
 		 */
 		if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
 			vdev_t *rvd = spa->spa_root_vdev;
 			for (uint64_t i = 0; i < rvd->vdev_children; i++) {
 				vdev_t *vd = rvd->vdev_child[i];
 				vdev_rebuild_clear_sync(
 				    (void *)(uintptr_t)vd->vdev_id, tx);
 			}
 		}
 	}
 
 	/* back to the generic stuff */
 
 	if (zfs_scan_blkstats) {
 		if (dp->dp_blkstats == NULL) {
 			dp->dp_blkstats =
 			    vmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
 		}
 		memset(&dp->dp_blkstats->zab_type, 0,
 		    sizeof (dp->dp_blkstats->zab_type));
 	} else {
 		if (dp->dp_blkstats) {
 			vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
 			dp->dp_blkstats = NULL;
 		}
 	}
 
 	if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
 		ot = DMU_OT_ZAP_OTHER;
 
 	scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
 	    ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
 
 	memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys));
 
 	dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
 
 	spa_history_log_internal(spa, "scan setup", tx,
 	    "func=%u mintxg=%llu maxtxg=%llu",
 	    *funcp, (u_longlong_t)scn->scn_phys.scn_min_txg,
 	    (u_longlong_t)scn->scn_phys.scn_max_txg);
 }
 
 /*
  * Called by ZFS_IOC_POOL_SCRUB and ZFS_IOC_POOL_SCAN ioctl to start a scrub,
  * error scrub or resilver. Can also be called to resume a paused scrub or
  * error scrub.
  */
 int
 dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
 {
 	spa_t *spa = dp->dp_spa;
 	dsl_scan_t *scn = dp->dp_scan;
 
 	/*
 	 * Purge all vdev caches and probe all devices.  We do this here
 	 * rather than in sync context because this requires a writer lock
 	 * on the spa_config lock, which we can't do from sync context.  The
 	 * spa_scrub_reopen flag indicates that vdev_open() should not
 	 * attempt to start another scrub.
 	 */
 	spa_vdev_state_enter(spa, SCL_NONE);
 	spa->spa_scrub_reopen = B_TRUE;
 	vdev_reopen(spa->spa_root_vdev);
 	spa->spa_scrub_reopen = B_FALSE;
 	(void) spa_vdev_state_exit(spa, NULL, 0);
 
 	if (func == POOL_SCAN_RESILVER) {
 		dsl_scan_restart_resilver(spa->spa_dsl_pool, 0);
 		return (0);
 	}
 
 	if (func == POOL_SCAN_ERRORSCRUB) {
 		if (dsl_errorscrub_is_paused(dp->dp_scan)) {
 			/*
 			 * got error scrub start cmd, resume paused error scrub.
 			 */
 			int err = dsl_scrub_set_pause_resume(scn->scn_dp,
 			    POOL_SCRUB_NORMAL);
 			if (err == 0) {
 				spa_event_notify(spa, NULL, NULL,
 				    ESC_ZFS_ERRORSCRUB_RESUME);
 				return (ECANCELED);
 			}
 			return (SET_ERROR(err));
 		}
 
 		return (dsl_sync_task(spa_name(dp->dp_spa),
 		    dsl_errorscrub_setup_check, dsl_errorscrub_setup_sync,
 		    &func, 0, ZFS_SPACE_CHECK_RESERVED));
 	}
 
 	if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
 		/* got scrub start cmd, resume paused scrub */
 		int err = dsl_scrub_set_pause_resume(scn->scn_dp,
 		    POOL_SCRUB_NORMAL);
 		if (err == 0) {
 			spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME);
 			return (SET_ERROR(ECANCELED));
 		}
 		return (SET_ERROR(err));
 	}
 
 	return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
 	    dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
 static void
 dsl_errorscrub_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	spa_t *spa = dp->dp_spa;
 
 	if (complete) {
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_FINISH);
 		spa_history_log_internal(spa, "error scrub done", tx,
 		    "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
 	} else {
 		spa_history_log_internal(spa, "error scrub canceled", tx,
 		    "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
 	}
 
 	scn->errorscrub_phys.dep_state = complete ? DSS_FINISHED : DSS_CANCELED;
 	spa->spa_scrub_active = B_FALSE;
 	spa_errlog_rotate(spa);
 	scn->errorscrub_phys.dep_end_time = gethrestime_sec();
 	zap_cursor_fini(&scn->errorscrub_cursor);
 
 	if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB)
 		spa->spa_errata = 0;
 
 	ASSERT(!dsl_errorscrubbing(scn->scn_dp));
 }
 
 static void
 dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 {
 	static const char *old_names[] = {
 		"scrub_bookmark",
 		"scrub_ddt_bookmark",
 		"scrub_ddt_class_max",
 		"scrub_queue",
 		"scrub_min_txg",
 		"scrub_max_txg",
 		"scrub_func",
 		"scrub_errors",
 		NULL
 	};
 
 	dsl_pool_t *dp = scn->scn_dp;
 	spa_t *spa = dp->dp_spa;
 	int i;
 
 	/* Remove any remnants of an old-style scrub. */
 	for (i = 0; old_names[i]; i++) {
 		(void) zap_remove(dp->dp_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
 	}
 
 	if (scn->scn_phys.scn_queue_obj != 0) {
 		VERIFY0(dmu_object_free(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, tx));
 		scn->scn_phys.scn_queue_obj = 0;
 	}
 	scan_ds_queue_clear(scn);
 	scan_ds_prefetch_queue_clear(scn);
 
 	scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
 
 	/*
 	 * If we were "restarted" from a stopped state, don't bother
 	 * with anything else.
 	 */
 	if (!dsl_scan_is_running(scn)) {
 		ASSERT(!scn->scn_is_sorted);
 		return;
 	}
 
 	if (scn->scn_is_sorted) {
 		scan_io_queues_destroy(scn);
 		scn->scn_is_sorted = B_FALSE;
 
 		if (scn->scn_taskq != NULL) {
 			taskq_destroy(scn->scn_taskq);
 			scn->scn_taskq = NULL;
 		}
 	}
 
 	scn->scn_phys.scn_state = complete ? DSS_FINISHED : DSS_CANCELED;
 
 	spa_notify_waiters(spa);
 
 	if (dsl_scan_restarting(scn, tx))
 		spa_history_log_internal(spa, "scan aborted, restarting", tx,
 		    "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
 	else if (!complete)
 		spa_history_log_internal(spa, "scan cancelled", tx,
 		    "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
 	else
 		spa_history_log_internal(spa, "scan done", tx,
 		    "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
 
 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
 		spa->spa_scrub_active = B_FALSE;
 
 		/*
 		 * If the scrub/resilver completed, update all DTLs to
 		 * reflect this.  Whether it succeeded or not, vacate
 		 * all temporary scrub DTLs.
 		 *
 		 * As the scrub does not currently support traversing
 		 * data that have been freed but are part of a checkpoint,
 		 * we don't mark the scrub as done in the DTLs as faults
 		 * may still exist in those vdevs.
 		 */
 		if (complete &&
 		    !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 			vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
 			    scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE);
 
 			if (scn->scn_phys.scn_min_txg) {
 				nvlist_t *aux = fnvlist_alloc();
 				fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
 				    "healing");
 				spa_event_notify(spa, NULL, aux,
 				    ESC_ZFS_RESILVER_FINISH);
 				nvlist_free(aux);
 			} else {
 				spa_event_notify(spa, NULL, NULL,
 				    ESC_ZFS_SCRUB_FINISH);
 			}
 		} else {
 			vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
 			    0, B_TRUE, B_FALSE);
 		}
 		spa_errlog_rotate(spa);
 
 		/*
 		 * Don't clear flag until after vdev_dtl_reassess to ensure that
 		 * DTL_MISSING will get updated when possible.
 		 */
 		spa->spa_scrub_started = B_FALSE;
 
 		/*
 		 * We may have finished replacing a device.
 		 * Let the async thread assess this and handle the detach.
 		 */
 		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 
 		/*
 		 * Clear any resilver_deferred flags in the config.
 		 * If there are drives that need resilvering, kick
 		 * off an asynchronous request to start resilver.
 		 * vdev_clear_resilver_deferred() may update the config
 		 * before the resilver can restart. In the event of
 		 * a crash during this period, the spa loading code
 		 * will find the drives that need to be resilvered
 		 * and start the resilver then.
 		 */
 		if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER) &&
 		    vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) {
 			spa_history_log_internal(spa,
 			    "starting deferred resilver", tx, "errors=%llu",
 			    (u_longlong_t)spa_approx_errlog_size(spa));
 			spa_async_request(spa, SPA_ASYNC_RESILVER);
 		}
 
 		/* Clear recent error events (i.e. duplicate events tracking) */
 		if (complete)
 			zfs_ereport_clear(spa, NULL);
 	}
 
 	scn->scn_phys.scn_end_time = gethrestime_sec();
 
 	if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB)
 		spa->spa_errata = 0;
 
 	ASSERT(!dsl_scan_is_running(scn));
 }
 
 static int
 dsl_errorscrub_pause_resume_check(void *arg, dmu_tx_t *tx)
 {
 	pool_scrub_cmd_t *cmd = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_scan_t *scn = dp->dp_scan;
 
 	if (*cmd == POOL_SCRUB_PAUSE) {
 		/*
 		 * can't pause a error scrub when there is no in-progress
 		 * error scrub.
 		 */
 		if (!dsl_errorscrubbing(dp))
 			return (SET_ERROR(ENOENT));
 
 		/* can't pause a paused error scrub */
 		if (dsl_errorscrub_is_paused(scn))
 			return (SET_ERROR(EBUSY));
 	} else if (*cmd != POOL_SCRUB_NORMAL) {
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	return (0);
 }
 
 static void
 dsl_errorscrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
 {
 	pool_scrub_cmd_t *cmd = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	spa_t *spa = dp->dp_spa;
 	dsl_scan_t *scn = dp->dp_scan;
 
 	if (*cmd == POOL_SCRUB_PAUSE) {
 		spa->spa_scan_pass_errorscrub_pause = gethrestime_sec();
 		scn->errorscrub_phys.dep_paused_flags = B_TRUE;
 		dsl_errorscrub_sync_state(scn, tx);
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_PAUSED);
 	} else {
 		ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);
 		if (dsl_errorscrub_is_paused(scn)) {
 			/*
 			 * We need to keep track of how much time we spend
 			 * paused per pass so that we can adjust the error scrub
 			 * rate shown in the output of 'zpool status'.
 			 */
 			spa->spa_scan_pass_errorscrub_spent_paused +=
 			    gethrestime_sec() -
 			    spa->spa_scan_pass_errorscrub_pause;
 
 			spa->spa_scan_pass_errorscrub_pause = 0;
 			scn->errorscrub_phys.dep_paused_flags = B_FALSE;
 
 			zap_cursor_init_serialized(
 			    &scn->errorscrub_cursor,
 			    spa->spa_meta_objset, spa->spa_errlog_last,
 			    scn->errorscrub_phys.dep_cursor);
 
 			dsl_errorscrub_sync_state(scn, tx);
 		}
 	}
 }
 
 static int
 dsl_errorscrub_cancel_check(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 	/* can't cancel a error scrub when there is no one in-progress */
 	if (!dsl_errorscrubbing(scn->scn_dp))
 		return (SET_ERROR(ENOENT));
 	return (0);
 }
 
 static void
 dsl_errorscrub_cancel_sync(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 
 	dsl_errorscrub_done(scn, B_FALSE, tx);
 	dsl_errorscrub_sync_state(scn, tx);
 	spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL,
 	    ESC_ZFS_ERRORSCRUB_ABORT);
 }
 
 static int
 dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 
 	if (!dsl_scan_is_running(scn))
 		return (SET_ERROR(ENOENT));
 	return (0);
 }
 
 static void
 dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 
 	dsl_scan_done(scn, B_FALSE, tx);
 	dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
 	spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT);
 }
 
 int
 dsl_scan_cancel(dsl_pool_t *dp)
 {
 	if (dsl_errorscrubbing(dp)) {
 		return (dsl_sync_task(spa_name(dp->dp_spa),
 		    dsl_errorscrub_cancel_check, dsl_errorscrub_cancel_sync,
 		    NULL, 3, ZFS_SPACE_CHECK_RESERVED));
 	}
 	return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
 	    dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
 }
 
 static int
 dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx)
 {
 	pool_scrub_cmd_t *cmd = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_scan_t *scn = dp->dp_scan;
 
 	if (*cmd == POOL_SCRUB_PAUSE) {
 		/* can't pause a scrub when there is no in-progress scrub */
 		if (!dsl_scan_scrubbing(dp))
 			return (SET_ERROR(ENOENT));
 
 		/* can't pause a paused scrub */
 		if (dsl_scan_is_paused_scrub(scn))
 			return (SET_ERROR(EBUSY));
 	} else if (*cmd != POOL_SCRUB_NORMAL) {
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	return (0);
 }
 
 static void
 dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
 {
 	pool_scrub_cmd_t *cmd = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	spa_t *spa = dp->dp_spa;
 	dsl_scan_t *scn = dp->dp_scan;
 
 	if (*cmd == POOL_SCRUB_PAUSE) {
 		/* can't pause a scrub when there is no in-progress scrub */
 		spa->spa_scan_pass_scrub_pause = gethrestime_sec();
 		scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
 		scn->scn_phys_cached.scn_flags |= DSF_SCRUB_PAUSED;
 		dsl_scan_sync_state(scn, tx, SYNC_CACHED);
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED);
 		spa_notify_waiters(spa);
 	} else {
 		ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);
 		if (dsl_scan_is_paused_scrub(scn)) {
 			/*
 			 * We need to keep track of how much time we spend
 			 * paused per pass so that we can adjust the scrub rate
 			 * shown in the output of 'zpool status'
 			 */
 			spa->spa_scan_pass_scrub_spent_paused +=
 			    gethrestime_sec() - spa->spa_scan_pass_scrub_pause;
 			spa->spa_scan_pass_scrub_pause = 0;
 			scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
 			scn->scn_phys_cached.scn_flags &= ~DSF_SCRUB_PAUSED;
 			dsl_scan_sync_state(scn, tx, SYNC_CACHED);
 		}
 	}
 }
 
 /*
  * Set scrub pause/resume state if it makes sense to do so
  */
 int
 dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd)
 {
 	if (dsl_errorscrubbing(dp)) {
 		return (dsl_sync_task(spa_name(dp->dp_spa),
 		    dsl_errorscrub_pause_resume_check,
 		    dsl_errorscrub_pause_resume_sync, &cmd, 3,
 		    ZFS_SPACE_CHECK_RESERVED));
 	}
 	return (dsl_sync_task(spa_name(dp->dp_spa),
 	    dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3,
 	    ZFS_SPACE_CHECK_RESERVED));
 }
 
 
 /* start a new scan, or restart an existing one. */
 void
 dsl_scan_restart_resilver(dsl_pool_t *dp, uint64_t txg)
 {
 	if (txg == 0) {
 		dmu_tx_t *tx;
 		tx = dmu_tx_create_dd(dp->dp_mos_dir);
 		VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
 
 		txg = dmu_tx_get_txg(tx);
 		dp->dp_scan->scn_restart_txg = txg;
 		dmu_tx_commit(tx);
 	} else {
 		dp->dp_scan->scn_restart_txg = txg;
 	}
 	zfs_dbgmsg("restarting resilver for %s at txg=%llu",
 	    dp->dp_spa->spa_name, (longlong_t)txg);
 }
 
 void
 dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
 {
 	zio_free(dp->dp_spa, txg, bp);
 }
 
 void
 dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
 {
 	ASSERT(dsl_pool_sync_context(dp));
 	zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
 }
 
 static int
 scan_ds_queue_compare(const void *a, const void *b)
 {
 	const scan_ds_t *sds_a = a, *sds_b = b;
 
 	if (sds_a->sds_dsobj < sds_b->sds_dsobj)
 		return (-1);
 	if (sds_a->sds_dsobj == sds_b->sds_dsobj)
 		return (0);
 	return (1);
 }
 
 static void
 scan_ds_queue_clear(dsl_scan_t *scn)
 {
 	void *cookie = NULL;
 	scan_ds_t *sds;
 	while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) {
 		kmem_free(sds, sizeof (*sds));
 	}
 }
 
 static boolean_t
 scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg)
 {
 	scan_ds_t srch, *sds;
 
 	srch.sds_dsobj = dsobj;
 	sds = avl_find(&scn->scn_queue, &srch, NULL);
 	if (sds != NULL && txg != NULL)
 		*txg = sds->sds_txg;
 	return (sds != NULL);
 }
 
 static void
 scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg)
 {
 	scan_ds_t *sds;
 	avl_index_t where;
 
 	sds = kmem_zalloc(sizeof (*sds), KM_SLEEP);
 	sds->sds_dsobj = dsobj;
 	sds->sds_txg = txg;
 
 	VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL);
 	avl_insert(&scn->scn_queue, sds, where);
 }
 
 static void
 scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj)
 {
 	scan_ds_t srch, *sds;
 
 	srch.sds_dsobj = dsobj;
 
 	sds = avl_find(&scn->scn_queue, &srch, NULL);
 	VERIFY(sds != NULL);
 	avl_remove(&scn->scn_queue, sds);
 	kmem_free(sds, sizeof (*sds));
 }
 
 static void
 scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	spa_t *spa = dp->dp_spa;
 	dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ?
 	    DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER;
 
 	ASSERT0(scn->scn_queues_pending);
 	ASSERT(scn->scn_phys.scn_queue_obj != 0);
 
 	VERIFY0(dmu_object_free(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, tx));
 	scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot,
 	    DMU_OT_NONE, 0, tx);
 	for (scan_ds_t *sds = avl_first(&scn->scn_queue);
 	    sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) {
 		VERIFY0(zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, sds->sds_dsobj,
 		    sds->sds_txg, tx));
 	}
 }
 
 /*
  * Computes the memory limit state that we're currently in. A sorted scan
  * needs quite a bit of memory to hold the sorting queue, so we need to
  * reasonably constrain the size so it doesn't impact overall system
  * performance. We compute two limits:
  * 1) Hard memory limit: if the amount of memory used by the sorting
  *	queues on a pool gets above this value, we stop the metadata
  *	scanning portion and start issuing the queued up and sorted
  *	I/Os to reduce memory usage.
  *	This limit is calculated as a fraction of physmem (by default 5%).
  *	We constrain the lower bound of the hard limit to an absolute
  *	minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain
  *	the upper bound to 5% of the total pool size - no chance we'll
  *	ever need that much memory, but just to keep the value in check.
  * 2) Soft memory limit: once we hit the hard memory limit, we start
  *	issuing I/O to reduce queue memory usage, but we don't want to
  *	completely empty out the queues, since we might be able to find I/Os
  *	that will fill in the gaps of our non-sequential IOs at some point
  *	in the future. So we stop the issuing of I/Os once the amount of
  *	memory used drops below the soft limit (at which point we stop issuing
  *	I/O and start scanning metadata again).
  *
  *	This limit is calculated by subtracting a fraction of the hard
  *	limit from the hard limit. By default this fraction is 5%, so
  *	the soft limit is 95% of the hard limit. We cap the size of the
  *	difference between the hard and soft limits at an absolute
  *	maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is
  *	sufficient to not cause too frequent switching between the
  *	metadata scan and I/O issue (even at 2k recordsize, 128 MiB's
  *	worth of queues is about 1.2 GiB of on-pool data, so scanning
  *	that should take at least a decent fraction of a second).
  */
 static boolean_t
 dsl_scan_should_clear(dsl_scan_t *scn)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 	vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
 	uint64_t alloc, mlim_hard, mlim_soft, mused;
 
 	alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 	alloc += metaslab_class_get_alloc(spa_special_class(spa));
 	alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
 
 	mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE,
 	    zfs_scan_mem_lim_min);
 	mlim_hard = MIN(mlim_hard, alloc / 20);
 	mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact,
 	    zfs_scan_mem_lim_soft_max);
 	mused = 0;
 	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
 		vdev_t *tvd = rvd->vdev_child[i];
 		dsl_scan_io_queue_t *queue;
 
 		mutex_enter(&tvd->vdev_scan_io_queue_lock);
 		queue = tvd->vdev_scan_io_queue;
 		if (queue != NULL) {
 			/*
 			 * # of extents in exts_by_addr = # in exts_by_size.
 			 * B-tree efficiency is ~75%, but can be as low as 50%.
 			 */
 			mused += zfs_btree_numnodes(&queue->q_exts_by_size) *
 			    ((sizeof (range_seg_gap_t) + sizeof (uint64_t)) *
 			    3 / 2) + queue->q_sio_memused;
 		}
 		mutex_exit(&tvd->vdev_scan_io_queue_lock);
 	}
 
 	dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused);
 
 	if (mused == 0)
 		ASSERT0(scn->scn_queues_pending);
 
 	/*
 	 * If we are above our hard limit, we need to clear out memory.
 	 * If we are below our soft limit, we need to accumulate sequential IOs.
 	 * Otherwise, we should keep doing whatever we are currently doing.
 	 */
 	if (mused >= mlim_hard)
 		return (B_TRUE);
 	else if (mused < mlim_soft)
 		return (B_FALSE);
 	else
 		return (scn->scn_clearing);
 }
 
 static boolean_t
 dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
 {
 	/* we never skip user/group accounting objects */
 	if (zb && (int64_t)zb->zb_object < 0)
 		return (B_FALSE);
 
 	if (scn->scn_suspending)
 		return (B_TRUE); /* we're already suspending */
 
 	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
 		return (B_FALSE); /* we're resuming */
 
 	/* We only know how to resume from level-0 and objset blocks. */
 	if (zb && (zb->zb_level != 0 && zb->zb_level != ZB_ROOT_LEVEL))
 		return (B_FALSE);
 
 	/*
 	 * We suspend if:
 	 *  - we have scanned for at least the minimum time (default 1 sec
 	 *    for scrub, 3 sec for resilver), and either we have sufficient
 	 *    dirty data that we are starting to write more quickly
 	 *    (default 30%), someone is explicitly waiting for this txg
 	 *    to complete, or we have used up all of the time in the txg
 	 *    timeout (default 5 sec).
 	 *  or
 	 *  - the spa is shutting down because this pool is being exported
 	 *    or the machine is rebooting.
 	 *  or
 	 *  - the scan queue has reached its memory use limit
 	 */
 	uint64_t curr_time_ns = gethrtime();
 	uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
 	uint64_t sync_time_ns = curr_time_ns -
 	    scn->scn_dp->dp_spa->spa_sync_starttime;
 	uint64_t dirty_min_bytes = zfs_dirty_data_max *
 	    zfs_vdev_async_write_active_min_dirty_percent / 100;
 	uint_t mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
 	    zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
 
 	if ((NSEC2MSEC(scan_time_ns) > mintime &&
 	    (scn->scn_dp->dp_dirty_total >= dirty_min_bytes ||
 	    txg_sync_waiting(scn->scn_dp) ||
 	    NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
 	    spa_shutting_down(scn->scn_dp->dp_spa) ||
 	    (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) {
 		if (zb && zb->zb_level == ZB_ROOT_LEVEL) {
 			dprintf("suspending at first available bookmark "
 			    "%llx/%llx/%llx/%llx\n",
 			    (longlong_t)zb->zb_objset,
 			    (longlong_t)zb->zb_object,
 			    (longlong_t)zb->zb_level,
 			    (longlong_t)zb->zb_blkid);
 			SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
 			    zb->zb_objset, 0, 0, 0);
 		} else if (zb != NULL) {
 			dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n",
 			    (longlong_t)zb->zb_objset,
 			    (longlong_t)zb->zb_object,
 			    (longlong_t)zb->zb_level,
 			    (longlong_t)zb->zb_blkid);
 			scn->scn_phys.scn_bookmark = *zb;
 		} else {
 #ifdef ZFS_DEBUG
 			dsl_scan_phys_t *scnp = &scn->scn_phys;
 			dprintf("suspending at at DDT bookmark "
 			    "%llx/%llx/%llx/%llx\n",
 			    (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
 			    (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
 			    (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
 			    (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
 #endif
 		}
 		scn->scn_suspending = B_TRUE;
 		return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 static boolean_t
 dsl_error_scrub_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
 {
 	/*
 	 * We suspend if:
 	 *  - we have scrubbed for at least the minimum time (default 1 sec
 	 *    for error scrub), someone is explicitly waiting for this txg
 	 *    to complete, or we have used up all of the time in the txg
 	 *    timeout (default 5 sec).
 	 *  or
 	 *  - the spa is shutting down because this pool is being exported
 	 *    or the machine is rebooting.
 	 */
 	uint64_t curr_time_ns = gethrtime();
 	uint64_t error_scrub_time_ns = curr_time_ns - scn->scn_sync_start_time;
 	uint64_t sync_time_ns = curr_time_ns -
 	    scn->scn_dp->dp_spa->spa_sync_starttime;
 	int mintime = zfs_scrub_min_time_ms;
 
 	if ((NSEC2MSEC(error_scrub_time_ns) > mintime &&
 	    (txg_sync_waiting(scn->scn_dp) ||
 	    NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
 	    spa_shutting_down(scn->scn_dp->dp_spa)) {
 		if (zb) {
 			dprintf("error scrub suspending at bookmark "
 			    "%llx/%llx/%llx/%llx\n",
 			    (longlong_t)zb->zb_objset,
 			    (longlong_t)zb->zb_object,
 			    (longlong_t)zb->zb_level,
 			    (longlong_t)zb->zb_blkid);
 		}
 		return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 typedef struct zil_scan_arg {
 	dsl_pool_t	*zsa_dp;
 	zil_header_t	*zsa_zh;
 } zil_scan_arg_t;
 
 static int
 dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
     uint64_t claim_txg)
 {
 	(void) zilog;
 	zil_scan_arg_t *zsa = arg;
 	dsl_pool_t *dp = zsa->zsa_dp;
 	dsl_scan_t *scn = dp->dp_scan;
 	zil_header_t *zh = zsa->zsa_zh;
 	zbookmark_phys_t zb;
 
 	ASSERT(!BP_IS_REDACTED(bp));
-	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+	if (BP_IS_HOLE(bp) ||
+	    BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
 		return (0);
 
 	/*
 	 * One block ("stubby") can be allocated a long time ago; we
 	 * want to visit that one because it has been allocated
 	 * (on-disk) even if it hasn't been claimed (even though for
 	 * scrub there's nothing to do to it).
 	 */
-	if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa))
+	if (claim_txg == 0 &&
+	    BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(dp->dp_spa))
 		return (0);
 
 	SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
 	VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
 	return (0);
 }
 
 static int
 dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
     uint64_t claim_txg)
 {
 	(void) zilog;
 	if (lrc->lrc_txtype == TX_WRITE) {
 		zil_scan_arg_t *zsa = arg;
 		dsl_pool_t *dp = zsa->zsa_dp;
 		dsl_scan_t *scn = dp->dp_scan;
 		zil_header_t *zh = zsa->zsa_zh;
 		const lr_write_t *lr = (const lr_write_t *)lrc;
 		const blkptr_t *bp = &lr->lr_blkptr;
 		zbookmark_phys_t zb;
 
 		ASSERT(!BP_IS_REDACTED(bp));
 		if (BP_IS_HOLE(bp) ||
-		    bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+		    BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
 			return (0);
 
 		/*
 		 * birth can be < claim_txg if this record's txg is
 		 * already txg sync'ed (but this log block contains
 		 * other records that are not synced)
 		 */
-		if (claim_txg == 0 || bp->blk_birth < claim_txg)
+		if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg)
 			return (0);
 
 		ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
 		SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 		    lr->lr_foid, ZB_ZIL_LEVEL,
 		    lr->lr_offset / BP_GET_LSIZE(bp));
 
 		VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
 	}
 	return (0);
 }
 
 static void
 dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
 {
 	uint64_t claim_txg = zh->zh_claim_txg;
 	zil_scan_arg_t zsa = { dp, zh };
 	zilog_t *zilog;
 
 	ASSERT(spa_writeable(dp->dp_spa));
 
 	/*
 	 * We only want to visit blocks that have been claimed but not yet
 	 * replayed (or, in read-only mode, blocks that *would* be claimed).
 	 */
 	if (claim_txg == 0)
 		return;
 
 	zilog = zil_alloc(dp->dp_meta_objset, zh);
 
 	(void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
 	    claim_txg, B_FALSE);
 
 	zil_free(zilog);
 }
 
 /*
  * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea
  * here is to sort the AVL tree by the order each block will be needed.
  */
 static int
 scan_prefetch_queue_compare(const void *a, const void *b)
 {
 	const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b;
 	const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc;
 	const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc;
 
 	return (zbookmark_compare(spc_a->spc_datablkszsec,
 	    spc_a->spc_indblkshift, spc_b->spc_datablkszsec,
 	    spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb));
 }
 
 static void
 scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, const void *tag)
 {
 	if (zfs_refcount_remove(&spc->spc_refcnt, tag) == 0) {
 		zfs_refcount_destroy(&spc->spc_refcnt);
 		kmem_free(spc, sizeof (scan_prefetch_ctx_t));
 	}
 }
 
 static scan_prefetch_ctx_t *
 scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, const void *tag)
 {
 	scan_prefetch_ctx_t *spc;
 
 	spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP);
 	zfs_refcount_create(&spc->spc_refcnt);
 	zfs_refcount_add(&spc->spc_refcnt, tag);
 	spc->spc_scn = scn;
 	if (dnp != NULL) {
 		spc->spc_datablkszsec = dnp->dn_datablkszsec;
 		spc->spc_indblkshift = dnp->dn_indblkshift;
 		spc->spc_root = B_FALSE;
 	} else {
 		spc->spc_datablkszsec = 0;
 		spc->spc_indblkshift = 0;
 		spc->spc_root = B_TRUE;
 	}
 
 	return (spc);
 }
 
 static void
 scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, const void *tag)
 {
 	zfs_refcount_add(&spc->spc_refcnt, tag);
 }
 
 static void
 scan_ds_prefetch_queue_clear(dsl_scan_t *scn)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 	void *cookie = NULL;
 	scan_prefetch_issue_ctx_t *spic = NULL;
 
 	mutex_enter(&spa->spa_scrub_lock);
 	while ((spic = avl_destroy_nodes(&scn->scn_prefetch_queue,
 	    &cookie)) != NULL) {
 		scan_prefetch_ctx_rele(spic->spic_spc, scn);
 		kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
 	}
 	mutex_exit(&spa->spa_scrub_lock);
 }
 
 static boolean_t
 dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc,
     const zbookmark_phys_t *zb)
 {
 	zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark;
 	dnode_phys_t tmp_dnp;
 	dnode_phys_t *dnp = (spc->spc_root) ? NULL : &tmp_dnp;
 
 	if (zb->zb_objset != last_zb->zb_objset)
 		return (B_TRUE);
 	if ((int64_t)zb->zb_object < 0)
 		return (B_FALSE);
 
 	tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec;
 	tmp_dnp.dn_indblkshift = spc->spc_indblkshift;
 
 	if (zbookmark_subtree_completed(dnp, zb, last_zb))
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 static void
 dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb)
 {
 	avl_index_t idx;
 	dsl_scan_t *scn = spc->spc_scn;
 	spa_t *spa = scn->scn_dp->dp_spa;
 	scan_prefetch_issue_ctx_t *spic;
 
 	if (zfs_no_scrub_prefetch || BP_IS_REDACTED(bp))
 		return;
 
-	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg ||
+	if (BP_IS_HOLE(bp) ||
+	    BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg ||
 	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
 	    BP_GET_TYPE(bp) != DMU_OT_OBJSET))
 		return;
 
 	if (dsl_scan_check_prefetch_resume(spc, zb))
 		return;
 
 	scan_prefetch_ctx_add_ref(spc, scn);
 	spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP);
 	spic->spic_spc = spc;
 	spic->spic_bp = *bp;
 	spic->spic_zb = *zb;
 
 	/*
 	 * Add the IO to the queue of blocks to prefetch. This allows us to
 	 * prioritize blocks that we will need first for the main traversal
 	 * thread.
 	 */
 	mutex_enter(&spa->spa_scrub_lock);
 	if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) {
 		/* this block is already queued for prefetch */
 		kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
 		scan_prefetch_ctx_rele(spc, scn);
 		mutex_exit(&spa->spa_scrub_lock);
 		return;
 	}
 
 	avl_insert(&scn->scn_prefetch_queue, spic, idx);
 	cv_broadcast(&spa->spa_scrub_io_cv);
 	mutex_exit(&spa->spa_scrub_lock);
 }
 
 static void
 dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp,
     uint64_t objset, uint64_t object)
 {
 	int i;
 	zbookmark_phys_t zb;
 	scan_prefetch_ctx_t *spc;
 
 	if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
 		return;
 
 	SET_BOOKMARK(&zb, objset, object, 0, 0);
 
 	spc = scan_prefetch_ctx_create(scn, dnp, FTAG);
 
 	for (i = 0; i < dnp->dn_nblkptr; i++) {
 		zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]);
 		zb.zb_blkid = i;
 		dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb);
 	}
 
 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 		zb.zb_level = 0;
 		zb.zb_blkid = DMU_SPILL_BLKID;
 		dsl_scan_prefetch(spc, DN_SPILL_BLKPTR(dnp), &zb);
 	}
 
 	scan_prefetch_ctx_rele(spc, FTAG);
 }
 
 static void
 dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
     arc_buf_t *buf, void *private)
 {
 	(void) zio;
 	scan_prefetch_ctx_t *spc = private;
 	dsl_scan_t *scn = spc->spc_scn;
 	spa_t *spa = scn->scn_dp->dp_spa;
 
 	/* broadcast that the IO has completed for rate limiting purposes */
 	mutex_enter(&spa->spa_scrub_lock);
 	ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
 	spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
 	cv_broadcast(&spa->spa_scrub_io_cv);
 	mutex_exit(&spa->spa_scrub_lock);
 
 	/* if there was an error or we are done prefetching, just cleanup */
 	if (buf == NULL || scn->scn_prefetch_stop)
 		goto out;
 
 	if (BP_GET_LEVEL(bp) > 0) {
 		int i;
 		blkptr_t *cbp;
 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 		zbookmark_phys_t czb;
 
 		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 			    zb->zb_level - 1, zb->zb_blkid * epb + i);
 			dsl_scan_prefetch(spc, cbp, &czb);
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
 		dnode_phys_t *cdnp;
 		int i;
 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 
 		for (i = 0, cdnp = buf->b_data; i < epb;
 		    i += cdnp->dn_extra_slots + 1,
 		    cdnp += cdnp->dn_extra_slots + 1) {
 			dsl_scan_prefetch_dnode(scn, cdnp,
 			    zb->zb_objset, zb->zb_blkid * epb + i);
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 		objset_phys_t *osp = buf->b_data;
 
 		dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode,
 		    zb->zb_objset, DMU_META_DNODE_OBJECT);
 
 		if (OBJSET_BUF_HAS_USERUSED(buf)) {
 			if (OBJSET_BUF_HAS_PROJECTUSED(buf)) {
 				dsl_scan_prefetch_dnode(scn,
 				    &osp->os_projectused_dnode, zb->zb_objset,
 				    DMU_PROJECTUSED_OBJECT);
 			}
 			dsl_scan_prefetch_dnode(scn,
 			    &osp->os_groupused_dnode, zb->zb_objset,
 			    DMU_GROUPUSED_OBJECT);
 			dsl_scan_prefetch_dnode(scn,
 			    &osp->os_userused_dnode, zb->zb_objset,
 			    DMU_USERUSED_OBJECT);
 		}
 	}
 
 out:
 	if (buf != NULL)
 		arc_buf_destroy(buf, private);
 	scan_prefetch_ctx_rele(spc, scn);
 }
 
 static void
 dsl_scan_prefetch_thread(void *arg)
 {
 	dsl_scan_t *scn = arg;
 	spa_t *spa = scn->scn_dp->dp_spa;
 	scan_prefetch_issue_ctx_t *spic;
 
 	/* loop until we are told to stop */
 	while (!scn->scn_prefetch_stop) {
 		arc_flags_t flags = ARC_FLAG_NOWAIT |
 		    ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH;
 		int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
 
 		mutex_enter(&spa->spa_scrub_lock);
 
 		/*
 		 * Wait until we have an IO to issue and are not above our
 		 * maximum in flight limit.
 		 */
 		while (!scn->scn_prefetch_stop &&
 		    (avl_numnodes(&scn->scn_prefetch_queue) == 0 ||
 		    spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) {
 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 		}
 
 		/* recheck if we should stop since we waited for the cv */
 		if (scn->scn_prefetch_stop) {
 			mutex_exit(&spa->spa_scrub_lock);
 			break;
 		}
 
 		/* remove the prefetch IO from the tree */
 		spic = avl_first(&scn->scn_prefetch_queue);
 		spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp);
 		avl_remove(&scn->scn_prefetch_queue, spic);
 
 		mutex_exit(&spa->spa_scrub_lock);
 
 		if (BP_IS_PROTECTED(&spic->spic_bp)) {
 			ASSERT(BP_GET_TYPE(&spic->spic_bp) == DMU_OT_DNODE ||
 			    BP_GET_TYPE(&spic->spic_bp) == DMU_OT_OBJSET);
 			ASSERT3U(BP_GET_LEVEL(&spic->spic_bp), ==, 0);
 			zio_flags |= ZIO_FLAG_RAW;
 		}
 
 		/* We don't need data L1 buffer since we do not prefetch L0. */
 		blkptr_t *bp = &spic->spic_bp;
 		if (BP_GET_LEVEL(bp) == 1 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
 		    BP_GET_TYPE(bp) != DMU_OT_OBJSET)
 			flags |= ARC_FLAG_NO_BUF;
 
 		/* issue the prefetch asynchronously */
 		(void) arc_read(scn->scn_zio_root, spa, bp,
 		    dsl_scan_prefetch_cb, spic->spic_spc, ZIO_PRIORITY_SCRUB,
 		    zio_flags, &flags, &spic->spic_zb);
 
 		kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
 	}
 
 	ASSERT(scn->scn_prefetch_stop);
 
 	/* free any prefetches we didn't get to complete */
 	mutex_enter(&spa->spa_scrub_lock);
 	while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) {
 		avl_remove(&scn->scn_prefetch_queue, spic);
 		scan_prefetch_ctx_rele(spic->spic_spc, scn);
 		kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
 	}
 	ASSERT0(avl_numnodes(&scn->scn_prefetch_queue));
 	mutex_exit(&spa->spa_scrub_lock);
 }
 
 static boolean_t
 dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
     const zbookmark_phys_t *zb)
 {
 	/*
 	 * We never skip over user/group accounting objects (obj<0)
 	 */
 	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
 	    (int64_t)zb->zb_object >= 0) {
 		/*
 		 * If we already visited this bp & everything below (in
 		 * a prior txg sync), don't bother doing it again.
 		 */
 		if (zbookmark_subtree_completed(dnp, zb,
 		    &scn->scn_phys.scn_bookmark))
 			return (B_TRUE);
 
 		/*
 		 * If we found the block we're trying to resume from, or
 		 * we went past it, zero it out to indicate that it's OK
 		 * to start checking for suspending again.
 		 */
 		if (zbookmark_subtree_tbd(dnp, zb,
 		    &scn->scn_phys.scn_bookmark)) {
 			dprintf("resuming at %llx/%llx/%llx/%llx\n",
 			    (longlong_t)zb->zb_objset,
 			    (longlong_t)zb->zb_object,
 			    (longlong_t)zb->zb_level,
 			    (longlong_t)zb->zb_blkid);
 			memset(&scn->scn_phys.scn_bookmark, 0, sizeof (*zb));
 		}
 	}
 	return (B_FALSE);
 }
 
 static void dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb,
     dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
     dmu_objset_type_t ostype, dmu_tx_t *tx);
 inline __attribute__((always_inline)) static void dsl_scan_visitdnode(
     dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype,
     dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
 
 /*
  * Return nonzero on i/o error.
  * Return new buf to write out in *bufp.
  */
 inline __attribute__((always_inline)) static int
 dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
     dnode_phys_t *dnp, const blkptr_t *bp,
     const zbookmark_phys_t *zb, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	spa_t *spa = dp->dp_spa;
 	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
 	int err;
 
 	ASSERT(!BP_IS_REDACTED(bp));
 
 	/*
 	 * There is an unlikely case of encountering dnodes with contradicting
 	 * dn_bonuslen and DNODE_FLAG_SPILL_BLKPTR flag before in files created
 	 * or modified before commit 4254acb was merged. As it is not possible
 	 * to know which of the two is correct, report an error.
 	 */
 	if (dnp != NULL &&
 	    dnp->dn_bonuslen > DN_MAX_BONUS_LEN(dnp)) {
 		scn->scn_phys.scn_errors++;
-		spa_log_error(spa, zb, &bp->blk_birth);
+		spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (BP_GET_LEVEL(bp) > 0) {
 		arc_flags_t flags = ARC_FLAG_WAIT;
 		int i;
 		blkptr_t *cbp;
 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 		arc_buf_t *buf;
 
 		err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
 		}
 		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
 			zbookmark_phys_t czb;
 
 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 			    zb->zb_level - 1,
 			    zb->zb_blkid * epb + i);
 			dsl_scan_visitbp(cbp, &czb, dnp,
 			    ds, scn, ostype, tx);
 		}
 		arc_buf_destroy(buf, &buf);
 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
 		arc_flags_t flags = ARC_FLAG_WAIT;
 		dnode_phys_t *cdnp;
 		int i;
 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 		arc_buf_t *buf;
 
 		if (BP_IS_PROTECTED(bp)) {
 			ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
 			zio_flags |= ZIO_FLAG_RAW;
 		}
 
 		err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
 		}
 		for (i = 0, cdnp = buf->b_data; i < epb;
 		    i += cdnp->dn_extra_slots + 1,
 		    cdnp += cdnp->dn_extra_slots + 1) {
 			dsl_scan_visitdnode(scn, ds, ostype,
 			    cdnp, zb->zb_blkid * epb + i, tx);
 		}
 
 		arc_buf_destroy(buf, &buf);
 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 		arc_flags_t flags = ARC_FLAG_WAIT;
 		objset_phys_t *osp;
 		arc_buf_t *buf;
 
 		err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
 		}
 
 		osp = buf->b_data;
 
 		dsl_scan_visitdnode(scn, ds, osp->os_type,
 		    &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx);
 
 		if (OBJSET_BUF_HAS_USERUSED(buf)) {
 			/*
 			 * We also always visit user/group/project accounting
 			 * objects, and never skip them, even if we are
 			 * suspending. This is necessary so that the
 			 * space deltas from this txg get integrated.
 			 */
 			if (OBJSET_BUF_HAS_PROJECTUSED(buf))
 				dsl_scan_visitdnode(scn, ds, osp->os_type,
 				    &osp->os_projectused_dnode,
 				    DMU_PROJECTUSED_OBJECT, tx);
 			dsl_scan_visitdnode(scn, ds, osp->os_type,
 			    &osp->os_groupused_dnode,
 			    DMU_GROUPUSED_OBJECT, tx);
 			dsl_scan_visitdnode(scn, ds, osp->os_type,
 			    &osp->os_userused_dnode,
 			    DMU_USERUSED_OBJECT, tx);
 		}
 		arc_buf_destroy(buf, &buf);
 	} else if (!zfs_blkptr_verify(spa, bp,
 	    BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
 		/*
 		 * Sanity check the block pointer contents, this is handled
 		 * by arc_read() for the cases above.
 		 */
 		scn->scn_phys.scn_errors++;
-		spa_log_error(spa, zb, &bp->blk_birth);
+		spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
 		return (SET_ERROR(EINVAL));
 	}
 
 	return (0);
 }
 
 inline __attribute__((always_inline)) static void
 dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
     dmu_objset_type_t ostype, dnode_phys_t *dnp,
     uint64_t object, dmu_tx_t *tx)
 {
 	int j;
 
 	for (j = 0; j < dnp->dn_nblkptr; j++) {
 		zbookmark_phys_t czb;
 
 		SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
 		    dnp->dn_nlevels - 1, j);
 		dsl_scan_visitbp(&dnp->dn_blkptr[j],
 		    &czb, dnp, ds, scn, ostype, tx);
 	}
 
 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 		zbookmark_phys_t czb;
 		SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
 		    0, DMU_SPILL_BLKID);
 		dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp),
 		    &czb, dnp, ds, scn, ostype, tx);
 	}
 }
 
 /*
  * The arguments are in this order because mdb can only print the
  * first 5; we want them to be useful.
  */
 static void
 dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb,
     dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
     dmu_objset_type_t ostype, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 
 	if (dsl_scan_check_suspend(scn, zb))
 		return;
 
 	if (dsl_scan_check_resume(scn, dnp, zb))
 		return;
 
 	scn->scn_visited_this_txg++;
 
 	if (BP_IS_HOLE(bp)) {
 		scn->scn_holes_this_txg++;
 		return;
 	}
 
 	if (BP_IS_REDACTED(bp)) {
 		ASSERT(dsl_dataset_feature_is_active(ds,
 		    SPA_FEATURE_REDACTED_DATASETS));
 		return;
 	}
 
 	/*
 	 * Check if this block contradicts any filesystem flags.
 	 */
 	spa_feature_t f = SPA_FEATURE_LARGE_BLOCKS;
 	if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE)
 		ASSERT(dsl_dataset_feature_is_active(ds, f));
 
 	f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
 	if (f != SPA_FEATURE_NONE)
 		ASSERT(dsl_dataset_feature_is_active(ds, f));
 
 	f = zio_compress_to_feature(BP_GET_COMPRESS(bp));
 	if (f != SPA_FEATURE_NONE)
 		ASSERT(dsl_dataset_feature_is_active(ds, f));
 
-	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) {
+	if (BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) {
 		scn->scn_lt_min_this_txg++;
 		return;
 	}
 
 	if (dsl_scan_recurse(scn, ds, ostype, dnp, bp, zb, tx) != 0)
 		return;
 
 	/*
 	 * If dsl_scan_ddt() has already visited this block, it will have
 	 * already done any translations or scrubbing, so don't call the
 	 * callback again.
 	 */
 	if (ddt_class_contains(dp->dp_spa,
 	    scn->scn_phys.scn_ddt_class_max, bp)) {
 		scn->scn_ddt_contained_this_txg++;
 		return;
 	}
 
 	/*
 	 * If this block is from the future (after cur_max_txg), then we
 	 * are doing this on behalf of a deleted snapshot, and we will
 	 * revisit the future block on the next pass of this dataset.
 	 * Don't scan it now unless we need to because something
 	 * under it was modified.
 	 */
-	if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
+	if (BP_GET_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
 		scn->scn_gt_max_this_txg++;
 		return;
 	}
 
 	scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
 }
 
 static void
 dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
     dmu_tx_t *tx)
 {
 	zbookmark_phys_t zb;
 	scan_prefetch_ctx_t *spc;
 
 	SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 
 	if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) {
 		SET_BOOKMARK(&scn->scn_prefetch_bookmark,
 		    zb.zb_objset, 0, 0, 0);
 	} else {
 		scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark;
 	}
 
 	scn->scn_objsets_visited_this_txg++;
 
 	spc = scan_prefetch_ctx_create(scn, NULL, FTAG);
 	dsl_scan_prefetch(spc, bp, &zb);
 	scan_prefetch_ctx_rele(spc, FTAG);
 
 	dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx);
 
 	dprintf_ds(ds, "finished scan%s", "");
 }
 
 static void
 ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys)
 {
 	if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) {
 		if (ds->ds_is_snapshot) {
 			/*
 			 * Note:
 			 *  - scn_cur_{min,max}_txg stays the same.
 			 *  - Setting the flag is not really necessary if
 			 *    scn_cur_max_txg == scn_max_txg, because there
 			 *    is nothing after this snapshot that we care
 			 *    about.  However, we set it anyway and then
 			 *    ignore it when we retraverse it in
 			 *    dsl_scan_visitds().
 			 */
 			scn_phys->scn_bookmark.zb_objset =
 			    dsl_dataset_phys(ds)->ds_next_snap_obj;
 			zfs_dbgmsg("destroying ds %llu on %s; currently "
 			    "traversing; reset zb_objset to %llu",
 			    (u_longlong_t)ds->ds_object,
 			    ds->ds_dir->dd_pool->dp_spa->spa_name,
 			    (u_longlong_t)dsl_dataset_phys(ds)->
 			    ds_next_snap_obj);
 			scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN;
 		} else {
 			SET_BOOKMARK(&scn_phys->scn_bookmark,
 			    ZB_DESTROYED_OBJSET, 0, 0, 0);
 			zfs_dbgmsg("destroying ds %llu on %s; currently "
 			    "traversing; reset bookmark to -1,0,0,0",
 			    (u_longlong_t)ds->ds_object,
 			    ds->ds_dir->dd_pool->dp_spa->spa_name);
 		}
 	}
 }
 
 /*
  * Invoked when a dataset is destroyed. We need to make sure that:
  *
  * 1) If it is the dataset that was currently being scanned, we write
  *	a new dsl_scan_phys_t and marking the objset reference in it
  *	as destroyed.
  * 2) Remove it from the work queue, if it was present.
  *
  * If the dataset was actually a snapshot, instead of marking the dataset
  * as destroyed, we instead substitute the next snapshot in line.
  */
 void
 dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 	uint64_t mintxg;
 
 	if (!dsl_scan_is_running(scn))
 		return;
 
 	ds_destroyed_scn_phys(ds, &scn->scn_phys);
 	ds_destroyed_scn_phys(ds, &scn->scn_phys_cached);
 
 	if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
 		scan_ds_queue_remove(scn, ds->ds_object);
 		if (ds->ds_is_snapshot)
 			scan_ds_queue_insert(scn,
 			    dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg);
 	}
 
 	if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
 	    ds->ds_object, &mintxg) == 0) {
 		ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
 		if (ds->ds_is_snapshot) {
 			/*
 			 * We keep the same mintxg; it could be >
 			 * ds_creation_txg if the previous snapshot was
 			 * deleted too.
 			 */
 			VERIFY(zap_add_int_key(dp->dp_meta_objset,
 			    scn->scn_phys.scn_queue_obj,
 			    dsl_dataset_phys(ds)->ds_next_snap_obj,
 			    mintxg, tx) == 0);
 			zfs_dbgmsg("destroying ds %llu on %s; in queue; "
 			    "replacing with %llu",
 			    (u_longlong_t)ds->ds_object,
 			    dp->dp_spa->spa_name,
 			    (u_longlong_t)dsl_dataset_phys(ds)->
 			    ds_next_snap_obj);
 		} else {
 			zfs_dbgmsg("destroying ds %llu on %s; in queue; "
 			    "removing",
 			    (u_longlong_t)ds->ds_object,
 			    dp->dp_spa->spa_name);
 		}
 	}
 
 	/*
 	 * dsl_scan_sync() should be called after this, and should sync
 	 * out our changed state, but just to be safe, do it here.
 	 */
 	dsl_scan_sync_state(scn, tx, SYNC_CACHED);
 }
 
 static void
 ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark)
 {
 	if (scn_bookmark->zb_objset == ds->ds_object) {
 		scn_bookmark->zb_objset =
 		    dsl_dataset_phys(ds)->ds_prev_snap_obj;
 		zfs_dbgmsg("snapshotting ds %llu on %s; currently traversing; "
 		    "reset zb_objset to %llu",
 		    (u_longlong_t)ds->ds_object,
 		    ds->ds_dir->dd_pool->dp_spa->spa_name,
 		    (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
 	}
 }
 
 /*
  * Called when a dataset is snapshotted. If we were currently traversing
  * this snapshot, we reset our bookmark to point at the newly created
  * snapshot. We also modify our work queue to remove the old snapshot and
  * replace with the new one.
  */
 void
 dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 	uint64_t mintxg;
 
 	if (!dsl_scan_is_running(scn))
 		return;
 
 	ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
 
 	ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark);
 	ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark);
 
 	if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
 		scan_ds_queue_remove(scn, ds->ds_object);
 		scan_ds_queue_insert(scn,
 		    dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg);
 	}
 
 	if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
 	    ds->ds_object, &mintxg) == 0) {
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
 		VERIFY(zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj,
 		    dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0);
 		zfs_dbgmsg("snapshotting ds %llu on %s; in queue; "
 		    "replacing with %llu",
 		    (u_longlong_t)ds->ds_object,
 		    dp->dp_spa->spa_name,
 		    (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
 	}
 
 	dsl_scan_sync_state(scn, tx, SYNC_CACHED);
 }
 
 static void
 ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2,
     zbookmark_phys_t *scn_bookmark)
 {
 	if (scn_bookmark->zb_objset == ds1->ds_object) {
 		scn_bookmark->zb_objset = ds2->ds_object;
 		zfs_dbgmsg("clone_swap ds %llu on %s; currently traversing; "
 		    "reset zb_objset to %llu",
 		    (u_longlong_t)ds1->ds_object,
 		    ds1->ds_dir->dd_pool->dp_spa->spa_name,
 		    (u_longlong_t)ds2->ds_object);
 	} else if (scn_bookmark->zb_objset == ds2->ds_object) {
 		scn_bookmark->zb_objset = ds1->ds_object;
 		zfs_dbgmsg("clone_swap ds %llu on %s; currently traversing; "
 		    "reset zb_objset to %llu",
 		    (u_longlong_t)ds2->ds_object,
 		    ds2->ds_dir->dd_pool->dp_spa->spa_name,
 		    (u_longlong_t)ds1->ds_object);
 	}
 }
 
 /*
  * Called when an origin dataset and its clone are swapped.  If we were
  * currently traversing the dataset, we need to switch to traversing the
  * newly promoted clone.
  */
 void
 dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 	uint64_t mintxg1, mintxg2;
 	boolean_t ds1_queued, ds2_queued;
 
 	if (!dsl_scan_is_running(scn))
 		return;
 
 	ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark);
 	ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark);
 
 	/*
 	 * Handle the in-memory scan queue.
 	 */
 	ds1_queued = scan_ds_queue_contains(scn, ds1->ds_object, &mintxg1);
 	ds2_queued = scan_ds_queue_contains(scn, ds2->ds_object, &mintxg2);
 
 	/* Sanity checking. */
 	if (ds1_queued) {
 		ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
 		ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
 	}
 	if (ds2_queued) {
 		ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
 		ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
 	}
 
 	if (ds1_queued && ds2_queued) {
 		/*
 		 * If both are queued, we don't need to do anything.
 		 * The swapping code below would not handle this case correctly,
 		 * since we can't insert ds2 if it is already there. That's
 		 * because scan_ds_queue_insert() prohibits a duplicate insert
 		 * and panics.
 		 */
 	} else if (ds1_queued) {
 		scan_ds_queue_remove(scn, ds1->ds_object);
 		scan_ds_queue_insert(scn, ds2->ds_object, mintxg1);
 	} else if (ds2_queued) {
 		scan_ds_queue_remove(scn, ds2->ds_object);
 		scan_ds_queue_insert(scn, ds1->ds_object, mintxg2);
 	}
 
 	/*
 	 * Handle the on-disk scan queue.
 	 * The on-disk state is an out-of-date version of the in-memory state,
 	 * so the in-memory and on-disk values for ds1_queued and ds2_queued may
 	 * be different. Therefore we need to apply the swap logic to the
 	 * on-disk state independently of the in-memory state.
 	 */
 	ds1_queued = zap_lookup_int_key(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg1) == 0;
 	ds2_queued = zap_lookup_int_key(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg2) == 0;
 
 	/* Sanity checking. */
 	if (ds1_queued) {
 		ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
 		ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
 	}
 	if (ds2_queued) {
 		ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
 		ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
 	}
 
 	if (ds1_queued && ds2_queued) {
 		/*
 		 * If both are queued, we don't need to do anything.
 		 * Alternatively, we could check for EEXIST from
 		 * zap_add_int_key() and back out to the original state, but
 		 * that would be more work than checking for this case upfront.
 		 */
 	} else if (ds1_queued) {
 		VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
 		VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg1, tx));
 		zfs_dbgmsg("clone_swap ds %llu on %s; in queue; "
 		    "replacing with %llu",
 		    (u_longlong_t)ds1->ds_object,
 		    dp->dp_spa->spa_name,
 		    (u_longlong_t)ds2->ds_object);
 	} else if (ds2_queued) {
 		VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
 		VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg2, tx));
 		zfs_dbgmsg("clone_swap ds %llu on %s; in queue; "
 		    "replacing with %llu",
 		    (u_longlong_t)ds2->ds_object,
 		    dp->dp_spa->spa_name,
 		    (u_longlong_t)ds1->ds_object);
 	}
 
 	dsl_scan_sync_state(scn, tx, SYNC_CACHED);
 }
 
 static int
 enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 {
 	uint64_t originobj = *(uint64_t *)arg;
 	dsl_dataset_t *ds;
 	int err;
 	dsl_scan_t *scn = dp->dp_scan;
 
 	if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj)
 		return (0);
 
 	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
 	if (err)
 		return (err);
 
 	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) {
 		dsl_dataset_t *prev;
 		err = dsl_dataset_hold_obj(dp,
 		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
 
 		dsl_dataset_rele(ds, FTAG);
 		if (err)
 			return (err);
 		ds = prev;
 	}
 	scan_ds_queue_insert(scn, ds->ds_object,
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg);
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 static void
 dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	dsl_dataset_t *ds;
 
 	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 
 	if (scn->scn_phys.scn_cur_min_txg >=
 	    scn->scn_phys.scn_max_txg) {
 		/*
 		 * This can happen if this snapshot was created after the
 		 * scan started, and we already completed a previous snapshot
 		 * that was created after the scan started.  This snapshot
 		 * only references blocks with:
 		 *
 		 *	birth < our ds_creation_txg
 		 *	cur_min_txg is no less than ds_creation_txg.
 		 *	We have already visited these blocks.
 		 * or
 		 *	birth > scn_max_txg
 		 *	The scan requested not to visit these blocks.
 		 *
 		 * Subsequent snapshots (and clones) can reference our
 		 * blocks, or blocks with even higher birth times.
 		 * Therefore we do not need to visit them either,
 		 * so we do not add them to the work queue.
 		 *
 		 * Note that checking for cur_min_txg >= cur_max_txg
 		 * is not sufficient, because in that case we may need to
 		 * visit subsequent snapshots.  This happens when min_txg > 0,
 		 * which raises cur_min_txg.  In this case we will visit
 		 * this dataset but skip all of its blocks, because the
 		 * rootbp's birth time is < cur_min_txg.  Then we will
 		 * add the next snapshots/clones to the work queue.
 		 */
 		char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
 		dsl_dataset_name(ds, dsname);
 		zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because "
 		    "cur_min_txg (%llu) >= max_txg (%llu)",
 		    (longlong_t)dsobj, dsname,
 		    (longlong_t)scn->scn_phys.scn_cur_min_txg,
 		    (longlong_t)scn->scn_phys.scn_max_txg);
 		kmem_free(dsname, MAXNAMELEN);
 
 		goto out;
 	}
 
 	/*
 	 * Only the ZIL in the head (non-snapshot) is valid. Even though
 	 * snapshots can have ZIL block pointers (which may be the same
 	 * BP as in the head), they must be ignored. In addition, $ORIGIN
 	 * doesn't have a objset (i.e. its ds_bp is a hole) so we don't
 	 * need to look for a ZIL in it either. So we traverse the ZIL here,
 	 * rather than in scan_recurse(), because the regular snapshot
 	 * block-sharing rules don't apply to it.
 	 */
 	if (!dsl_dataset_is_snapshot(ds) &&
 	    (dp->dp_origin_snap == NULL ||
 	    ds->ds_dir != dp->dp_origin_snap->ds_dir)) {
 		objset_t *os;
 		if (dmu_objset_from_ds(ds, &os) != 0) {
 			goto out;
 		}
 		dsl_scan_zil(dp, &os->os_zil_header);
 	}
 
 	/*
 	 * Iterate over the bps in this ds.
 	 */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 	dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx);
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 
 	char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
 	dsl_dataset_name(ds, dsname);
 	zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
 	    "suspending=%u",
 	    (longlong_t)dsobj, dsname,
 	    (longlong_t)scn->scn_phys.scn_cur_min_txg,
 	    (longlong_t)scn->scn_phys.scn_cur_max_txg,
 	    (int)scn->scn_suspending);
 	kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
 
 	if (scn->scn_suspending)
 		goto out;
 
 	/*
 	 * We've finished this pass over this dataset.
 	 */
 
 	/*
 	 * If we did not completely visit this dataset, do another pass.
 	 */
 	if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
 		zfs_dbgmsg("incomplete pass on %s; visiting again",
 		    dp->dp_spa->spa_name);
 		scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
 		scan_ds_queue_insert(scn, ds->ds_object,
 		    scn->scn_phys.scn_cur_max_txg);
 		goto out;
 	}
 
 	/*
 	 * Add descendant datasets to work queue.
 	 */
 	if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
 		scan_ds_queue_insert(scn,
 		    dsl_dataset_phys(ds)->ds_next_snap_obj,
 		    dsl_dataset_phys(ds)->ds_creation_txg);
 	}
 	if (dsl_dataset_phys(ds)->ds_num_children > 1) {
 		boolean_t usenext = B_FALSE;
 		if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
 			uint64_t count;
 			/*
 			 * A bug in a previous version of the code could
 			 * cause upgrade_clones_cb() to not set
 			 * ds_next_snap_obj when it should, leading to a
 			 * missing entry.  Therefore we can only use the
 			 * next_clones_obj when its count is correct.
 			 */
 			int err = zap_count(dp->dp_meta_objset,
 			    dsl_dataset_phys(ds)->ds_next_clones_obj, &count);
 			if (err == 0 &&
 			    count == dsl_dataset_phys(ds)->ds_num_children - 1)
 				usenext = B_TRUE;
 		}
 
 		if (usenext) {
 			zap_cursor_t zc;
 			zap_attribute_t za;
 			for (zap_cursor_init(&zc, dp->dp_meta_objset,
 			    dsl_dataset_phys(ds)->ds_next_clones_obj);
 			    zap_cursor_retrieve(&zc, &za) == 0;
 			    (void) zap_cursor_advance(&zc)) {
 				scan_ds_queue_insert(scn,
 				    zfs_strtonum(za.za_name, NULL),
 				    dsl_dataset_phys(ds)->ds_creation_txg);
 			}
 			zap_cursor_fini(&zc);
 		} else {
 			VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 			    enqueue_clones_cb, &ds->ds_object,
 			    DS_FIND_CHILDREN));
 		}
 	}
 
 out:
 	dsl_dataset_rele(ds, FTAG);
 }
 
 static int
 enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 {
 	(void) arg;
 	dsl_dataset_t *ds;
 	int err;
 	dsl_scan_t *scn = dp->dp_scan;
 
 	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
 	if (err)
 		return (err);
 
 	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 		dsl_dataset_t *prev;
 		err = dsl_dataset_hold_obj(dp,
 		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
 		if (err) {
 			dsl_dataset_rele(ds, FTAG);
 			return (err);
 		}
 
 		/*
 		 * If this is a clone, we don't need to worry about it for now.
 		 */
 		if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) {
 			dsl_dataset_rele(ds, FTAG);
 			dsl_dataset_rele(prev, FTAG);
 			return (0);
 		}
 		dsl_dataset_rele(ds, FTAG);
 		ds = prev;
 	}
 
 	scan_ds_queue_insert(scn, ds->ds_object,
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg);
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 void
 dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
     ddt_entry_t *dde, dmu_tx_t *tx)
 {
 	(void) tx;
 	const ddt_key_t *ddk = &dde->dde_key;
 	ddt_phys_t *ddp = dde->dde_phys;
 	blkptr_t bp;
 	zbookmark_phys_t zb = { 0 };
 
 	if (!dsl_scan_is_running(scn))
 		return;
 
 	/*
 	 * This function is special because it is the only thing
 	 * that can add scan_io_t's to the vdev scan queues from
 	 * outside dsl_scan_sync(). For the most part this is ok
 	 * as long as it is called from within syncing context.
 	 * However, dsl_scan_sync() expects that no new sio's will
 	 * be added between when all the work for a scan is done
 	 * and the next txg when the scan is actually marked as
 	 * completed. This check ensures we do not issue new sio's
 	 * during this period.
 	 */
 	if (scn->scn_done_txg != 0)
 		return;
 
 	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 		if (ddp->ddp_phys_birth == 0 ||
 		    ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
 			continue;
 		ddt_bp_create(checksum, ddk, ddp, &bp);
 
 		scn->scn_visited_this_txg++;
 		scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
 	}
 }
 
 /*
  * Scrub/dedup interaction.
  *
  * If there are N references to a deduped block, we don't want to scrub it
  * N times -- ideally, we should scrub it exactly once.
  *
  * We leverage the fact that the dde's replication class (ddt_class_t)
  * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
  * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
  *
  * To prevent excess scrubbing, the scrub begins by walking the DDT
  * to find all blocks with refcnt > 1, and scrubs each of these once.
  * Since there are two replication classes which contain blocks with
  * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
  * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
  *
  * There would be nothing more to say if a block's refcnt couldn't change
  * during a scrub, but of course it can so we must account for changes
  * in a block's replication class.
  *
  * Here's an example of what can occur:
  *
  * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
  * when visited during the top-down scrub phase, it will be scrubbed twice.
  * This negates our scrub optimization, but is otherwise harmless.
  *
  * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
  * on each visit during the top-down scrub phase, it will never be scrubbed.
  * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
  * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to
  * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
  * while a scrub is in progress, it scrubs the block right then.
  */
 static void
 dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
 	ddt_entry_t dde = {{{{0}}}};
 	int error;
 	uint64_t n = 0;
 
 	while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
 		ddt_t *ddt;
 
 		if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
 			break;
 		dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
 		    (longlong_t)ddb->ddb_class,
 		    (longlong_t)ddb->ddb_type,
 		    (longlong_t)ddb->ddb_checksum,
 		    (longlong_t)ddb->ddb_cursor);
 
 		/* There should be no pending changes to the dedup table */
 		ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
 		ASSERT(avl_first(&ddt->ddt_tree) == NULL);
 
 		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
 		n++;
 
 		if (dsl_scan_check_suspend(scn, NULL))
 			break;
 	}
 
 	zfs_dbgmsg("scanned %llu ddt entries on %s with class_max = %u; "
 	    "suspending=%u", (longlong_t)n, scn->scn_dp->dp_spa->spa_name,
 	    (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending);
 
 	ASSERT(error == 0 || error == ENOENT);
 	ASSERT(error != ENOENT ||
 	    ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
 }
 
 static uint64_t
 dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
 {
 	uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
 	if (ds->ds_is_snapshot)
 		return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
 	return (smt);
 }
 
 static void
 dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	scan_ds_t *sds;
 	dsl_pool_t *dp = scn->scn_dp;
 
 	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
 	    scn->scn_phys.scn_ddt_class_max) {
 		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
 		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
 		dsl_scan_ddt(scn, tx);
 		if (scn->scn_suspending)
 			return;
 	}
 
 	if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
 		/* First do the MOS & ORIGIN */
 
 		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
 		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
 		dsl_scan_visit_rootbp(scn, NULL,
 		    &dp->dp_meta_rootbp, tx);
 		if (scn->scn_suspending)
 			return;
 
 		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
 			VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 			    enqueue_cb, NULL, DS_FIND_CHILDREN));
 		} else {
 			dsl_scan_visitds(scn,
 			    dp->dp_origin_snap->ds_object, tx);
 		}
 		ASSERT(!scn->scn_suspending);
 	} else if (scn->scn_phys.scn_bookmark.zb_objset !=
 	    ZB_DESTROYED_OBJSET) {
 		uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset;
 		/*
 		 * If we were suspended, continue from here. Note if the
 		 * ds we were suspended on was deleted, the zb_objset may
 		 * be -1, so we will skip this and find a new objset
 		 * below.
 		 */
 		dsl_scan_visitds(scn, dsobj, tx);
 		if (scn->scn_suspending)
 			return;
 	}
 
 	/*
 	 * In case we suspended right at the end of the ds, zero the
 	 * bookmark so we don't think that we're still trying to resume.
 	 */
 	memset(&scn->scn_phys.scn_bookmark, 0, sizeof (zbookmark_phys_t));
 
 	/*
 	 * Keep pulling things out of the dataset avl queue. Updates to the
 	 * persistent zap-object-as-queue happen only at checkpoints.
 	 */
 	while ((sds = avl_first(&scn->scn_queue)) != NULL) {
 		dsl_dataset_t *ds;
 		uint64_t dsobj = sds->sds_dsobj;
 		uint64_t txg = sds->sds_txg;
 
 		/* dequeue and free the ds from the queue */
 		scan_ds_queue_remove(scn, dsobj);
 		sds = NULL;
 
 		/* set up min / max txg */
 		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 		if (txg != 0) {
 			scn->scn_phys.scn_cur_min_txg =
 			    MAX(scn->scn_phys.scn_min_txg, txg);
 		} else {
 			scn->scn_phys.scn_cur_min_txg =
 			    MAX(scn->scn_phys.scn_min_txg,
 			    dsl_dataset_phys(ds)->ds_prev_snap_txg);
 		}
 		scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
 		dsl_dataset_rele(ds, FTAG);
 
 		dsl_scan_visitds(scn, dsobj, tx);
 		if (scn->scn_suspending)
 			return;
 	}
 
 	/* No more objsets to fetch, we're done */
 	scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET;
 	ASSERT0(scn->scn_suspending);
 }
 
 static uint64_t
 dsl_scan_count_data_disks(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t i, leaves = 0;
 
 	for (i = 0; i < rvd->vdev_children; i++) {
 		vdev_t *vd = rvd->vdev_child[i];
 		if (vd->vdev_islog || vd->vdev_isspare || vd->vdev_isl2cache)
 			continue;
 		leaves += vdev_get_ndisks(vd) - vdev_get_nparity(vd);
 	}
 	return (leaves);
 }
 
 static void
 scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp)
 {
 	int i;
 	uint64_t cur_size = 0;
 
 	for (i = 0; i < BP_GET_NDVAS(bp); i++) {
 		cur_size += DVA_GET_ASIZE(&bp->blk_dva[i]);
 	}
 
 	q->q_total_zio_size_this_txg += cur_size;
 	q->q_zios_this_txg++;
 }
 
 static void
 scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start,
     uint64_t end)
 {
 	q->q_total_seg_size_this_txg += end - start;
 	q->q_segs_this_txg++;
 }
 
 static boolean_t
 scan_io_queue_check_suspend(dsl_scan_t *scn)
 {
 	/* See comment in dsl_scan_check_suspend() */
 	uint64_t curr_time_ns = gethrtime();
 	uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
 	uint64_t sync_time_ns = curr_time_ns -
 	    scn->scn_dp->dp_spa->spa_sync_starttime;
 	uint64_t dirty_min_bytes = zfs_dirty_data_max *
 	    zfs_vdev_async_write_active_min_dirty_percent / 100;
 	uint_t mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
 	    zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
 
 	return ((NSEC2MSEC(scan_time_ns) > mintime &&
 	    (scn->scn_dp->dp_dirty_total >= dirty_min_bytes ||
 	    txg_sync_waiting(scn->scn_dp) ||
 	    NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
 	    spa_shutting_down(scn->scn_dp->dp_spa));
 }
 
 /*
  * Given a list of scan_io_t's in io_list, this issues the I/Os out to
  * disk. This consumes the io_list and frees the scan_io_t's. This is
  * called when emptying queues, either when we're up against the memory
  * limit or when we have finished scanning. Returns B_TRUE if we stopped
  * processing the list before we finished. Any sios that were not issued
  * will remain in the io_list.
  */
 static boolean_t
 scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
 {
 	dsl_scan_t *scn = queue->q_scn;
 	scan_io_t *sio;
 	boolean_t suspended = B_FALSE;
 
 	while ((sio = list_head(io_list)) != NULL) {
 		blkptr_t bp;
 
 		if (scan_io_queue_check_suspend(scn)) {
 			suspended = B_TRUE;
 			break;
 		}
 
 		sio2bp(sio, &bp);
 		scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
 		    &sio->sio_zb, queue);
 		(void) list_remove_head(io_list);
 		scan_io_queues_update_zio_stats(queue, &bp);
 		sio_free(sio);
 	}
 	return (suspended);
 }
 
 /*
  * This function removes sios from an IO queue which reside within a given
  * range_seg_t and inserts them (in offset order) into a list. Note that
  * we only ever return a maximum of 32 sios at once. If there are more sios
  * to process within this segment that did not make it onto the list we
  * return B_TRUE and otherwise B_FALSE.
  */
 static boolean_t
 scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
 {
 	scan_io_t *srch_sio, *sio, *next_sio;
 	avl_index_t idx;
 	uint_t num_sios = 0;
 	int64_t bytes_issued = 0;
 
 	ASSERT(rs != NULL);
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 
 	srch_sio = sio_alloc(1);
 	srch_sio->sio_nr_dvas = 1;
 	SIO_SET_OFFSET(srch_sio, rs_get_start(rs, queue->q_exts_by_addr));
 
 	/*
 	 * The exact start of the extent might not contain any matching zios,
 	 * so if that's the case, examine the next one in the tree.
 	 */
 	sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
 	sio_free(srch_sio);
 
 	if (sio == NULL)
 		sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER);
 
 	while (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs,
 	    queue->q_exts_by_addr) && num_sios <= 32) {
 		ASSERT3U(SIO_GET_OFFSET(sio), >=, rs_get_start(rs,
 		    queue->q_exts_by_addr));
 		ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs_get_end(rs,
 		    queue->q_exts_by_addr));
 
 		next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
 		avl_remove(&queue->q_sios_by_addr, sio);
 		if (avl_is_empty(&queue->q_sios_by_addr))
 			atomic_add_64(&queue->q_scn->scn_queues_pending, -1);
 		queue->q_sio_memused -= SIO_GET_MUSED(sio);
 
 		bytes_issued += SIO_GET_ASIZE(sio);
 		num_sios++;
 		list_insert_tail(list, sio);
 		sio = next_sio;
 	}
 
 	/*
 	 * We limit the number of sios we process at once to 32 to avoid
 	 * biting off more than we can chew. If we didn't take everything
 	 * in the segment we update it to reflect the work we were able to
 	 * complete. Otherwise, we remove it from the range tree entirely.
 	 */
 	if (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs,
 	    queue->q_exts_by_addr)) {
 		range_tree_adjust_fill(queue->q_exts_by_addr, rs,
 		    -bytes_issued);
 		range_tree_resize_segment(queue->q_exts_by_addr, rs,
 		    SIO_GET_OFFSET(sio), rs_get_end(rs,
 		    queue->q_exts_by_addr) - SIO_GET_OFFSET(sio));
 		queue->q_last_ext_addr = SIO_GET_OFFSET(sio);
 		return (B_TRUE);
 	} else {
 		uint64_t rstart = rs_get_start(rs, queue->q_exts_by_addr);
 		uint64_t rend = rs_get_end(rs, queue->q_exts_by_addr);
 		range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart);
 		queue->q_last_ext_addr = -1;
 		return (B_FALSE);
 	}
 }
 
 /*
  * This is called from the queue emptying thread and selects the next
  * extent from which we are to issue I/Os. The behavior of this function
  * depends on the state of the scan, the current memory consumption and
  * whether or not we are performing a scan shutdown.
  * 1) We select extents in an elevator algorithm (LBA-order) if the scan
  * 	needs to perform a checkpoint
  * 2) We select the largest available extent if we are up against the
  * 	memory limit.
  * 3) Otherwise we don't select any extents.
  */
 static range_seg_t *
 scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
 {
 	dsl_scan_t *scn = queue->q_scn;
 	range_tree_t *rt = queue->q_exts_by_addr;
 
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 	ASSERT(scn->scn_is_sorted);
 
 	if (!scn->scn_checkpointing && !scn->scn_clearing)
 		return (NULL);
 
 	/*
 	 * During normal clearing, we want to issue our largest segments
 	 * first, keeping IO as sequential as possible, and leaving the
 	 * smaller extents for later with the hope that they might eventually
 	 * grow to larger sequential segments. However, when the scan is
 	 * checkpointing, no new extents will be added to the sorting queue,
 	 * so the way we are sorted now is as good as it will ever get.
 	 * In this case, we instead switch to issuing extents in LBA order.
 	 */
 	if ((zfs_scan_issue_strategy < 1 && scn->scn_checkpointing) ||
 	    zfs_scan_issue_strategy == 1)
 		return (range_tree_first(rt));
 
 	/*
 	 * Try to continue previous extent if it is not completed yet.  After
 	 * shrink in scan_io_queue_gather() it may no longer be the best, but
 	 * otherwise we leave shorter remnant every txg.
 	 */
 	uint64_t start;
 	uint64_t size = 1ULL << rt->rt_shift;
 	range_seg_t *addr_rs;
 	if (queue->q_last_ext_addr != -1) {
 		start = queue->q_last_ext_addr;
 		addr_rs = range_tree_find(rt, start, size);
 		if (addr_rs != NULL)
 			return (addr_rs);
 	}
 
 	/*
 	 * Nothing to continue, so find new best extent.
 	 */
 	uint64_t *v = zfs_btree_first(&queue->q_exts_by_size, NULL);
 	if (v == NULL)
 		return (NULL);
 	queue->q_last_ext_addr = start = *v << rt->rt_shift;
 
 	/*
 	 * We need to get the original entry in the by_addr tree so we can
 	 * modify it.
 	 */
 	addr_rs = range_tree_find(rt, start, size);
 	ASSERT3P(addr_rs, !=, NULL);
 	ASSERT3U(rs_get_start(addr_rs, rt), ==, start);
 	ASSERT3U(rs_get_end(addr_rs, rt), >, start);
 	return (addr_rs);
 }
 
 static void
 scan_io_queues_run_one(void *arg)
 {
 	dsl_scan_io_queue_t *queue = arg;
 	kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
 	boolean_t suspended = B_FALSE;
 	range_seg_t *rs;
 	scan_io_t *sio;
 	zio_t *zio;
 	list_t sio_list;
 
 	ASSERT(queue->q_scn->scn_is_sorted);
 
 	list_create(&sio_list, sizeof (scan_io_t),
 	    offsetof(scan_io_t, sio_nodes.sio_list_node));
 	zio = zio_null(queue->q_scn->scn_zio_root, queue->q_scn->scn_dp->dp_spa,
 	    NULL, NULL, NULL, ZIO_FLAG_CANFAIL);
 	mutex_enter(q_lock);
 	queue->q_zio = zio;
 
 	/* Calculate maximum in-flight bytes for this vdev. */
 	queue->q_maxinflight_bytes = MAX(1, zfs_scan_vdev_limit *
 	    (vdev_get_ndisks(queue->q_vd) - vdev_get_nparity(queue->q_vd)));
 
 	/* reset per-queue scan statistics for this txg */
 	queue->q_total_seg_size_this_txg = 0;
 	queue->q_segs_this_txg = 0;
 	queue->q_total_zio_size_this_txg = 0;
 	queue->q_zios_this_txg = 0;
 
 	/* loop until we run out of time or sios */
 	while ((rs = scan_io_queue_fetch_ext(queue)) != NULL) {
 		uint64_t seg_start = 0, seg_end = 0;
 		boolean_t more_left;
 
 		ASSERT(list_is_empty(&sio_list));
 
 		/* loop while we still have sios left to process in this rs */
 		do {
 			scan_io_t *first_sio, *last_sio;
 
 			/*
 			 * We have selected which extent needs to be
 			 * processed next. Gather up the corresponding sios.
 			 */
 			more_left = scan_io_queue_gather(queue, rs, &sio_list);
 			ASSERT(!list_is_empty(&sio_list));
 			first_sio = list_head(&sio_list);
 			last_sio = list_tail(&sio_list);
 
 			seg_end = SIO_GET_END_OFFSET(last_sio);
 			if (seg_start == 0)
 				seg_start = SIO_GET_OFFSET(first_sio);
 
 			/*
 			 * Issuing sios can take a long time so drop the
 			 * queue lock. The sio queue won't be updated by
 			 * other threads since we're in syncing context so
 			 * we can be sure that our trees will remain exactly
 			 * as we left them.
 			 */
 			mutex_exit(q_lock);
 			suspended = scan_io_queue_issue(queue, &sio_list);
 			mutex_enter(q_lock);
 
 			if (suspended)
 				break;
 		} while (more_left);
 
 		/* update statistics for debugging purposes */
 		scan_io_queues_update_seg_stats(queue, seg_start, seg_end);
 
 		if (suspended)
 			break;
 	}
 
 	/*
 	 * If we were suspended in the middle of processing,
 	 * requeue any unfinished sios and exit.
 	 */
 	while ((sio = list_remove_head(&sio_list)) != NULL)
 		scan_io_queue_insert_impl(queue, sio);
 
 	queue->q_zio = NULL;
 	mutex_exit(q_lock);
 	zio_nowait(zio);
 	list_destroy(&sio_list);
 }
 
 /*
  * Performs an emptying run on all scan queues in the pool. This just
  * punches out one thread per top-level vdev, each of which processes
  * only that vdev's scan queue. We can parallelize the I/O here because
  * we know that each queue's I/Os only affect its own top-level vdev.
  *
  * This function waits for the queue runs to complete, and must be
  * called from dsl_scan_sync (or in general, syncing context).
  */
 static void
 scan_io_queues_run(dsl_scan_t *scn)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 
 	ASSERT(scn->scn_is_sorted);
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 
 	if (scn->scn_queues_pending == 0)
 		return;
 
 	if (scn->scn_taskq == NULL) {
 		int nthreads = spa->spa_root_vdev->vdev_children;
 
 		/*
 		 * We need to make this taskq *always* execute as many
 		 * threads in parallel as we have top-level vdevs and no
 		 * less, otherwise strange serialization of the calls to
 		 * scan_io_queues_run_one can occur during spa_sync runs
 		 * and that significantly impacts performance.
 		 */
 		scn->scn_taskq = taskq_create("dsl_scan_iss", nthreads,
 		    minclsyspri, nthreads, nthreads, TASKQ_PREPOPULATE);
 	}
 
 	for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
 		vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
 
 		mutex_enter(&vd->vdev_scan_io_queue_lock);
 		if (vd->vdev_scan_io_queue != NULL) {
 			VERIFY(taskq_dispatch(scn->scn_taskq,
 			    scan_io_queues_run_one, vd->vdev_scan_io_queue,
 			    TQ_SLEEP) != TASKQID_INVALID);
 		}
 		mutex_exit(&vd->vdev_scan_io_queue_lock);
 	}
 
 	/*
 	 * Wait for the queues to finish issuing their IOs for this run
 	 * before we return. There may still be IOs in flight at this
 	 * point.
 	 */
 	taskq_wait(scn->scn_taskq);
 }
 
 static boolean_t
 dsl_scan_async_block_should_pause(dsl_scan_t *scn)
 {
 	uint64_t elapsed_nanosecs;
 
 	if (zfs_recover)
 		return (B_FALSE);
 
 	if (zfs_async_block_max_blocks != 0 &&
 	    scn->scn_visited_this_txg >= zfs_async_block_max_blocks) {
 		return (B_TRUE);
 	}
 
 	if (zfs_max_async_dedup_frees != 0 &&
 	    scn->scn_dedup_frees_this_txg >= zfs_max_async_dedup_frees) {
 		return (B_TRUE);
 	}
 
 	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
 	return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
 	    (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms &&
 	    txg_sync_waiting(scn->scn_dp)) ||
 	    spa_shutting_down(scn->scn_dp->dp_spa));
 }
 
 static int
 dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = arg;
 
 	if (!scn->scn_is_bptree ||
 	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
 		if (dsl_scan_async_block_should_pause(scn))
 			return (SET_ERROR(ERESTART));
 	}
 
 	zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
 	    dmu_tx_get_txg(tx), bp, 0));
 	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
 	    -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
 	    -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
 	scn->scn_visited_this_txg++;
 	if (BP_GET_DEDUP(bp))
 		scn->scn_dedup_frees_this_txg++;
 	return (0);
 }
 
 static void
 dsl_scan_update_stats(dsl_scan_t *scn)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 	uint64_t i;
 	uint64_t seg_size_total = 0, zio_size_total = 0;
 	uint64_t seg_count_total = 0, zio_count_total = 0;
 
 	for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
 		vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
 		dsl_scan_io_queue_t *queue = vd->vdev_scan_io_queue;
 
 		if (queue == NULL)
 			continue;
 
 		seg_size_total += queue->q_total_seg_size_this_txg;
 		zio_size_total += queue->q_total_zio_size_this_txg;
 		seg_count_total += queue->q_segs_this_txg;
 		zio_count_total += queue->q_zios_this_txg;
 	}
 
 	if (seg_count_total == 0 || zio_count_total == 0) {
 		scn->scn_avg_seg_size_this_txg = 0;
 		scn->scn_avg_zio_size_this_txg = 0;
 		scn->scn_segs_this_txg = 0;
 		scn->scn_zios_this_txg = 0;
 		return;
 	}
 
 	scn->scn_avg_seg_size_this_txg = seg_size_total / seg_count_total;
 	scn->scn_avg_zio_size_this_txg = zio_size_total / zio_count_total;
 	scn->scn_segs_this_txg = seg_count_total;
 	scn->scn_zios_this_txg = zio_count_total;
 }
 
 static int
 bpobj_dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	ASSERT(!bp_freed);
 	return (dsl_scan_free_block_cb(arg, bp, tx));
 }
 
 static int
 dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	ASSERT(!bp_freed);
 	dsl_scan_t *scn = arg;
 	const dva_t *dva = &bp->blk_dva[0];
 
 	if (dsl_scan_async_block_should_pause(scn))
 		return (SET_ERROR(ERESTART));
 
 	spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa,
 	    DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva),
 	    DVA_GET_ASIZE(dva), tx);
 	scn->scn_visited_this_txg++;
 	return (0);
 }
 
 boolean_t
 dsl_scan_active(dsl_scan_t *scn)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 	uint64_t used = 0, comp, uncomp;
 	boolean_t clones_left;
 
 	if (spa->spa_load_state != SPA_LOAD_NONE)
 		return (B_FALSE);
 	if (spa_shutting_down(spa))
 		return (B_FALSE);
 	if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) ||
 	    (scn->scn_async_destroying && !scn->scn_async_stalled))
 		return (B_TRUE);
 
 	if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 		(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
 		    &used, &comp, &uncomp);
 	}
 	clones_left = spa_livelist_delete_check(spa);
 	return ((used != 0) || (clones_left));
 }
 
 boolean_t
 dsl_errorscrub_active(dsl_scan_t *scn)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 	if (spa->spa_load_state != SPA_LOAD_NONE)
 		return (B_FALSE);
 	if (spa_shutting_down(spa))
 		return (B_FALSE);
 	if (dsl_errorscrubbing(scn->scn_dp))
 		return (B_TRUE);
 	return (B_FALSE);
 }
 
 static boolean_t
 dsl_scan_check_deferred(vdev_t *vd)
 {
 	boolean_t need_resilver = B_FALSE;
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		need_resilver |=
 		    dsl_scan_check_deferred(vd->vdev_child[c]);
 	}
 
 	if (!vdev_is_concrete(vd) || vd->vdev_aux ||
 	    !vd->vdev_ops->vdev_op_leaf)
 		return (need_resilver);
 
 	if (!vd->vdev_resilver_deferred)
 		need_resilver = B_TRUE;
 
 	return (need_resilver);
 }
 
 static boolean_t
 dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
     uint64_t phys_birth)
 {
 	vdev_t *vd;
 
 	vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
 
 	if (vd->vdev_ops == &vdev_indirect_ops) {
 		/*
 		 * The indirect vdev can point to multiple
 		 * vdevs.  For simplicity, always create
 		 * the resilver zio_t. zio_vdev_io_start()
 		 * will bypass the child resilver i/o's if
 		 * they are on vdevs that don't have DTL's.
 		 */
 		return (B_TRUE);
 	}
 
 	if (DVA_GET_GANG(dva)) {
 		/*
 		 * Gang members may be spread across multiple
 		 * vdevs, so the best estimate we have is the
 		 * scrub range, which has already been checked.
 		 * XXX -- it would be better to change our
 		 * allocation policy to ensure that all
 		 * gang members reside on the same vdev.
 		 */
 		return (B_TRUE);
 	}
 
 	/*
 	 * Check if the top-level vdev must resilver this offset.
 	 * When the offset does not intersect with a dirty leaf DTL
 	 * then it may be possible to skip the resilver IO.  The psize
 	 * is provided instead of asize to simplify the check for RAIDZ.
 	 */
 	if (!vdev_dtl_need_resilver(vd, dva, psize, phys_birth))
 		return (B_FALSE);
 
 	/*
 	 * Check that this top-level vdev has a device under it which
 	 * is resilvering and is not deferred.
 	 */
 	if (!dsl_scan_check_deferred(vd))
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 static int
 dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = dp->dp_scan;
 	spa_t *spa = dp->dp_spa;
 	int err = 0;
 
 	if (spa_suspend_async_destroy(spa))
 		return (0);
 
 	if (zfs_free_bpobj_enabled &&
 	    spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 		scn->scn_is_bptree = B_FALSE;
 		scn->scn_async_block_min_time_ms = zfs_free_min_time_ms;
 		scn->scn_zio_root = zio_root(spa, NULL,
 		    NULL, ZIO_FLAG_MUSTSUCCEED);
 		err = bpobj_iterate(&dp->dp_free_bpobj,
 		    bpobj_dsl_scan_free_block_cb, scn, tx);
 		VERIFY0(zio_wait(scn->scn_zio_root));
 		scn->scn_zio_root = NULL;
 
 		if (err != 0 && err != ERESTART)
 			zfs_panic_recover("error %u from bpobj_iterate()", err);
 	}
 
 	if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
 		ASSERT(scn->scn_async_destroying);
 		scn->scn_is_bptree = B_TRUE;
 		scn->scn_zio_root = zio_root(spa, NULL,
 		    NULL, ZIO_FLAG_MUSTSUCCEED);
 		err = bptree_iterate(dp->dp_meta_objset,
 		    dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
 		VERIFY0(zio_wait(scn->scn_zio_root));
 		scn->scn_zio_root = NULL;
 
 		if (err == EIO || err == ECKSUM) {
 			err = 0;
 		} else if (err != 0 && err != ERESTART) {
 			zfs_panic_recover("error %u from "
 			    "traverse_dataset_destroyed()", err);
 		}
 
 		if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
 			/* finished; deactivate async destroy feature */
 			spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
 			ASSERT(!spa_feature_is_active(spa,
 			    SPA_FEATURE_ASYNC_DESTROY));
 			VERIFY0(zap_remove(dp->dp_meta_objset,
 			    DMU_POOL_DIRECTORY_OBJECT,
 			    DMU_POOL_BPTREE_OBJ, tx));
 			VERIFY0(bptree_free(dp->dp_meta_objset,
 			    dp->dp_bptree_obj, tx));
 			dp->dp_bptree_obj = 0;
 			scn->scn_async_destroying = B_FALSE;
 			scn->scn_async_stalled = B_FALSE;
 		} else {
 			/*
 			 * If we didn't make progress, mark the async
 			 * destroy as stalled, so that we will not initiate
 			 * a spa_sync() on its behalf.  Note that we only
 			 * check this if we are not finished, because if the
 			 * bptree had no blocks for us to visit, we can
 			 * finish without "making progress".
 			 */
 			scn->scn_async_stalled =
 			    (scn->scn_visited_this_txg == 0);
 		}
 	}
 	if (scn->scn_visited_this_txg) {
 		zfs_dbgmsg("freed %llu blocks in %llums from "
 		    "free_bpobj/bptree on %s in txg %llu; err=%u",
 		    (longlong_t)scn->scn_visited_this_txg,
 		    (longlong_t)
 		    NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
 		    spa->spa_name, (longlong_t)tx->tx_txg, err);
 		scn->scn_visited_this_txg = 0;
 		scn->scn_dedup_frees_this_txg = 0;
 
 		/*
 		 * Write out changes to the DDT and the BRT that may be required
 		 * as a result of the blocks freed.  This ensures that the DDT
 		 * and the BRT are clean when a scrub/resilver runs.
 		 */
 		ddt_sync(spa, tx->tx_txg);
 		brt_sync(spa, tx->tx_txg);
 	}
 	if (err != 0)
 		return (err);
 	if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
 	    zfs_free_leak_on_eio &&
 	    (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
 	    dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 ||
 	    dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) {
 		/*
 		 * We have finished background destroying, but there is still
 		 * some space left in the dp_free_dir. Transfer this leaked
 		 * space to the dp_leak_dir.
 		 */
 		if (dp->dp_leak_dir == NULL) {
 			rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 			(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
 			    LEAK_DIR_NAME, tx);
 			VERIFY0(dsl_pool_open_special_dir(dp,
 			    LEAK_DIR_NAME, &dp->dp_leak_dir));
 			rrw_exit(&dp->dp_config_rwlock, FTAG);
 		}
 		dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
 		    dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
 		    dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
 		    dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
 		dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
 		    -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
 		    -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
 		    -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
 	}
 
 	if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
 	    !spa_livelist_delete_check(spa)) {
 		/* finished; verify that space accounting went to zero */
 		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
 		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
 		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
 	}
 
 	spa_notify_waiters(spa);
 
 	EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj),
 	    0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_OBSOLETE_BPOBJ));
 	if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) {
 		ASSERT(spa_feature_is_active(dp->dp_spa,
 		    SPA_FEATURE_OBSOLETE_COUNTS));
 
 		scn->scn_is_bptree = B_FALSE;
 		scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms;
 		err = bpobj_iterate(&dp->dp_obsolete_bpobj,
 		    dsl_scan_obsolete_block_cb, scn, tx);
 		if (err != 0 && err != ERESTART)
 			zfs_panic_recover("error %u from bpobj_iterate()", err);
 
 		if (bpobj_is_empty(&dp->dp_obsolete_bpobj))
 			dsl_pool_destroy_obsolete_bpobj(dp, tx);
 	}
 	return (0);
 }
 
 static void
 name_to_bookmark(char *buf, zbookmark_phys_t *zb)
 {
 	zb->zb_objset = zfs_strtonum(buf, &buf);
 	ASSERT(*buf == ':');
 	zb->zb_object = zfs_strtonum(buf + 1, &buf);
 	ASSERT(*buf == ':');
 	zb->zb_level = (int)zfs_strtonum(buf + 1, &buf);
 	ASSERT(*buf == ':');
 	zb->zb_blkid = zfs_strtonum(buf + 1, &buf);
 	ASSERT(*buf == '\0');
 }
 
 static void
 name_to_object(char *buf, uint64_t *obj)
 {
 	*obj = zfs_strtonum(buf, &buf);
 	ASSERT(*buf == '\0');
 }
 
 static void
 read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	dsl_dataset_t *ds;
 	objset_t *os;
 	if (dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds) != 0)
 		return;
 
 	if (dmu_objset_from_ds(ds, &os) != 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return;
 	}
 
 	/*
 	 * If the key is not loaded dbuf_dnode_findbp() will error out with
 	 * EACCES. However in that case dnode_hold() will eventually call
 	 * dbuf_read()->zio_wait() which may call spa_log_error(). This will
 	 * lead to a deadlock due to us holding the mutex spa_errlist_lock.
 	 * Avoid this by checking here if the keys are loaded, if not return.
 	 * If the keys are not loaded the head_errlog feature is meaningless
 	 * as we cannot figure out the birth txg of the block pointer.
 	 */
 	if (dsl_dataset_get_keystatus(ds->ds_dir) ==
 	    ZFS_KEYSTATUS_UNAVAILABLE) {
 		dsl_dataset_rele(ds, FTAG);
 		return;
 	}
 
 	dnode_t *dn;
 	blkptr_t bp;
 
 	if (dnode_hold(os, zb.zb_object, FTAG, &dn) != 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return;
 	}
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	int error = dbuf_dnode_findbp(dn, zb.zb_level, zb.zb_blkid, &bp, NULL,
 	    NULL);
 
 	if (error) {
 		rw_exit(&dn->dn_struct_rwlock);
 		dnode_rele(dn, FTAG);
 		dsl_dataset_rele(ds, FTAG);
 		return;
 	}
 
 	if (!error && BP_IS_HOLE(&bp)) {
 		rw_exit(&dn->dn_struct_rwlock);
 		dnode_rele(dn, FTAG);
 		dsl_dataset_rele(ds, FTAG);
 		return;
 	}
 
 	int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW |
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB;
 
 	/* If it's an intent log block, failure is expected. */
 	if (zb.zb_level == ZB_ZIL_LEVEL)
 		zio_flags |= ZIO_FLAG_SPECULATIVE;
 
 	ASSERT(!BP_IS_EMBEDDED(&bp));
 	scan_exec_io(dp, &bp, zio_flags, &zb, NULL);
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /*
  * We keep track of the scrubbed error blocks in "count". This will be used
  * when deciding whether we exceeded zfs_scrub_error_blocks_per_txg. This
  * function is modelled after check_filesystem().
  */
 static int
 scrub_filesystem(spa_t *spa, uint64_t fs, zbookmark_err_phys_t *zep,
     int *count)
 {
 	dsl_dataset_t *ds;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 
 	int error = dsl_dataset_hold_obj(dp, fs, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	uint64_t latest_txg;
 	uint64_t txg_to_consider = spa->spa_syncing_txg;
 	boolean_t check_snapshot = B_TRUE;
 
 	error = find_birth_txg(ds, zep, &latest_txg);
 
 	/*
 	 * If find_birth_txg() errors out, then err on the side of caution and
 	 * proceed. In worst case scenario scrub all objects. If zep->zb_birth
 	 * is 0 (e.g. in case of encryption with unloaded keys) also proceed to
 	 * scrub all objects.
 	 */
 	if (error == 0 && zep->zb_birth == latest_txg) {
 		/* Block neither free nor re written. */
 		zbookmark_phys_t zb;
 		zep_to_zb(fs, zep, &zb);
 		scn->scn_zio_root = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);
 		/* We have already acquired the config lock for spa */
 		read_by_block_level(scn, zb);
 
 		(void) zio_wait(scn->scn_zio_root);
 		scn->scn_zio_root = NULL;
 
 		scn->errorscrub_phys.dep_examined++;
 		scn->errorscrub_phys.dep_to_examine--;
 		(*count)++;
 		if ((*count) == zfs_scrub_error_blocks_per_txg ||
 		    dsl_error_scrub_check_suspend(scn, &zb)) {
 			dsl_dataset_rele(ds, FTAG);
 			return (SET_ERROR(EFAULT));
 		}
 
 		check_snapshot = B_FALSE;
 	} else if (error == 0) {
 		txg_to_consider = latest_txg;
 	}
 
 	/*
 	 * Retrieve the number of snapshots if the dataset is not a snapshot.
 	 */
 	uint64_t snap_count = 0;
 	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
 
 		error = zap_count(spa->spa_meta_objset,
 		    dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
 
 		if (error != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			return (error);
 		}
 	}
 
 	if (snap_count == 0) {
 		/* Filesystem without snapshots. */
 		dsl_dataset_rele(ds, FTAG);
 		return (0);
 	}
 
 	uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 
 	dsl_dataset_rele(ds, FTAG);
 
 	/* Check only snapshots created from this file system. */
 	while (snap_obj != 0 && zep->zb_birth < snap_obj_txg &&
 	    snap_obj_txg <= txg_to_consider) {
 
 		error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds);
 		if (error != 0)
 			return (error);
 
 		if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != fs) {
 			snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 			snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 			dsl_dataset_rele(ds, FTAG);
 			continue;
 		}
 
 		boolean_t affected = B_TRUE;
 		if (check_snapshot) {
 			uint64_t blk_txg;
 			error = find_birth_txg(ds, zep, &blk_txg);
 
 			/*
 			 * Scrub the snapshot also when zb_birth == 0 or when
 			 * find_birth_txg() returns an error.
 			 */
 			affected = (error == 0 && zep->zb_birth == blk_txg) ||
 			    (error != 0) || (zep->zb_birth == 0);
 		}
 
 		/* Scrub snapshots. */
 		if (affected) {
 			zbookmark_phys_t zb;
 			zep_to_zb(snap_obj, zep, &zb);
 			scn->scn_zio_root = zio_root(spa, NULL, NULL,
 			    ZIO_FLAG_CANFAIL);
 			/* We have already acquired the config lock for spa */
 			read_by_block_level(scn, zb);
 
 			(void) zio_wait(scn->scn_zio_root);
 			scn->scn_zio_root = NULL;
 
 			scn->errorscrub_phys.dep_examined++;
 			scn->errorscrub_phys.dep_to_examine--;
 			(*count)++;
 			if ((*count) == zfs_scrub_error_blocks_per_txg ||
 			    dsl_error_scrub_check_suspend(scn, &zb)) {
 				dsl_dataset_rele(ds, FTAG);
 				return (EFAULT);
 			}
 		}
 		snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 		snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 		dsl_dataset_rele(ds, FTAG);
 	}
 	return (0);
 }
 
 void
 dsl_errorscrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	spa_t *spa = dp->dp_spa;
 	dsl_scan_t *scn = dp->dp_scan;
 
 	/*
 	 * Only process scans in sync pass 1.
 	 */
 
 	if (spa_sync_pass(spa) > 1)
 		return;
 
 	/*
 	 * If the spa is shutting down, then stop scanning. This will
 	 * ensure that the scan does not dirty any new data during the
 	 * shutdown phase.
 	 */
 	if (spa_shutting_down(spa))
 		return;
 
 	if (!dsl_errorscrub_active(scn) || dsl_errorscrub_is_paused(scn)) {
 		return;
 	}
 
 	if (dsl_scan_resilvering(scn->scn_dp)) {
 		/* cancel the error scrub if resilver started */
 		dsl_scan_cancel(scn->scn_dp);
 		return;
 	}
 
 	spa->spa_scrub_active = B_TRUE;
 	scn->scn_sync_start_time = gethrtime();
 
 	/*
 	 * zfs_scan_suspend_progress can be set to disable scrub progress.
 	 * See more detailed comment in dsl_scan_sync().
 	 */
 	if (zfs_scan_suspend_progress) {
 		uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time;
 		int mintime = zfs_scrub_min_time_ms;
 
 		while (zfs_scan_suspend_progress &&
 		    !txg_sync_waiting(scn->scn_dp) &&
 		    !spa_shutting_down(scn->scn_dp->dp_spa) &&
 		    NSEC2MSEC(scan_time_ns) < mintime) {
 			delay(hz);
 			scan_time_ns = gethrtime() - scn->scn_sync_start_time;
 		}
 		return;
 	}
 
 	int i = 0;
 	zap_attribute_t *za;
 	zbookmark_phys_t *zb;
 	boolean_t limit_exceeded = B_FALSE;
 
 	za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP);
 	zb = kmem_zalloc(sizeof (zbookmark_phys_t), KM_SLEEP);
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 		for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0;
 		    zap_cursor_advance(&scn->errorscrub_cursor)) {
 			name_to_bookmark(za->za_name, zb);
 
 			scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 			    NULL, ZIO_FLAG_CANFAIL);
 			dsl_pool_config_enter(dp, FTAG);
 			read_by_block_level(scn, *zb);
 			dsl_pool_config_exit(dp, FTAG);
 
 			(void) zio_wait(scn->scn_zio_root);
 			scn->scn_zio_root = NULL;
 
 			scn->errorscrub_phys.dep_examined += 1;
 			scn->errorscrub_phys.dep_to_examine -= 1;
 			i++;
 			if (i == zfs_scrub_error_blocks_per_txg ||
 			    dsl_error_scrub_check_suspend(scn, zb)) {
 				limit_exceeded = B_TRUE;
 				break;
 			}
 		}
 
 		if (!limit_exceeded)
 			dsl_errorscrub_done(scn, B_TRUE, tx);
 
 		dsl_errorscrub_sync_state(scn, tx);
 		kmem_free(za, sizeof (*za));
 		kmem_free(zb, sizeof (*zb));
 		return;
 	}
 
 	int error = 0;
 	for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0;
 	    zap_cursor_advance(&scn->errorscrub_cursor)) {
 
 		zap_cursor_t *head_ds_cursor;
 		zap_attribute_t *head_ds_attr;
 		zbookmark_err_phys_t head_ds_block;
 
 		head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP);
 		head_ds_attr = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP);
 
 		uint64_t head_ds_err_obj = za->za_first_integer;
 		uint64_t head_ds;
 		name_to_object(za->za_name, &head_ds);
 		boolean_t config_held = B_FALSE;
 		uint64_t top_affected_fs;
 
 		for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset,
 		    head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor,
 		    head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) {
 
 			name_to_errphys(head_ds_attr->za_name, &head_ds_block);
 
 			/*
 			 * In case we are called from spa_sync the pool
 			 * config is already held.
 			 */
 			if (!dsl_pool_config_held(dp)) {
 				dsl_pool_config_enter(dp, FTAG);
 				config_held = B_TRUE;
 			}
 
 			error = find_top_affected_fs(spa,
 			    head_ds, &head_ds_block, &top_affected_fs);
 			if (error)
 				break;
 
 			error = scrub_filesystem(spa, top_affected_fs,
 			    &head_ds_block, &i);
 
 			if (error == SET_ERROR(EFAULT)) {
 				limit_exceeded = B_TRUE;
 				break;
 			}
 		}
 
 		zap_cursor_fini(head_ds_cursor);
 		kmem_free(head_ds_cursor, sizeof (*head_ds_cursor));
 		kmem_free(head_ds_attr, sizeof (*head_ds_attr));
 
 		if (config_held)
 			dsl_pool_config_exit(dp, FTAG);
 	}
 
 	kmem_free(za, sizeof (*za));
 	kmem_free(zb, sizeof (*zb));
 	if (!limit_exceeded)
 		dsl_errorscrub_done(scn, B_TRUE, tx);
 
 	dsl_errorscrub_sync_state(scn, tx);
 }
 
 /*
  * This is the primary entry point for scans that is called from syncing
  * context. Scans must happen entirely during syncing context so that we
  * can guarantee that blocks we are currently scanning will not change out
  * from under us. While a scan is active, this function controls how quickly
  * transaction groups proceed, instead of the normal handling provided by
  * txg_sync_thread().
  */
 void
 dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	int err = 0;
 	dsl_scan_t *scn = dp->dp_scan;
 	spa_t *spa = dp->dp_spa;
 	state_sync_type_t sync_type = SYNC_OPTIONAL;
 
 	if (spa->spa_resilver_deferred &&
 	    !spa_feature_is_active(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))
 		spa_feature_incr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
 
 	/*
 	 * Check for scn_restart_txg before checking spa_load_state, so
 	 * that we can restart an old-style scan while the pool is being
 	 * imported (see dsl_scan_init). We also restart scans if there
 	 * is a deferred resilver and the user has manually disabled
 	 * deferred resilvers via the tunable.
 	 */
 	if (dsl_scan_restarting(scn, tx) ||
 	    (spa->spa_resilver_deferred && zfs_resilver_disable_defer)) {
 		pool_scan_func_t func = POOL_SCAN_SCRUB;
 		dsl_scan_done(scn, B_FALSE, tx);
 		if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
 			func = POOL_SCAN_RESILVER;
 		zfs_dbgmsg("restarting scan func=%u on %s txg=%llu",
 		    func, dp->dp_spa->spa_name, (longlong_t)tx->tx_txg);
 		dsl_scan_setup_sync(&func, tx);
 	}
 
 	/*
 	 * Only process scans in sync pass 1.
 	 */
 	if (spa_sync_pass(spa) > 1)
 		return;
 
 	/*
 	 * If the spa is shutting down, then stop scanning. This will
 	 * ensure that the scan does not dirty any new data during the
 	 * shutdown phase.
 	 */
 	if (spa_shutting_down(spa))
 		return;
 
 	/*
 	 * If the scan is inactive due to a stalled async destroy, try again.
 	 */
 	if (!scn->scn_async_stalled && !dsl_scan_active(scn))
 		return;
 
 	/* reset scan statistics */
 	scn->scn_visited_this_txg = 0;
 	scn->scn_dedup_frees_this_txg = 0;
 	scn->scn_holes_this_txg = 0;
 	scn->scn_lt_min_this_txg = 0;
 	scn->scn_gt_max_this_txg = 0;
 	scn->scn_ddt_contained_this_txg = 0;
 	scn->scn_objsets_visited_this_txg = 0;
 	scn->scn_avg_seg_size_this_txg = 0;
 	scn->scn_segs_this_txg = 0;
 	scn->scn_avg_zio_size_this_txg = 0;
 	scn->scn_zios_this_txg = 0;
 	scn->scn_suspending = B_FALSE;
 	scn->scn_sync_start_time = gethrtime();
 	spa->spa_scrub_active = B_TRUE;
 
 	/*
 	 * First process the async destroys.  If we suspend, don't do
 	 * any scrubbing or resilvering.  This ensures that there are no
 	 * async destroys while we are scanning, so the scan code doesn't
 	 * have to worry about traversing it.  It is also faster to free the
 	 * blocks than to scrub them.
 	 */
 	err = dsl_process_async_destroys(dp, tx);
 	if (err != 0)
 		return;
 
 	if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn))
 		return;
 
 	/*
 	 * Wait a few txgs after importing to begin scanning so that
 	 * we can get the pool imported quickly.
 	 */
 	if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS)
 		return;
 
 	/*
 	 * zfs_scan_suspend_progress can be set to disable scan progress.
 	 * We don't want to spin the txg_sync thread, so we add a delay
 	 * here to simulate the time spent doing a scan. This is mostly
 	 * useful for testing and debugging.
 	 */
 	if (zfs_scan_suspend_progress) {
 		uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time;
 		uint_t mintime = (scn->scn_phys.scn_func ==
 		    POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms :
 		    zfs_scrub_min_time_ms;
 
 		while (zfs_scan_suspend_progress &&
 		    !txg_sync_waiting(scn->scn_dp) &&
 		    !spa_shutting_down(scn->scn_dp->dp_spa) &&
 		    NSEC2MSEC(scan_time_ns) < mintime) {
 			delay(hz);
 			scan_time_ns = gethrtime() - scn->scn_sync_start_time;
 		}
 		return;
 	}
 
 	/*
 	 * Disabled by default, set zfs_scan_report_txgs to report
 	 * average performance over the last zfs_scan_report_txgs TXGs.
 	 */
 	if (zfs_scan_report_txgs != 0 &&
 	    tx->tx_txg % zfs_scan_report_txgs == 0) {
 		scn->scn_issued_before_pass += spa->spa_scan_pass_issued;
 		spa_scan_stat_init(spa);
 	}
 
 	/*
 	 * It is possible to switch from unsorted to sorted at any time,
 	 * but afterwards the scan will remain sorted unless reloaded from
 	 * a checkpoint after a reboot.
 	 */
 	if (!zfs_scan_legacy) {
 		scn->scn_is_sorted = B_TRUE;
 		if (scn->scn_last_checkpoint == 0)
 			scn->scn_last_checkpoint = ddi_get_lbolt();
 	}
 
 	/*
 	 * For sorted scans, determine what kind of work we will be doing
 	 * this txg based on our memory limitations and whether or not we
 	 * need to perform a checkpoint.
 	 */
 	if (scn->scn_is_sorted) {
 		/*
 		 * If we are over our checkpoint interval, set scn_clearing
 		 * so that we can begin checkpointing immediately. The
 		 * checkpoint allows us to save a consistent bookmark
 		 * representing how much data we have scrubbed so far.
 		 * Otherwise, use the memory limit to determine if we should
 		 * scan for metadata or start issue scrub IOs. We accumulate
 		 * metadata until we hit our hard memory limit at which point
 		 * we issue scrub IOs until we are at our soft memory limit.
 		 */
 		if (scn->scn_checkpointing ||
 		    ddi_get_lbolt() - scn->scn_last_checkpoint >
 		    SEC_TO_TICK(zfs_scan_checkpoint_intval)) {
 			if (!scn->scn_checkpointing)
 				zfs_dbgmsg("begin scan checkpoint for %s",
 				    spa->spa_name);
 
 			scn->scn_checkpointing = B_TRUE;
 			scn->scn_clearing = B_TRUE;
 		} else {
 			boolean_t should_clear = dsl_scan_should_clear(scn);
 			if (should_clear && !scn->scn_clearing) {
 				zfs_dbgmsg("begin scan clearing for %s",
 				    spa->spa_name);
 				scn->scn_clearing = B_TRUE;
 			} else if (!should_clear && scn->scn_clearing) {
 				zfs_dbgmsg("finish scan clearing for %s",
 				    spa->spa_name);
 				scn->scn_clearing = B_FALSE;
 			}
 		}
 	} else {
 		ASSERT0(scn->scn_checkpointing);
 		ASSERT0(scn->scn_clearing);
 	}
 
 	if (!scn->scn_clearing && scn->scn_done_txg == 0) {
 		/* Need to scan metadata for more blocks to scrub */
 		dsl_scan_phys_t *scnp = &scn->scn_phys;
 		taskqid_t prefetch_tqid;
 
 		/*
 		 * Calculate the max number of in-flight bytes for pool-wide
 		 * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max).
 		 * Limits for the issuing phase are done per top-level vdev and
 		 * are handled separately.
 		 */
 		scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20,
 		    zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));
 
 		if (scnp->scn_ddt_bookmark.ddb_class <=
 		    scnp->scn_ddt_class_max) {
 			ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark));
 			zfs_dbgmsg("doing scan sync for %s txg %llu; "
 			    "ddt bm=%llu/%llu/%llu/%llx",
 			    spa->spa_name,
 			    (longlong_t)tx->tx_txg,
 			    (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
 			    (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
 			    (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
 			    (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
 		} else {
 			zfs_dbgmsg("doing scan sync for %s txg %llu; "
 			    "bm=%llu/%llu/%llu/%llu",
 			    spa->spa_name,
 			    (longlong_t)tx->tx_txg,
 			    (longlong_t)scnp->scn_bookmark.zb_objset,
 			    (longlong_t)scnp->scn_bookmark.zb_object,
 			    (longlong_t)scnp->scn_bookmark.zb_level,
 			    (longlong_t)scnp->scn_bookmark.zb_blkid);
 		}
 
 		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 		    NULL, ZIO_FLAG_CANFAIL);
 
 		scn->scn_prefetch_stop = B_FALSE;
 		prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq,
 		    dsl_scan_prefetch_thread, scn, TQ_SLEEP);
 		ASSERT(prefetch_tqid != TASKQID_INVALID);
 
 		dsl_pool_config_enter(dp, FTAG);
 		dsl_scan_visit(scn, tx);
 		dsl_pool_config_exit(dp, FTAG);
 
 		mutex_enter(&dp->dp_spa->spa_scrub_lock);
 		scn->scn_prefetch_stop = B_TRUE;
 		cv_broadcast(&spa->spa_scrub_io_cv);
 		mutex_exit(&dp->dp_spa->spa_scrub_lock);
 
 		taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid);
 		(void) zio_wait(scn->scn_zio_root);
 		scn->scn_zio_root = NULL;
 
 		zfs_dbgmsg("scan visited %llu blocks of %s in %llums "
 		    "(%llu os's, %llu holes, %llu < mintxg, "
 		    "%llu in ddt, %llu > maxtxg)",
 		    (longlong_t)scn->scn_visited_this_txg,
 		    spa->spa_name,
 		    (longlong_t)NSEC2MSEC(gethrtime() -
 		    scn->scn_sync_start_time),
 		    (longlong_t)scn->scn_objsets_visited_this_txg,
 		    (longlong_t)scn->scn_holes_this_txg,
 		    (longlong_t)scn->scn_lt_min_this_txg,
 		    (longlong_t)scn->scn_ddt_contained_this_txg,
 		    (longlong_t)scn->scn_gt_max_this_txg);
 
 		if (!scn->scn_suspending) {
 			ASSERT0(avl_numnodes(&scn->scn_queue));
 			scn->scn_done_txg = tx->tx_txg + 1;
 			if (scn->scn_is_sorted) {
 				scn->scn_checkpointing = B_TRUE;
 				scn->scn_clearing = B_TRUE;
 				scn->scn_issued_before_pass +=
 				    spa->spa_scan_pass_issued;
 				spa_scan_stat_init(spa);
 			}
 			zfs_dbgmsg("scan complete for %s txg %llu",
 			    spa->spa_name,
 			    (longlong_t)tx->tx_txg);
 		}
 	} else if (scn->scn_is_sorted && scn->scn_queues_pending != 0) {
 		ASSERT(scn->scn_clearing);
 
 		/* need to issue scrubbing IOs from per-vdev queues */
 		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 		    NULL, ZIO_FLAG_CANFAIL);
 		scan_io_queues_run(scn);
 		(void) zio_wait(scn->scn_zio_root);
 		scn->scn_zio_root = NULL;
 
 		/* calculate and dprintf the current memory usage */
 		(void) dsl_scan_should_clear(scn);
 		dsl_scan_update_stats(scn);
 
 		zfs_dbgmsg("scan issued %llu blocks for %s (%llu segs) "
 		    "in %llums (avg_block_size = %llu, avg_seg_size = %llu)",
 		    (longlong_t)scn->scn_zios_this_txg,
 		    spa->spa_name,
 		    (longlong_t)scn->scn_segs_this_txg,
 		    (longlong_t)NSEC2MSEC(gethrtime() -
 		    scn->scn_sync_start_time),
 		    (longlong_t)scn->scn_avg_zio_size_this_txg,
 		    (longlong_t)scn->scn_avg_seg_size_this_txg);
 	} else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) {
 		/* Finished with everything. Mark the scrub as complete */
 		zfs_dbgmsg("scan issuing complete txg %llu for %s",
 		    (longlong_t)tx->tx_txg,
 		    spa->spa_name);
 		ASSERT3U(scn->scn_done_txg, !=, 0);
 		ASSERT0(spa->spa_scrub_inflight);
 		ASSERT0(scn->scn_queues_pending);
 		dsl_scan_done(scn, B_TRUE, tx);
 		sync_type = SYNC_MANDATORY;
 	}
 
 	dsl_scan_sync_state(scn, tx, sync_type);
 }
 
 static void
 count_block_issued(spa_t *spa, const blkptr_t *bp, boolean_t all)
 {
 	/*
 	 * Don't count embedded bp's, since we already did the work of
 	 * scanning these when we scanned the containing block.
 	 */
 	if (BP_IS_EMBEDDED(bp))
 		return;
 
 	/*
 	 * Update the spa's stats on how many bytes we have issued.
 	 * Sequential scrubs create a zio for each DVA of the bp. Each
 	 * of these will include all DVAs for repair purposes, but the
 	 * zio code will only try the first one unless there is an issue.
 	 * Therefore, we should only count the first DVA for these IOs.
 	 */
 	atomic_add_64(&spa->spa_scan_pass_issued,
 	    all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0]));
 }
 
 static void
 count_block_skipped(dsl_scan_t *scn, const blkptr_t *bp, boolean_t all)
 {
 	if (BP_IS_EMBEDDED(bp))
 		return;
 	atomic_add_64(&scn->scn_phys.scn_skipped,
 	    all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0]));
 }
 
 static void
 count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
 {
 	/*
 	 * If we resume after a reboot, zab will be NULL; don't record
 	 * incomplete stats in that case.
 	 */
 	if (zab == NULL)
 		return;
 
 	for (int i = 0; i < 4; i++) {
 		int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
 		int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
 
 		if (t & DMU_OT_NEWTYPE)
 			t = DMU_OT_OTHER;
 		zfs_blkstat_t *zb = &zab->zab_type[l][t];
 		int equal;
 
 		zb->zb_count++;
 		zb->zb_asize += BP_GET_ASIZE(bp);
 		zb->zb_lsize += BP_GET_LSIZE(bp);
 		zb->zb_psize += BP_GET_PSIZE(bp);
 		zb->zb_gangs += BP_COUNT_GANG(bp);
 
 		switch (BP_GET_NDVAS(bp)) {
 		case 2:
 			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[1]))
 				zb->zb_ditto_2_of_2_samevdev++;
 			break;
 		case 3:
 			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[1])) +
 			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[2])) +
 			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[2]));
 			if (equal == 1)
 				zb->zb_ditto_2_of_3_samevdev++;
 			else if (equal == 3)
 				zb->zb_ditto_3_of_3_samevdev++;
 			break;
 		}
 	}
 }
 
 static void
 scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
 {
 	avl_index_t idx;
 	dsl_scan_t *scn = queue->q_scn;
 
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 
 	if (unlikely(avl_is_empty(&queue->q_sios_by_addr)))
 		atomic_add_64(&scn->scn_queues_pending, 1);
 	if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) {
 		/* block is already scheduled for reading */
 		sio_free(sio);
 		return;
 	}
 	avl_insert(&queue->q_sios_by_addr, sio, idx);
 	queue->q_sio_memused += SIO_GET_MUSED(sio);
 	range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio),
 	    SIO_GET_ASIZE(sio));
 }
 
 /*
  * Given all the info we got from our metadata scanning process, we
  * construct a scan_io_t and insert it into the scan sorting queue. The
  * I/O must already be suitable for us to process. This is controlled
  * by dsl_scan_enqueue().
  */
 static void
 scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
     int zio_flags, const zbookmark_phys_t *zb)
 {
 	scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp));
 
 	ASSERT0(BP_IS_GANG(bp));
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 
 	bp2sio(bp, sio, dva_i);
 	sio->sio_flags = zio_flags;
 	sio->sio_zb = *zb;
 
 	queue->q_last_ext_addr = -1;
 	scan_io_queue_insert_impl(queue, sio);
 }
 
 /*
  * Given a set of I/O parameters as discovered by the metadata traversal
  * process, attempts to place the I/O into the sorted queues (if allowed),
  * or immediately executes the I/O.
  */
 static void
 dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
     const zbookmark_phys_t *zb)
 {
 	spa_t *spa = dp->dp_spa;
 
 	ASSERT(!BP_IS_EMBEDDED(bp));
 
 	/*
 	 * Gang blocks are hard to issue sequentially, so we just issue them
 	 * here immediately instead of queuing them.
 	 */
 	if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) {
 		scan_exec_io(dp, bp, zio_flags, zb, NULL);
 		return;
 	}
 
 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
 		dva_t dva;
 		vdev_t *vdev;
 
 		dva = bp->blk_dva[i];
 		vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva));
 		ASSERT(vdev != NULL);
 
 		mutex_enter(&vdev->vdev_scan_io_queue_lock);
 		if (vdev->vdev_scan_io_queue == NULL)
 			vdev->vdev_scan_io_queue = scan_io_queue_create(vdev);
 		ASSERT(dp->dp_scan != NULL);
 		scan_io_queue_insert(vdev->vdev_scan_io_queue, bp,
 		    i, zio_flags, zb);
 		mutex_exit(&vdev->vdev_scan_io_queue_lock);
 	}
 }
 
 static int
 dsl_scan_scrub_cb(dsl_pool_t *dp,
     const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	dsl_scan_t *scn = dp->dp_scan;
 	spa_t *spa = dp->dp_spa;
-	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
+	uint64_t phys_birth = BP_GET_BIRTH(bp);
 	size_t psize = BP_GET_PSIZE(bp);
 	boolean_t needs_io = B_FALSE;
 	int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
 
 	count_block(dp->dp_blkstats, bp);
 	if (phys_birth <= scn->scn_phys.scn_min_txg ||
 	    phys_birth >= scn->scn_phys.scn_max_txg) {
 		count_block_skipped(scn, bp, B_TRUE);
 		return (0);
 	}
 
 	/* Embedded BP's have phys_birth==0, so we reject them above. */
 	ASSERT(!BP_IS_EMBEDDED(bp));
 
 	ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
 	if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
 		zio_flags |= ZIO_FLAG_SCRUB;
 		needs_io = B_TRUE;
 	} else {
 		ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
 		zio_flags |= ZIO_FLAG_RESILVER;
 		needs_io = B_FALSE;
 	}
 
 	/* If it's an intent log block, failure is expected. */
 	if (zb->zb_level == ZB_ZIL_LEVEL)
 		zio_flags |= ZIO_FLAG_SPECULATIVE;
 
 	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
 		const dva_t *dva = &bp->blk_dva[d];
 
 		/*
 		 * Keep track of how much data we've examined so that
 		 * zpool(8) status can make useful progress reports.
 		 */
 		uint64_t asize = DVA_GET_ASIZE(dva);
 		scn->scn_phys.scn_examined += asize;
 		spa->spa_scan_pass_exam += asize;
 
 		/* if it's a resilver, this may not be in the target range */
 		if (!needs_io)
 			needs_io = dsl_scan_need_resilver(spa, dva, psize,
 			    phys_birth);
 	}
 
 	if (needs_io && !zfs_no_scrub_io) {
 		dsl_scan_enqueue(dp, bp, zio_flags, zb);
 	} else {
 		count_block_skipped(scn, bp, B_TRUE);
 	}
 
 	/* do not relocate this block */
 	return (0);
 }
 
 static void
 dsl_scan_scrub_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	dsl_scan_io_queue_t *queue = zio->io_private;
 
 	abd_free(zio->io_abd);
 
 	if (queue == NULL) {
 		mutex_enter(&spa->spa_scrub_lock);
 		ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
 		spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
 		cv_broadcast(&spa->spa_scrub_io_cv);
 		mutex_exit(&spa->spa_scrub_lock);
 	} else {
 		mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock);
 		ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp));
 		queue->q_inflight_bytes -= BP_GET_PSIZE(bp);
 		cv_broadcast(&queue->q_zio_cv);
 		mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock);
 	}
 
 	if (zio->io_error && (zio->io_error != ECKSUM ||
 	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
 		if (dsl_errorscrubbing(spa->spa_dsl_pool) &&
 		    !dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) {
 			atomic_inc_64(&spa->spa_dsl_pool->dp_scan
 			    ->errorscrub_phys.dep_errors);
 		} else {
 			atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys
 			    .scn_errors);
 		}
 	}
 }
 
 /*
  * Given a scanning zio's information, executes the zio. The zio need
  * not necessarily be only sortable, this function simply executes the
  * zio, no matter what it is. The optional queue argument allows the
  * caller to specify that they want per top level vdev IO rate limiting
  * instead of the legacy global limiting.
  */
 static void
 scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
     const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue)
 {
 	spa_t *spa = dp->dp_spa;
 	dsl_scan_t *scn = dp->dp_scan;
 	size_t size = BP_GET_PSIZE(bp);
 	abd_t *data = abd_alloc_for_io(size, B_FALSE);
 	zio_t *pio;
 
 	if (queue == NULL) {
 		ASSERT3U(scn->scn_maxinflight_bytes, >, 0);
 		mutex_enter(&spa->spa_scrub_lock);
 		while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)
 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 		spa->spa_scrub_inflight += BP_GET_PSIZE(bp);
 		mutex_exit(&spa->spa_scrub_lock);
 		pio = scn->scn_zio_root;
 	} else {
 		kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
 
 		ASSERT3U(queue->q_maxinflight_bytes, >, 0);
 		mutex_enter(q_lock);
 		while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes)
 			cv_wait(&queue->q_zio_cv, q_lock);
 		queue->q_inflight_bytes += BP_GET_PSIZE(bp);
 		pio = queue->q_zio;
 		mutex_exit(q_lock);
 	}
 
 	ASSERT(pio != NULL);
 	count_block_issued(spa, bp, queue == NULL);
 	zio_nowait(zio_read(pio, spa, bp, data, size, dsl_scan_scrub_done,
 	    queue, ZIO_PRIORITY_SCRUB, zio_flags, zb));
 }
 
 /*
  * This is the primary extent sorting algorithm. We balance two parameters:
  * 1) how many bytes of I/O are in an extent
  * 2) how well the extent is filled with I/O (as a fraction of its total size)
  * Since we allow extents to have gaps between their constituent I/Os, it's
  * possible to have a fairly large extent that contains the same amount of
  * I/O bytes than a much smaller extent, which just packs the I/O more tightly.
  * The algorithm sorts based on a score calculated from the extent's size,
  * the relative fill volume (in %) and a "fill weight" parameter that controls
  * the split between whether we prefer larger extents or more well populated
  * extents:
  *
  * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT)
  *
  * Example:
  * 1) assume extsz = 64 MiB
  * 2) assume fill = 32 MiB (extent is half full)
  * 3) assume fill_weight = 3
  * 4)	SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100
  *	SCORE = 32M + (50 * 3 * 32M) / 100
  *	SCORE = 32M + (4800M / 100)
  *	SCORE = 32M + 48M
  *	         ^     ^
  *	         |     +--- final total relative fill-based score
  *	         +--------- final total fill-based score
  *	SCORE = 80M
  *
  * As can be seen, at fill_ratio=3, the algorithm is slightly biased towards
  * extents that are more completely filled (in a 3:2 ratio) vs just larger.
  * Note that as an optimization, we replace multiplication and division by
  * 100 with bitshifting by 7 (which effectively multiplies and divides by 128).
  *
  * Since we do not care if one extent is only few percent better than another,
  * compress the score into 6 bits via binary logarithm AKA highbit64() and
  * put into otherwise unused due to ashift high bits of offset.  This allows
  * to reduce q_exts_by_size B-tree elements to only 64 bits and compare them
  * with single operation.  Plus it makes scrubs more sequential and reduces
  * chances that minor extent change move it within the B-tree.
  */
 __attribute__((always_inline)) inline
 static int
 ext_size_compare(const void *x, const void *y)
 {
 	const uint64_t *a = x, *b = y;
 
 	return (TREE_CMP(*a, *b));
 }
 
 ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf, uint64_t,
     ext_size_compare)
 
 static void
 ext_size_create(range_tree_t *rt, void *arg)
 {
 	(void) rt;
 	zfs_btree_t *size_tree = arg;
 
 	zfs_btree_create(size_tree, ext_size_compare, ext_size_find_in_buf,
 	    sizeof (uint64_t));
 }
 
 static void
 ext_size_destroy(range_tree_t *rt, void *arg)
 {
 	(void) rt;
 	zfs_btree_t *size_tree = arg;
 	ASSERT0(zfs_btree_numnodes(size_tree));
 
 	zfs_btree_destroy(size_tree);
 }
 
 static uint64_t
 ext_size_value(range_tree_t *rt, range_seg_gap_t *rsg)
 {
 	(void) rt;
 	uint64_t size = rsg->rs_end - rsg->rs_start;
 	uint64_t score = rsg->rs_fill + ((((rsg->rs_fill << 7) / size) *
 	    fill_weight * rsg->rs_fill) >> 7);
 	ASSERT3U(rt->rt_shift, >=, 8);
 	return (((uint64_t)(64 - highbit64(score)) << 56) | rsg->rs_start);
 }
 
 static void
 ext_size_add(range_tree_t *rt, range_seg_t *rs, void *arg)
 {
 	zfs_btree_t *size_tree = arg;
 	ASSERT3U(rt->rt_type, ==, RANGE_SEG_GAP);
 	uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs);
 	zfs_btree_add(size_tree, &v);
 }
 
 static void
 ext_size_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
 {
 	zfs_btree_t *size_tree = arg;
 	ASSERT3U(rt->rt_type, ==, RANGE_SEG_GAP);
 	uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs);
 	zfs_btree_remove(size_tree, &v);
 }
 
 static void
 ext_size_vacate(range_tree_t *rt, void *arg)
 {
 	zfs_btree_t *size_tree = arg;
 	zfs_btree_clear(size_tree);
 	zfs_btree_destroy(size_tree);
 
 	ext_size_create(rt, arg);
 }
 
 static const range_tree_ops_t ext_size_ops = {
 	.rtop_create = ext_size_create,
 	.rtop_destroy = ext_size_destroy,
 	.rtop_add = ext_size_add,
 	.rtop_remove = ext_size_remove,
 	.rtop_vacate = ext_size_vacate
 };
 
 /*
  * Comparator for the q_sios_by_addr tree. Sorting is simply performed
  * based on LBA-order (from lowest to highest).
  */
 static int
 sio_addr_compare(const void *x, const void *y)
 {
 	const scan_io_t *a = x, *b = y;
 
 	return (TREE_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b)));
 }
 
 /* IO queues are created on demand when they are needed. */
 static dsl_scan_io_queue_t *
 scan_io_queue_create(vdev_t *vd)
 {
 	dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
 	dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP);
 
 	q->q_scn = scn;
 	q->q_vd = vd;
 	q->q_sio_memused = 0;
 	q->q_last_ext_addr = -1;
 	cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
 	q->q_exts_by_addr = range_tree_create_gap(&ext_size_ops, RANGE_SEG_GAP,
 	    &q->q_exts_by_size, 0, vd->vdev_ashift, zfs_scan_max_ext_gap);
 	avl_create(&q->q_sios_by_addr, sio_addr_compare,
 	    sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
 
 	return (q);
 }
 
 /*
  * Destroys a scan queue and all segments and scan_io_t's contained in it.
  * No further execution of I/O occurs, anything pending in the queue is
  * simply freed without being executed.
  */
 void
 dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
 {
 	dsl_scan_t *scn = queue->q_scn;
 	scan_io_t *sio;
 	void *cookie = NULL;
 
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 
 	if (!avl_is_empty(&queue->q_sios_by_addr))
 		atomic_add_64(&scn->scn_queues_pending, -1);
 	while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) !=
 	    NULL) {
 		ASSERT(range_tree_contains(queue->q_exts_by_addr,
 		    SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio)));
 		queue->q_sio_memused -= SIO_GET_MUSED(sio);
 		sio_free(sio);
 	}
 
 	ASSERT0(queue->q_sio_memused);
 	range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
 	range_tree_destroy(queue->q_exts_by_addr);
 	avl_destroy(&queue->q_sios_by_addr);
 	cv_destroy(&queue->q_zio_cv);
 
 	kmem_free(queue, sizeof (*queue));
 }
 
 /*
  * Properly transfers a dsl_scan_queue_t from `svd' to `tvd'. This is
  * called on behalf of vdev_top_transfer when creating or destroying
  * a mirror vdev due to zpool attach/detach.
  */
 void
 dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd)
 {
 	mutex_enter(&svd->vdev_scan_io_queue_lock);
 	mutex_enter(&tvd->vdev_scan_io_queue_lock);
 
 	VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL);
 	tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue;
 	svd->vdev_scan_io_queue = NULL;
 	if (tvd->vdev_scan_io_queue != NULL)
 		tvd->vdev_scan_io_queue->q_vd = tvd;
 
 	mutex_exit(&tvd->vdev_scan_io_queue_lock);
 	mutex_exit(&svd->vdev_scan_io_queue_lock);
 }
 
 static void
 scan_io_queues_destroy(dsl_scan_t *scn)
 {
 	vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
 
 	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
 		vdev_t *tvd = rvd->vdev_child[i];
 
 		mutex_enter(&tvd->vdev_scan_io_queue_lock);
 		if (tvd->vdev_scan_io_queue != NULL)
 			dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue);
 		tvd->vdev_scan_io_queue = NULL;
 		mutex_exit(&tvd->vdev_scan_io_queue_lock);
 	}
 }
 
 static void
 dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 	vdev_t *vdev;
 	kmutex_t *q_lock;
 	dsl_scan_io_queue_t *queue;
 	scan_io_t *srch_sio, *sio;
 	avl_index_t idx;
 	uint64_t start, size;
 
 	vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i]));
 	ASSERT(vdev != NULL);
 	q_lock = &vdev->vdev_scan_io_queue_lock;
 	queue = vdev->vdev_scan_io_queue;
 
 	mutex_enter(q_lock);
 	if (queue == NULL) {
 		mutex_exit(q_lock);
 		return;
 	}
 
 	srch_sio = sio_alloc(BP_GET_NDVAS(bp));
 	bp2sio(bp, srch_sio, dva_i);
 	start = SIO_GET_OFFSET(srch_sio);
 	size = SIO_GET_ASIZE(srch_sio);
 
 	/*
 	 * We can find the zio in two states:
 	 * 1) Cold, just sitting in the queue of zio's to be issued at
 	 *	some point in the future. In this case, all we do is
 	 *	remove the zio from the q_sios_by_addr tree, decrement
 	 *	its data volume from the containing range_seg_t and
 	 *	resort the q_exts_by_size tree to reflect that the
 	 *	range_seg_t has lost some of its 'fill'. We don't shorten
 	 *	the range_seg_t - this is usually rare enough not to be
 	 *	worth the extra hassle of trying keep track of precise
 	 *	extent boundaries.
 	 * 2) Hot, where the zio is currently in-flight in
 	 *	dsl_scan_issue_ios. In this case, we can't simply
 	 *	reach in and stop the in-flight zio's, so we instead
 	 *	block the caller. Eventually, dsl_scan_issue_ios will
 	 *	be done with issuing the zio's it gathered and will
 	 *	signal us.
 	 */
 	sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
 	sio_free(srch_sio);
 
 	if (sio != NULL) {
 		blkptr_t tmpbp;
 
 		/* Got it while it was cold in the queue */
 		ASSERT3U(start, ==, SIO_GET_OFFSET(sio));
 		ASSERT3U(size, ==, SIO_GET_ASIZE(sio));
 		avl_remove(&queue->q_sios_by_addr, sio);
 		if (avl_is_empty(&queue->q_sios_by_addr))
 			atomic_add_64(&scn->scn_queues_pending, -1);
 		queue->q_sio_memused -= SIO_GET_MUSED(sio);
 
 		ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
 		range_tree_remove_fill(queue->q_exts_by_addr, start, size);
 
 		/* count the block as though we skipped it */
 		sio2bp(sio, &tmpbp);
 		count_block_skipped(scn, &tmpbp, B_FALSE);
 
 		sio_free(sio);
 	}
 	mutex_exit(q_lock);
 }
 
 /*
  * Callback invoked when a zio_free() zio is executing. This needs to be
  * intercepted to prevent the zio from deallocating a particular portion
  * of disk space and it then getting reallocated and written to, while we
  * still have it queued up for processing.
  */
 void
 dsl_scan_freed(spa_t *spa, const blkptr_t *bp)
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 
 	ASSERT(!BP_IS_EMBEDDED(bp));
 	ASSERT(scn != NULL);
 	if (!dsl_scan_is_running(scn))
 		return;
 
 	for (int i = 0; i < BP_GET_NDVAS(bp); i++)
 		dsl_scan_freed_dva(spa, bp, i);
 }
 
 /*
  * Check if a vdev needs resilvering (non-empty DTL), if so, and resilver has
  * not started, start it. Otherwise, only restart if max txg in DTL range is
  * greater than the max txg in the current scan. If the DTL max is less than
  * the scan max, then the vdev has not missed any new data since the resilver
  * started, so a restart is not needed.
  */
 void
 dsl_scan_assess_vdev(dsl_pool_t *dp, vdev_t *vd)
 {
 	uint64_t min, max;
 
 	if (!vdev_resilver_needed(vd, &min, &max))
 		return;
 
 	if (!dsl_scan_resilvering(dp)) {
 		spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER);
 		return;
 	}
 
 	if (max <= dp->dp_scan->scn_phys.scn_max_txg)
 		return;
 
 	/* restart is needed, check if it can be deferred */
 	if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))
 		vdev_defer_resilver(vd);
 	else
 		spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER);
 }
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_vdev_limit, U64, ZMOD_RW,
 	"Max bytes in flight per leaf vdev for scrubs and resilvers");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scrub_min_time_ms, UINT, ZMOD_RW,
 	"Min millisecs to scrub per txg");
 
 ZFS_MODULE_PARAM(zfs, zfs_, obsolete_min_time_ms, UINT, ZMOD_RW,
 	"Min millisecs to obsolete per txg");
 
 ZFS_MODULE_PARAM(zfs, zfs_, free_min_time_ms, UINT, ZMOD_RW,
 	"Min millisecs to free per txg");
 
 ZFS_MODULE_PARAM(zfs, zfs_, resilver_min_time_ms, UINT, ZMOD_RW,
 	"Min millisecs to resilver per txg");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_suspend_progress, INT, ZMOD_RW,
 	"Set to prevent scans from progressing");
 
 ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_io, INT, ZMOD_RW,
 	"Set to disable scrub I/O");
 
 ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_prefetch, INT, ZMOD_RW,
 	"Set to disable scrub prefetching");
 
 ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, U64, ZMOD_RW,
 	"Max number of blocks freed in one txg");
 
 ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, U64, ZMOD_RW,
 	"Max number of dedup blocks freed in one txg");
 
 ZFS_MODULE_PARAM(zfs, zfs_, free_bpobj_enabled, INT, ZMOD_RW,
 	"Enable processing of the free_bpobj");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_blkstats, INT, ZMOD_RW,
 	"Enable block statistics calculation during scrub");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_fact, UINT, ZMOD_RW,
 	"Fraction of RAM for scan hard limit");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_issue_strategy, UINT, ZMOD_RW,
 	"IO issuing strategy during scrubbing. 0 = default, 1 = LBA, 2 = size");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_legacy, INT, ZMOD_RW,
 	"Scrub using legacy non-sequential method");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_checkpoint_intval, UINT, ZMOD_RW,
 	"Scan progress on-disk checkpointing interval");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_max_ext_gap, U64, ZMOD_RW,
 	"Max gap in bytes between sequential scrub / resilver I/Os");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_soft_fact, UINT, ZMOD_RW,
 	"Fraction of hard limit used as soft limit");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_strict_mem_lim, INT, ZMOD_RW,
 	"Tunable to attempt to reduce lock contention");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, UINT, ZMOD_RW,
 	"Tunable to adjust bias towards more filled segments during scans");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW,
 	"Tunable to report resilver performance over the last N txgs");
 
 ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW,
 	"Process all resilvers immediately");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scrub_error_blocks_per_txg, UINT, ZMOD_RW,
 	"Error blocks to be scrubbed in one txg");
 /* END CSTYLED */
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 7237fa8eeb59..c4aa98ced433 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -1,6283 +1,6284 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/space_map.h>
 #include <sys/metaslab_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_draid.h>
 #include <sys/zio.h>
 #include <sys/spa_impl.h>
 #include <sys/zfeature.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/zap.h>
 #include <sys/btree.h>
 
 #define	GANG_ALLOCATION(flags) \
 	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
 
 /*
  * Metaslab granularity, in bytes. This is roughly similar to what would be
  * referred to as the "stripe size" in traditional RAID arrays. In normal
  * operation, we will try to write this amount of data to each disk before
  * moving on to the next top-level vdev.
  */
 static uint64_t metaslab_aliquot = 1024 * 1024;
 
 /*
  * For testing, make some blocks above a certain size be gang blocks.
  */
 uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
 
 /*
  * Of blocks of size >= metaslab_force_ganging, actually gang them this often.
  */
 uint_t metaslab_force_ganging_pct = 3;
 
 /*
  * In pools where the log space map feature is not enabled we touch
  * multiple metaslabs (and their respective space maps) with each
  * transaction group. Thus, we benefit from having a small space map
  * block size since it allows us to issue more I/O operations scattered
  * around the disk. So a sane default for the space map block size
  * is 8~16K.
  */
 int zfs_metaslab_sm_blksz_no_log = (1 << 14);
 
 /*
  * When the log space map feature is enabled, we accumulate a lot of
  * changes per metaslab that are flushed once in a while so we benefit
  * from a bigger block size like 128K for the metaslab space maps.
  */
 int zfs_metaslab_sm_blksz_with_log = (1 << 17);
 
 /*
  * The in-core space map representation is more compact than its on-disk form.
  * The zfs_condense_pct determines how much more compact the in-core
  * space map representation must be before we compact it on-disk.
  * Values should be greater than or equal to 100.
  */
 uint_t zfs_condense_pct = 200;
 
 /*
  * Condensing a metaslab is not guaranteed to actually reduce the amount of
  * space used on disk. In particular, a space map uses data in increments of
  * MAX(1 << ashift, space_map_blksz), so a metaslab might use the
  * same number of blocks after condensing. Since the goal of condensing is to
  * reduce the number of IOPs required to read the space map, we only want to
  * condense when we can be sure we will reduce the number of blocks used by the
  * space map. Unfortunately, we cannot precisely compute whether or not this is
  * the case in metaslab_should_condense since we are holding ms_lock. Instead,
  * we apply the following heuristic: do not condense a spacemap unless the
  * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
  * blocks.
  */
 static const int zfs_metaslab_condense_block_threshold = 4;
 
 /*
  * The zfs_mg_noalloc_threshold defines which metaslab groups should
  * be eligible for allocation. The value is defined as a percentage of
  * free space. Metaslab groups that have more free space than
  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
  * a metaslab group's free space is less than or equal to the
  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
  * groups are allowed to accept allocations. Gang blocks are always
  * eligible to allocate on any metaslab group. The default value of 0 means
  * no metaslab group will be excluded based on this criterion.
  */
 static uint_t zfs_mg_noalloc_threshold = 0;
 
 /*
  * Metaslab groups are considered eligible for allocations if their
  * fragmentation metric (measured as a percentage) is less than or
  * equal to zfs_mg_fragmentation_threshold. If a metaslab group
  * exceeds this threshold then it will be skipped unless all metaslab
  * groups within the metaslab class have also crossed this threshold.
  *
  * This tunable was introduced to avoid edge cases where we continue
  * allocating from very fragmented disks in our pool while other, less
  * fragmented disks, exists. On the other hand, if all disks in the
  * pool are uniformly approaching the threshold, the threshold can
  * be a speed bump in performance, where we keep switching the disks
  * that we allocate from (e.g. we allocate some segments from disk A
  * making it bypassing the threshold while freeing segments from disk
  * B getting its fragmentation below the threshold).
  *
  * Empirically, we've seen that our vdev selection for allocations is
  * good enough that fragmentation increases uniformly across all vdevs
  * the majority of the time. Thus we set the threshold percentage high
  * enough to avoid hitting the speed bump on pools that are being pushed
  * to the edge.
  */
 static uint_t zfs_mg_fragmentation_threshold = 95;
 
 /*
  * Allow metaslabs to keep their active state as long as their fragmentation
  * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
  * active metaslab that exceeds this threshold will no longer keep its active
  * status allowing better metaslabs to be selected.
  */
 static uint_t zfs_metaslab_fragmentation_threshold = 70;
 
 /*
  * When set will load all metaslabs when pool is first opened.
  */
 int metaslab_debug_load = B_FALSE;
 
 /*
  * When set will prevent metaslabs from being unloaded.
  */
 static int metaslab_debug_unload = B_FALSE;
 
 /*
  * Minimum size which forces the dynamic allocator to change
  * it's allocation strategy.  Once the space map cannot satisfy
  * an allocation of this size then it switches to using more
  * aggressive strategy (i.e search by size rather than offset).
  */
 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
 
 /*
  * The minimum free space, in percent, which must be available
  * in a space map to continue allocations in a first-fit fashion.
  * Once the space map's free space drops below this level we dynamically
  * switch to using best-fit allocations.
  */
 uint_t metaslab_df_free_pct = 4;
 
 /*
  * Maximum distance to search forward from the last offset. Without this
  * limit, fragmented pools can see >100,000 iterations and
  * metaslab_block_picker() becomes the performance limiting factor on
  * high-performance storage.
  *
  * With the default setting of 16MB, we typically see less than 500
  * iterations, even with very fragmented, ashift=9 pools. The maximum number
  * of iterations possible is:
  *     metaslab_df_max_search / (2 * (1<<ashift))
  * With the default setting of 16MB this is 16*1024 (with ashift=9) or
  * 2048 (with ashift=12).
  */
 static uint_t metaslab_df_max_search = 16 * 1024 * 1024;
 
 /*
  * Forces the metaslab_block_picker function to search for at least this many
  * segments forwards until giving up on finding a segment that the allocation
  * will fit into.
  */
 static const uint32_t metaslab_min_search_count = 100;
 
 /*
  * If we are not searching forward (due to metaslab_df_max_search,
  * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable
  * controls what segment is used.  If it is set, we will use the largest free
  * segment.  If it is not set, we will use a segment of exactly the requested
  * size (or larger).
  */
 static int metaslab_df_use_largest_segment = B_FALSE;
 
 /*
  * These tunables control how long a metaslab will remain loaded after the
  * last allocation from it.  A metaslab can't be unloaded until at least
  * metaslab_unload_delay TXG's and metaslab_unload_delay_ms milliseconds
  * have elapsed.  However, zfs_metaslab_mem_limit may cause it to be
  * unloaded sooner.  These settings are intended to be generous -- to keep
  * metaslabs loaded for a long time, reducing the rate of metaslab loading.
  */
 static uint_t metaslab_unload_delay = 32;
 static uint_t metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */
 
 /*
  * Max number of metaslabs per group to preload.
  */
 uint_t metaslab_preload_limit = 10;
 
 /*
  * Enable/disable preloading of metaslab.
  */
 static int metaslab_preload_enabled = B_TRUE;
 
 /*
  * Enable/disable fragmentation weighting on metaslabs.
  */
 static int metaslab_fragmentation_factor_enabled = B_TRUE;
 
 /*
  * Enable/disable lba weighting (i.e. outer tracks are given preference).
  */
 static int metaslab_lba_weighting_enabled = B_TRUE;
 
 /*
  * Enable/disable metaslab group biasing.
  */
 static int metaslab_bias_enabled = B_TRUE;
 
 /*
  * Enable/disable remapping of indirect DVAs to their concrete vdevs.
  */
 static const boolean_t zfs_remap_blkptr_enable = B_TRUE;
 
 /*
  * Enable/disable segment-based metaslab selection.
  */
 static int zfs_metaslab_segment_weight_enabled = B_TRUE;
 
 /*
  * When using segment-based metaslab selection, we will continue
  * allocating from the active metaslab until we have exhausted
  * zfs_metaslab_switch_threshold of its buckets.
  */
 static int zfs_metaslab_switch_threshold = 2;
 
 /*
  * Internal switch to enable/disable the metaslab allocation tracing
  * facility.
  */
 static const boolean_t metaslab_trace_enabled = B_FALSE;
 
 /*
  * Maximum entries that the metaslab allocation tracing facility will keep
  * in a given list when running in non-debug mode. We limit the number
  * of entries in non-debug mode to prevent us from using up too much memory.
  * The limit should be sufficiently large that we don't expect any allocation
  * to every exceed this value. In debug mode, the system will panic if this
  * limit is ever reached allowing for further investigation.
  */
 static const uint64_t metaslab_trace_max_entries = 5000;
 
 /*
  * Maximum number of metaslabs per group that can be disabled
  * simultaneously.
  */
 static const int max_disabled_ms = 3;
 
 /*
  * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
  * To avoid 64-bit overflow, don't set above UINT32_MAX.
  */
 static uint64_t zfs_metaslab_max_size_cache_sec = 1 * 60 * 60; /* 1 hour */
 
 /*
  * Maximum percentage of memory to use on storing loaded metaslabs. If loading
  * a metaslab would take it over this percentage, the oldest selected metaslab
  * is automatically unloaded.
  */
 static uint_t zfs_metaslab_mem_limit = 25;
 
 /*
  * Force the per-metaslab range trees to use 64-bit integers to store
  * segments. Used for debugging purposes.
  */
 static const boolean_t zfs_metaslab_force_large_segs = B_FALSE;
 
 /*
  * By default we only store segments over a certain size in the size-sorted
  * metaslab trees (ms_allocatable_by_size and
  * ms_unflushed_frees_by_size). This dramatically reduces memory usage and
  * improves load and unload times at the cost of causing us to use slightly
  * larger segments than we would otherwise in some cases.
  */
 static const uint32_t metaslab_by_size_min_shift = 14;
 
 /*
  * If not set, we will first try normal allocation.  If that fails then
  * we will do a gang allocation.  If that fails then we will do a "try hard"
  * gang allocation.  If that fails then we will have a multi-layer gang
  * block.
  *
  * If set, we will first try normal allocation.  If that fails then
  * we will do a "try hard" allocation.  If that fails we will do a gang
  * allocation.  If that fails we will do a "try hard" gang allocation.  If
  * that fails then we will have a multi-layer gang block.
  */
 static int zfs_metaslab_try_hard_before_gang = B_FALSE;
 
 /*
  * When not trying hard, we only consider the best zfs_metaslab_find_max_tries
  * metaslabs.  This improves performance, especially when there are many
  * metaslabs per vdev and the allocation can't actually be satisfied (so we
  * would otherwise iterate all the metaslabs).  If there is a metaslab with a
  * worse weight but it can actually satisfy the allocation, we won't find it
  * until trying hard.  This may happen if the worse metaslab is not loaded
  * (and the true weight is better than we have calculated), or due to weight
  * bucketization.  E.g. we are looking for a 60K segment, and the best
  * metaslabs all have free segments in the 32-63K bucket, but the best
  * zfs_metaslab_find_max_tries metaslabs have ms_max_size <60KB, and a
  * subsequent metaslab has ms_max_size >60KB (but fewer segments in this
  * bucket, and therefore a lower weight).
  */
 static uint_t zfs_metaslab_find_max_tries = 100;
 
 static uint64_t metaslab_weight(metaslab_t *, boolean_t);
 static void metaslab_set_fragmentation(metaslab_t *, boolean_t);
 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
 
 static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
 static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
 static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
 static unsigned int metaslab_idx_func(multilist_t *, void *);
 static void metaslab_evict(metaslab_t *, uint64_t);
 static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg);
 kmem_cache_t *metaslab_alloc_trace_cache;
 
 typedef struct metaslab_stats {
 	kstat_named_t metaslabstat_trace_over_limit;
 	kstat_named_t metaslabstat_reload_tree;
 	kstat_named_t metaslabstat_too_many_tries;
 	kstat_named_t metaslabstat_try_hard;
 } metaslab_stats_t;
 
 static metaslab_stats_t metaslab_stats = {
 	{ "trace_over_limit",		KSTAT_DATA_UINT64 },
 	{ "reload_tree",		KSTAT_DATA_UINT64 },
 	{ "too_many_tries",		KSTAT_DATA_UINT64 },
 	{ "try_hard",			KSTAT_DATA_UINT64 },
 };
 
 #define	METASLABSTAT_BUMP(stat) \
 	atomic_inc_64(&metaslab_stats.stat.value.ui64);
 
 
 static kstat_t *metaslab_ksp;
 
 void
 metaslab_stat_init(void)
 {
 	ASSERT(metaslab_alloc_trace_cache == NULL);
 	metaslab_alloc_trace_cache = kmem_cache_create(
 	    "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
 	    0, NULL, NULL, NULL, NULL, NULL, 0);
 	metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats",
 	    "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) /
 	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 	if (metaslab_ksp != NULL) {
 		metaslab_ksp->ks_data = &metaslab_stats;
 		kstat_install(metaslab_ksp);
 	}
 }
 
 void
 metaslab_stat_fini(void)
 {
 	if (metaslab_ksp != NULL) {
 		kstat_delete(metaslab_ksp);
 		metaslab_ksp = NULL;
 	}
 
 	kmem_cache_destroy(metaslab_alloc_trace_cache);
 	metaslab_alloc_trace_cache = NULL;
 }
 
 /*
  * ==========================================================================
  * Metaslab classes
  * ==========================================================================
  */
 metaslab_class_t *
 metaslab_class_create(spa_t *spa, const metaslab_ops_t *ops)
 {
 	metaslab_class_t *mc;
 
 	mc = kmem_zalloc(offsetof(metaslab_class_t,
 	    mc_allocator[spa->spa_alloc_count]), KM_SLEEP);
 
 	mc->mc_spa = spa;
 	mc->mc_ops = ops;
 	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
 	multilist_create(&mc->mc_metaslab_txg_list, sizeof (metaslab_t),
 	    offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
 		metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
 		mca->mca_rotor = NULL;
 		zfs_refcount_create_tracked(&mca->mca_alloc_slots);
 	}
 
 	return (mc);
 }
 
 void
 metaslab_class_destroy(metaslab_class_t *mc)
 {
 	spa_t *spa = mc->mc_spa;
 
 	ASSERT(mc->mc_alloc == 0);
 	ASSERT(mc->mc_deferred == 0);
 	ASSERT(mc->mc_space == 0);
 	ASSERT(mc->mc_dspace == 0);
 
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
 		metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
 		ASSERT(mca->mca_rotor == NULL);
 		zfs_refcount_destroy(&mca->mca_alloc_slots);
 	}
 	mutex_destroy(&mc->mc_lock);
 	multilist_destroy(&mc->mc_metaslab_txg_list);
 	kmem_free(mc, offsetof(metaslab_class_t,
 	    mc_allocator[spa->spa_alloc_count]));
 }
 
 int
 metaslab_class_validate(metaslab_class_t *mc)
 {
 	metaslab_group_t *mg;
 	vdev_t *vd;
 
 	/*
 	 * Must hold one of the spa_config locks.
 	 */
 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
 	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
 
 	if ((mg = mc->mc_allocator[0].mca_rotor) == NULL)
 		return (0);
 
 	do {
 		vd = mg->mg_vd;
 		ASSERT(vd->vdev_mg != NULL);
 		ASSERT3P(vd->vdev_top, ==, vd);
 		ASSERT3P(mg->mg_class, ==, mc);
 		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
 	} while ((mg = mg->mg_next) != mc->mc_allocator[0].mca_rotor);
 
 	return (0);
 }
 
 static void
 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
 {
 	atomic_add_64(&mc->mc_alloc, alloc_delta);
 	atomic_add_64(&mc->mc_deferred, defer_delta);
 	atomic_add_64(&mc->mc_space, space_delta);
 	atomic_add_64(&mc->mc_dspace, dspace_delta);
 }
 
 uint64_t
 metaslab_class_get_alloc(metaslab_class_t *mc)
 {
 	return (mc->mc_alloc);
 }
 
 uint64_t
 metaslab_class_get_deferred(metaslab_class_t *mc)
 {
 	return (mc->mc_deferred);
 }
 
 uint64_t
 metaslab_class_get_space(metaslab_class_t *mc)
 {
 	return (mc->mc_space);
 }
 
 uint64_t
 metaslab_class_get_dspace(metaslab_class_t *mc)
 {
 	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
 }
 
 void
 metaslab_class_histogram_verify(metaslab_class_t *mc)
 {
 	spa_t *spa = mc->mc_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t *mc_hist;
 	int i;
 
 	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 		return;
 
 	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 	    KM_SLEEP);
 
 	mutex_enter(&mc->mc_lock);
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		metaslab_group_t *mg = vdev_get_mg(tvd, mc);
 
 		/*
 		 * Skip any holes, uninitialized top-levels, or
 		 * vdevs that are not in this metalab class.
 		 */
 		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 		    mg->mg_class != mc) {
 			continue;
 		}
 
 		IMPLY(mg == mg->mg_vd->vdev_log_mg,
 		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
 
 		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
 			mc_hist[i] += mg->mg_histogram[i];
 	}
 
 	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
 		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
 	}
 
 	mutex_exit(&mc->mc_lock);
 	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 }
 
 /*
  * Calculate the metaslab class's fragmentation metric. The metric
  * is weighted based on the space contribution of each metaslab group.
  * The return value will be a number between 0 and 100 (inclusive), or
  * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
  * zfs_frag_table for more information about the metric.
  */
 uint64_t
 metaslab_class_fragmentation(metaslab_class_t *mc)
 {
 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 	uint64_t fragmentation = 0;
 
 	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		metaslab_group_t *mg = tvd->vdev_mg;
 
 		/*
 		 * Skip any holes, uninitialized top-levels,
 		 * or vdevs that are not in this metalab class.
 		 */
 		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 		    mg->mg_class != mc) {
 			continue;
 		}
 
 		/*
 		 * If a metaslab group does not contain a fragmentation
 		 * metric then just bail out.
 		 */
 		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
 			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 			return (ZFS_FRAG_INVALID);
 		}
 
 		/*
 		 * Determine how much this metaslab_group is contributing
 		 * to the overall pool fragmentation metric.
 		 */
 		fragmentation += mg->mg_fragmentation *
 		    metaslab_group_get_space(mg);
 	}
 	fragmentation /= metaslab_class_get_space(mc);
 
 	ASSERT3U(fragmentation, <=, 100);
 	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 	return (fragmentation);
 }
 
 /*
  * Calculate the amount of expandable space that is available in
  * this metaslab class. If a device is expanded then its expandable
  * space will be the amount of allocatable space that is currently not
  * part of this metaslab class.
  */
 uint64_t
 metaslab_class_expandable_space(metaslab_class_t *mc)
 {
 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 	uint64_t space = 0;
 
 	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		metaslab_group_t *mg = tvd->vdev_mg;
 
 		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 		    mg->mg_class != mc) {
 			continue;
 		}
 
 		/*
 		 * Calculate if we have enough space to add additional
 		 * metaslabs. We report the expandable space in terms
 		 * of the metaslab size since that's the unit of expansion.
 		 */
 		space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
 		    1ULL << tvd->vdev_ms_shift);
 	}
 	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 	return (space);
 }
 
 void
 metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
 {
 	multilist_t *ml = &mc->mc_metaslab_txg_list;
 	for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
 		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
 		metaslab_t *msp = multilist_sublist_head(mls);
 		multilist_sublist_unlock(mls);
 		while (msp != NULL) {
 			mutex_enter(&msp->ms_lock);
 
 			/*
 			 * If the metaslab has been removed from the list
 			 * (which could happen if we were at the memory limit
 			 * and it was evicted during this loop), then we can't
 			 * proceed and we should restart the sublist.
 			 */
 			if (!multilist_link_active(&msp->ms_class_txg_node)) {
 				mutex_exit(&msp->ms_lock);
 				i--;
 				break;
 			}
 			mls = multilist_sublist_lock(ml, i);
 			metaslab_t *next_msp = multilist_sublist_next(mls, msp);
 			multilist_sublist_unlock(mls);
 			if (txg >
 			    msp->ms_selected_txg + metaslab_unload_delay &&
 			    gethrtime() > msp->ms_selected_time +
 			    (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) {
 				metaslab_evict(msp, txg);
 			} else {
 				/*
 				 * Once we've hit a metaslab selected too
 				 * recently to evict, we're done evicting for
 				 * now.
 				 */
 				mutex_exit(&msp->ms_lock);
 				break;
 			}
 			mutex_exit(&msp->ms_lock);
 			msp = next_msp;
 		}
 	}
 }
 
 static int
 metaslab_compare(const void *x1, const void *x2)
 {
 	const metaslab_t *m1 = (const metaslab_t *)x1;
 	const metaslab_t *m2 = (const metaslab_t *)x2;
 
 	int sort1 = 0;
 	int sort2 = 0;
 	if (m1->ms_allocator != -1 && m1->ms_primary)
 		sort1 = 1;
 	else if (m1->ms_allocator != -1 && !m1->ms_primary)
 		sort1 = 2;
 	if (m2->ms_allocator != -1 && m2->ms_primary)
 		sort2 = 1;
 	else if (m2->ms_allocator != -1 && !m2->ms_primary)
 		sort2 = 2;
 
 	/*
 	 * Sort inactive metaslabs first, then primaries, then secondaries. When
 	 * selecting a metaslab to allocate from, an allocator first tries its
 	 * primary, then secondary active metaslab. If it doesn't have active
 	 * metaslabs, or can't allocate from them, it searches for an inactive
 	 * metaslab to activate. If it can't find a suitable one, it will steal
 	 * a primary or secondary metaslab from another allocator.
 	 */
 	if (sort1 < sort2)
 		return (-1);
 	if (sort1 > sort2)
 		return (1);
 
 	int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight);
 	if (likely(cmp))
 		return (cmp);
 
 	IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
 
 	return (TREE_CMP(m1->ms_start, m2->ms_start));
 }
 
 /*
  * ==========================================================================
  * Metaslab groups
  * ==========================================================================
  */
 /*
  * Update the allocatable flag and the metaslab group's capacity.
  * The allocatable flag is set to true if the capacity is below
  * the zfs_mg_noalloc_threshold or has a fragmentation value that is
  * greater than zfs_mg_fragmentation_threshold. If a metaslab group
  * transitions from allocatable to non-allocatable or vice versa then the
  * metaslab group's class is updated to reflect the transition.
  */
 static void
 metaslab_group_alloc_update(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
 	metaslab_class_t *mc = mg->mg_class;
 	vdev_stat_t *vs = &vd->vdev_stat;
 	boolean_t was_allocatable;
 	boolean_t was_initialized;
 
 	ASSERT(vd == vd->vdev_top);
 	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
 	    SCL_ALLOC);
 
 	mutex_enter(&mg->mg_lock);
 	was_allocatable = mg->mg_allocatable;
 	was_initialized = mg->mg_initialized;
 
 	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
 	    (vs->vs_space + 1);
 
 	mutex_enter(&mc->mc_lock);
 
 	/*
 	 * If the metaslab group was just added then it won't
 	 * have any space until we finish syncing out this txg.
 	 * At that point we will consider it initialized and available
 	 * for allocations.  We also don't consider non-activated
 	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
 	 * to be initialized, because they can't be used for allocation.
 	 */
 	mg->mg_initialized = metaslab_group_initialized(mg);
 	if (!was_initialized && mg->mg_initialized) {
 		mc->mc_groups++;
 	} else if (was_initialized && !mg->mg_initialized) {
 		ASSERT3U(mc->mc_groups, >, 0);
 		mc->mc_groups--;
 	}
 	if (mg->mg_initialized)
 		mg->mg_no_free_space = B_FALSE;
 
 	/*
 	 * A metaslab group is considered allocatable if it has plenty
 	 * of free space or is not heavily fragmented. We only take
 	 * fragmentation into account if the metaslab group has a valid
 	 * fragmentation metric (i.e. a value between 0 and 100).
 	 */
 	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
 	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
 	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
 	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
 
 	/*
 	 * The mc_alloc_groups maintains a count of the number of
 	 * groups in this metaslab class that are still above the
 	 * zfs_mg_noalloc_threshold. This is used by the allocating
 	 * threads to determine if they should avoid allocations to
 	 * a given group. The allocator will avoid allocations to a group
 	 * if that group has reached or is below the zfs_mg_noalloc_threshold
 	 * and there are still other groups that are above the threshold.
 	 * When a group transitions from allocatable to non-allocatable or
 	 * vice versa we update the metaslab class to reflect that change.
 	 * When the mc_alloc_groups value drops to 0 that means that all
 	 * groups have reached the zfs_mg_noalloc_threshold making all groups
 	 * eligible for allocations. This effectively means that all devices
 	 * are balanced again.
 	 */
 	if (was_allocatable && !mg->mg_allocatable)
 		mc->mc_alloc_groups--;
 	else if (!was_allocatable && mg->mg_allocatable)
 		mc->mc_alloc_groups++;
 	mutex_exit(&mc->mc_lock);
 
 	mutex_exit(&mg->mg_lock);
 }
 
 int
 metaslab_sort_by_flushed(const void *va, const void *vb)
 {
 	const metaslab_t *a = va;
 	const metaslab_t *b = vb;
 
 	int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg);
 	if (likely(cmp))
 		return (cmp);
 
 	uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id;
 	uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id;
 	cmp = TREE_CMP(a_vdev_id, b_vdev_id);
 	if (cmp)
 		return (cmp);
 
 	return (TREE_CMP(a->ms_id, b->ms_id));
 }
 
 metaslab_group_t *
 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
 {
 	metaslab_group_t *mg;
 
 	mg = kmem_zalloc(offsetof(metaslab_group_t,
 	    mg_allocator[allocators]), KM_SLEEP);
 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 	    sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node));
 	mg->mg_vd = vd;
 	mg->mg_class = mc;
 	mg->mg_activation_count = 0;
 	mg->mg_initialized = B_FALSE;
 	mg->mg_no_free_space = B_TRUE;
 	mg->mg_allocators = allocators;
 
 	for (int i = 0; i < allocators; i++) {
 		metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
 		zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth);
 	}
 
 	return (mg);
 }
 
 void
 metaslab_group_destroy(metaslab_group_t *mg)
 {
 	ASSERT(mg->mg_prev == NULL);
 	ASSERT(mg->mg_next == NULL);
 	/*
 	 * We may have gone below zero with the activation count
 	 * either because we never activated in the first place or
 	 * because we're done, and possibly removing the vdev.
 	 */
 	ASSERT(mg->mg_activation_count <= 0);
 
 	avl_destroy(&mg->mg_metaslab_tree);
 	mutex_destroy(&mg->mg_lock);
 	mutex_destroy(&mg->mg_ms_disabled_lock);
 	cv_destroy(&mg->mg_ms_disabled_cv);
 
 	for (int i = 0; i < mg->mg_allocators; i++) {
 		metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
 		zfs_refcount_destroy(&mga->mga_alloc_queue_depth);
 	}
 	kmem_free(mg, offsetof(metaslab_group_t,
 	    mg_allocator[mg->mg_allocators]));
 }
 
 void
 metaslab_group_activate(metaslab_group_t *mg)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	spa_t *spa = mc->mc_spa;
 	metaslab_group_t *mgprev, *mgnext;
 
 	ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0);
 
 	ASSERT(mg->mg_prev == NULL);
 	ASSERT(mg->mg_next == NULL);
 	ASSERT(mg->mg_activation_count <= 0);
 
 	if (++mg->mg_activation_count <= 0)
 		return;
 
 	mg->mg_aliquot = metaslab_aliquot * MAX(1,
 	    vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd));
 	metaslab_group_alloc_update(mg);
 
 	if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) {
 		mg->mg_prev = mg;
 		mg->mg_next = mg;
 	} else {
 		mgnext = mgprev->mg_next;
 		mg->mg_prev = mgprev;
 		mg->mg_next = mgnext;
 		mgprev->mg_next = mg;
 		mgnext->mg_prev = mg;
 	}
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
 		mc->mc_allocator[i].mca_rotor = mg;
 		mg = mg->mg_next;
 	}
 }
 
 /*
  * Passivate a metaslab group and remove it from the allocation rotor.
  * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
  * a metaslab group. This function will momentarily drop spa_config_locks
  * that are lower than the SCL_ALLOC lock (see comment below).
  */
 void
 metaslab_group_passivate(metaslab_group_t *mg)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	spa_t *spa = mc->mc_spa;
 	metaslab_group_t *mgprev, *mgnext;
 	int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
 
 	ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
 	    (SCL_ALLOC | SCL_ZIO));
 
 	if (--mg->mg_activation_count != 0) {
 		for (int i = 0; i < spa->spa_alloc_count; i++)
 			ASSERT(mc->mc_allocator[i].mca_rotor != mg);
 		ASSERT(mg->mg_prev == NULL);
 		ASSERT(mg->mg_next == NULL);
 		ASSERT(mg->mg_activation_count < 0);
 		return;
 	}
 
 	/*
 	 * The spa_config_lock is an array of rwlocks, ordered as
 	 * follows (from highest to lowest):
 	 *	SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
 	 *	SCL_ZIO > SCL_FREE > SCL_VDEV
 	 * (For more information about the spa_config_lock see spa_misc.c)
 	 * The higher the lock, the broader its coverage. When we passivate
 	 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
 	 * config locks. However, the metaslab group's taskq might be trying
 	 * to preload metaslabs so we must drop the SCL_ZIO lock and any
 	 * lower locks to allow the I/O to complete. At a minimum,
 	 * we continue to hold the SCL_ALLOC lock, which prevents any future
 	 * allocations from taking place and any changes to the vdev tree.
 	 */
 	spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
 	taskq_wait_outstanding(spa->spa_metaslab_taskq, 0);
 	spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
 	metaslab_group_alloc_update(mg);
 	for (int i = 0; i < mg->mg_allocators; i++) {
 		metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
 		metaslab_t *msp = mga->mga_primary;
 		if (msp != NULL) {
 			mutex_enter(&msp->ms_lock);
 			metaslab_passivate(msp,
 			    metaslab_weight_from_range_tree(msp));
 			mutex_exit(&msp->ms_lock);
 		}
 		msp = mga->mga_secondary;
 		if (msp != NULL) {
 			mutex_enter(&msp->ms_lock);
 			metaslab_passivate(msp,
 			    metaslab_weight_from_range_tree(msp));
 			mutex_exit(&msp->ms_lock);
 		}
 	}
 
 	mgprev = mg->mg_prev;
 	mgnext = mg->mg_next;
 
 	if (mg == mgnext) {
 		mgnext = NULL;
 	} else {
 		mgprev->mg_next = mgnext;
 		mgnext->mg_prev = mgprev;
 	}
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
 		if (mc->mc_allocator[i].mca_rotor == mg)
 			mc->mc_allocator[i].mca_rotor = mgnext;
 	}
 
 	mg->mg_prev = NULL;
 	mg->mg_next = NULL;
 }
 
 boolean_t
 metaslab_group_initialized(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
 	vdev_stat_t *vs = &vd->vdev_stat;
 
 	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
 }
 
 uint64_t
 metaslab_group_get_space(metaslab_group_t *mg)
 {
 	/*
 	 * Note that the number of nodes in mg_metaslab_tree may be one less
 	 * than vdev_ms_count, due to the embedded log metaslab.
 	 */
 	mutex_enter(&mg->mg_lock);
 	uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree);
 	mutex_exit(&mg->mg_lock);
 	return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count);
 }
 
 void
 metaslab_group_histogram_verify(metaslab_group_t *mg)
 {
 	uint64_t *mg_hist;
 	avl_tree_t *t = &mg->mg_metaslab_tree;
 	uint64_t ashift = mg->mg_vd->vdev_ashift;
 
 	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 		return;
 
 	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 	    KM_SLEEP);
 
 	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
 	    SPACE_MAP_HISTOGRAM_SIZE + ashift);
 
 	mutex_enter(&mg->mg_lock);
 	for (metaslab_t *msp = avl_first(t);
 	    msp != NULL; msp = AVL_NEXT(t, msp)) {
 		VERIFY3P(msp->ms_group, ==, mg);
 		/* skip if not active */
 		if (msp->ms_sm == NULL)
 			continue;
 
 		for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 			mg_hist[i + ashift] +=
 			    msp->ms_sm->sm_phys->smp_histogram[i];
 		}
 	}
 
 	for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
 		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
 
 	mutex_exit(&mg->mg_lock);
 
 	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 }
 
 static void
 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	uint64_t ashift = mg->mg_vd->vdev_ashift;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	if (msp->ms_sm == NULL)
 		return;
 
 	mutex_enter(&mg->mg_lock);
 	mutex_enter(&mc->mc_lock);
 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 		IMPLY(mg == mg->mg_vd->vdev_log_mg,
 		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
 		mg->mg_histogram[i + ashift] +=
 		    msp->ms_sm->sm_phys->smp_histogram[i];
 		mc->mc_histogram[i + ashift] +=
 		    msp->ms_sm->sm_phys->smp_histogram[i];
 	}
 	mutex_exit(&mc->mc_lock);
 	mutex_exit(&mg->mg_lock);
 }
 
 void
 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	uint64_t ashift = mg->mg_vd->vdev_ashift;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	if (msp->ms_sm == NULL)
 		return;
 
 	mutex_enter(&mg->mg_lock);
 	mutex_enter(&mc->mc_lock);
 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 		ASSERT3U(mg->mg_histogram[i + ashift], >=,
 		    msp->ms_sm->sm_phys->smp_histogram[i]);
 		ASSERT3U(mc->mc_histogram[i + ashift], >=,
 		    msp->ms_sm->sm_phys->smp_histogram[i]);
 		IMPLY(mg == mg->mg_vd->vdev_log_mg,
 		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
 
 		mg->mg_histogram[i + ashift] -=
 		    msp->ms_sm->sm_phys->smp_histogram[i];
 		mc->mc_histogram[i + ashift] -=
 		    msp->ms_sm->sm_phys->smp_histogram[i];
 	}
 	mutex_exit(&mc->mc_lock);
 	mutex_exit(&mg->mg_lock);
 }
 
 static void
 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
 {
 	ASSERT(msp->ms_group == NULL);
 	mutex_enter(&mg->mg_lock);
 	msp->ms_group = mg;
 	msp->ms_weight = 0;
 	avl_add(&mg->mg_metaslab_tree, msp);
 	mutex_exit(&mg->mg_lock);
 
 	mutex_enter(&msp->ms_lock);
 	metaslab_group_histogram_add(mg, msp);
 	mutex_exit(&msp->ms_lock);
 }
 
 static void
 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
 {
 	mutex_enter(&msp->ms_lock);
 	metaslab_group_histogram_remove(mg, msp);
 	mutex_exit(&msp->ms_lock);
 
 	mutex_enter(&mg->mg_lock);
 	ASSERT(msp->ms_group == mg);
 	avl_remove(&mg->mg_metaslab_tree, msp);
 
 	metaslab_class_t *mc = msp->ms_group->mg_class;
 	multilist_sublist_t *mls =
 	    multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
 	if (multilist_link_active(&msp->ms_class_txg_node))
 		multilist_sublist_remove(mls, msp);
 	multilist_sublist_unlock(mls);
 
 	msp->ms_group = NULL;
 	mutex_exit(&mg->mg_lock);
 }
 
 static void
 metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(MUTEX_HELD(&mg->mg_lock));
 	ASSERT(msp->ms_group == mg);
 
 	avl_remove(&mg->mg_metaslab_tree, msp);
 	msp->ms_weight = weight;
 	avl_add(&mg->mg_metaslab_tree, msp);
 
 }
 
 static void
 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 {
 	/*
 	 * Although in principle the weight can be any value, in
 	 * practice we do not use values in the range [1, 511].
 	 */
 	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	mutex_enter(&mg->mg_lock);
 	metaslab_group_sort_impl(mg, msp, weight);
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Calculate the fragmentation for a given metaslab group. We can use
  * a simple average here since all metaslabs within the group must have
  * the same size. The return value will be a value between 0 and 100
  * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this
  * group have a fragmentation metric.
  */
 uint64_t
 metaslab_group_fragmentation(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
 	uint64_t fragmentation = 0;
 	uint64_t valid_ms = 0;
 
 	for (int m = 0; m < vd->vdev_ms_count; m++) {
 		metaslab_t *msp = vd->vdev_ms[m];
 
 		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
 			continue;
 		if (msp->ms_group != mg)
 			continue;
 
 		valid_ms++;
 		fragmentation += msp->ms_fragmentation;
 	}
 
 	if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
 		return (ZFS_FRAG_INVALID);
 
 	fragmentation /= valid_ms;
 	ASSERT3U(fragmentation, <=, 100);
 	return (fragmentation);
 }
 
 /*
  * Determine if a given metaslab group should skip allocations. A metaslab
  * group should avoid allocations if its free capacity is less than the
  * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
  * zfs_mg_fragmentation_threshold and there is at least one metaslab group
  * that can still handle allocations. If the allocation throttle is enabled
  * then we skip allocations to devices that have reached their maximum
  * allocation queue depth unless the selected metaslab group is the only
  * eligible group remaining.
  */
 static boolean_t
 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
     int flags, uint64_t psize, int allocator, int d)
 {
 	spa_t *spa = mg->mg_vd->vdev_spa;
 	metaslab_class_t *mc = mg->mg_class;
 
 	/*
 	 * We can only consider skipping this metaslab group if it's
 	 * in the normal metaslab class and there are other metaslab
 	 * groups to select from. Otherwise, we always consider it eligible
 	 * for allocations.
 	 */
 	if ((mc != spa_normal_class(spa) &&
 	    mc != spa_special_class(spa) &&
 	    mc != spa_dedup_class(spa)) ||
 	    mc->mc_groups <= 1)
 		return (B_TRUE);
 
 	/*
 	 * If the metaslab group's mg_allocatable flag is set (see comments
 	 * in metaslab_group_alloc_update() for more information) and
 	 * the allocation throttle is disabled then allow allocations to this
 	 * device. However, if the allocation throttle is enabled then
 	 * check if we have reached our allocation limit (mga_alloc_queue_depth)
 	 * to determine if we should allow allocations to this metaslab group.
 	 * If all metaslab groups are no longer considered allocatable
 	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
 	 * gang block size then we allow allocations on this metaslab group
 	 * regardless of the mg_allocatable or throttle settings.
 	 */
 	if (mg->mg_allocatable) {
 		metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 		int64_t qdepth;
 		uint64_t qmax = mga->mga_cur_max_alloc_queue_depth;
 
 		if (!mc->mc_alloc_throttle_enabled)
 			return (B_TRUE);
 
 		/*
 		 * If this metaslab group does not have any free space, then
 		 * there is no point in looking further.
 		 */
 		if (mg->mg_no_free_space)
 			return (B_FALSE);
 
 		/*
 		 * Some allocations (e.g., those coming from device removal
 		 * where the * allocations are not even counted in the
 		 * metaslab * allocation queues) are allowed to bypass
 		 * the throttle.
 		 */
 		if (flags & METASLAB_DONT_THROTTLE)
 			return (B_TRUE);
 
 		/*
 		 * Relax allocation throttling for ditto blocks.  Due to
 		 * random imbalances in allocation it tends to push copies
 		 * to one vdev, that looks a bit better at the moment.
 		 */
 		qmax = qmax * (4 + d) / 4;
 
 		qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth);
 
 		/*
 		 * If this metaslab group is below its qmax or it's
 		 * the only allocatable metaslab group, then attempt
 		 * to allocate from it.
 		 */
 		if (qdepth < qmax || mc->mc_alloc_groups == 1)
 			return (B_TRUE);
 		ASSERT3U(mc->mc_alloc_groups, >, 1);
 
 		/*
 		 * Since this metaslab group is at or over its qmax, we
 		 * need to determine if there are metaslab groups after this
 		 * one that might be able to handle this allocation. This is
 		 * racy since we can't hold the locks for all metaslab
 		 * groups at the same time when we make this check.
 		 */
 		for (metaslab_group_t *mgp = mg->mg_next;
 		    mgp != rotor; mgp = mgp->mg_next) {
 			metaslab_group_allocator_t *mgap =
 			    &mgp->mg_allocator[allocator];
 			qmax = mgap->mga_cur_max_alloc_queue_depth;
 			qmax = qmax * (4 + d) / 4;
 			qdepth =
 			    zfs_refcount_count(&mgap->mga_alloc_queue_depth);
 
 			/*
 			 * If there is another metaslab group that
 			 * might be able to handle the allocation, then
 			 * we return false so that we skip this group.
 			 */
 			if (qdepth < qmax && !mgp->mg_no_free_space)
 				return (B_FALSE);
 		}
 
 		/*
 		 * We didn't find another group to handle the allocation
 		 * so we can't skip this metaslab group even though
 		 * we are at or over our qmax.
 		 */
 		return (B_TRUE);
 
 	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
 		return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 /*
  * ==========================================================================
  * Range tree callbacks
  * ==========================================================================
  */
 
 /*
  * Comparison function for the private size-ordered tree using 32-bit
  * ranges. Tree is sorted by size, larger sizes at the end of the tree.
  */
 __attribute__((always_inline)) inline
 static int
 metaslab_rangesize32_compare(const void *x1, const void *x2)
 {
 	const range_seg32_t *r1 = x1;
 	const range_seg32_t *r2 = x2;
 
 	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
 	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
 
 	int cmp = TREE_CMP(rs_size1, rs_size2);
 
 	return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
 }
 
 /*
  * Comparison function for the private size-ordered tree using 64-bit
  * ranges. Tree is sorted by size, larger sizes at the end of the tree.
  */
 __attribute__((always_inline)) inline
 static int
 metaslab_rangesize64_compare(const void *x1, const void *x2)
 {
 	const range_seg64_t *r1 = x1;
 	const range_seg64_t *r2 = x2;
 
 	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
 	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
 
 	int cmp = TREE_CMP(rs_size1, rs_size2);
 
 	return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
 }
 
 typedef struct metaslab_rt_arg {
 	zfs_btree_t *mra_bt;
 	uint32_t mra_floor_shift;
 } metaslab_rt_arg_t;
 
 struct mssa_arg {
 	range_tree_t *rt;
 	metaslab_rt_arg_t *mra;
 };
 
 static void
 metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size)
 {
 	struct mssa_arg *mssap = arg;
 	range_tree_t *rt = mssap->rt;
 	metaslab_rt_arg_t *mrap = mssap->mra;
 	range_seg_max_t seg = {0};
 	rs_set_start(&seg, rt, start);
 	rs_set_end(&seg, rt, start + size);
 	metaslab_rt_add(rt, &seg, mrap);
 }
 
 static void
 metaslab_size_tree_full_load(range_tree_t *rt)
 {
 	metaslab_rt_arg_t *mrap = rt->rt_arg;
 	METASLABSTAT_BUMP(metaslabstat_reload_tree);
 	ASSERT0(zfs_btree_numnodes(mrap->mra_bt));
 	mrap->mra_floor_shift = 0;
 	struct mssa_arg arg = {0};
 	arg.rt = rt;
 	arg.mra = mrap;
 	range_tree_walk(rt, metaslab_size_sorted_add, &arg);
 }
 
 
 ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf,
     range_seg32_t, metaslab_rangesize32_compare)
 
 ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf,
     range_seg64_t, metaslab_rangesize64_compare)
 
 /*
  * Create any block allocator specific components. The current allocators
  * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
  */
 static void
 metaslab_rt_create(range_tree_t *rt, void *arg)
 {
 	metaslab_rt_arg_t *mrap = arg;
 	zfs_btree_t *size_tree = mrap->mra_bt;
 
 	size_t size;
 	int (*compare) (const void *, const void *);
 	bt_find_in_buf_f bt_find;
 	switch (rt->rt_type) {
 	case RANGE_SEG32:
 		size = sizeof (range_seg32_t);
 		compare = metaslab_rangesize32_compare;
 		bt_find = metaslab_rt_find_rangesize32_in_buf;
 		break;
 	case RANGE_SEG64:
 		size = sizeof (range_seg64_t);
 		compare = metaslab_rangesize64_compare;
 		bt_find = metaslab_rt_find_rangesize64_in_buf;
 		break;
 	default:
 		panic("Invalid range seg type %d", rt->rt_type);
 	}
 	zfs_btree_create(size_tree, compare, bt_find, size);
 	mrap->mra_floor_shift = metaslab_by_size_min_shift;
 }
 
 static void
 metaslab_rt_destroy(range_tree_t *rt, void *arg)
 {
 	(void) rt;
 	metaslab_rt_arg_t *mrap = arg;
 	zfs_btree_t *size_tree = mrap->mra_bt;
 
 	zfs_btree_destroy(size_tree);
 	kmem_free(mrap, sizeof (*mrap));
 }
 
 static void
 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
 {
 	metaslab_rt_arg_t *mrap = arg;
 	zfs_btree_t *size_tree = mrap->mra_bt;
 
 	if (rs_get_end(rs, rt) - rs_get_start(rs, rt) <
 	    (1ULL << mrap->mra_floor_shift))
 		return;
 
 	zfs_btree_add(size_tree, rs);
 }
 
 static void
 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
 {
 	metaslab_rt_arg_t *mrap = arg;
 	zfs_btree_t *size_tree = mrap->mra_bt;
 
 	if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1ULL <<
 	    mrap->mra_floor_shift))
 		return;
 
 	zfs_btree_remove(size_tree, rs);
 }
 
 static void
 metaslab_rt_vacate(range_tree_t *rt, void *arg)
 {
 	metaslab_rt_arg_t *mrap = arg;
 	zfs_btree_t *size_tree = mrap->mra_bt;
 	zfs_btree_clear(size_tree);
 	zfs_btree_destroy(size_tree);
 
 	metaslab_rt_create(rt, arg);
 }
 
 static const range_tree_ops_t metaslab_rt_ops = {
 	.rtop_create = metaslab_rt_create,
 	.rtop_destroy = metaslab_rt_destroy,
 	.rtop_add = metaslab_rt_add,
 	.rtop_remove = metaslab_rt_remove,
 	.rtop_vacate = metaslab_rt_vacate
 };
 
 /*
  * ==========================================================================
  * Common allocator routines
  * ==========================================================================
  */
 
 /*
  * Return the maximum contiguous segment within the metaslab.
  */
 uint64_t
 metaslab_largest_allocatable(metaslab_t *msp)
 {
 	zfs_btree_t *t = &msp->ms_allocatable_by_size;
 	range_seg_t *rs;
 
 	if (t == NULL)
 		return (0);
 	if (zfs_btree_numnodes(t) == 0)
 		metaslab_size_tree_full_load(msp->ms_allocatable);
 
 	rs = zfs_btree_last(t, NULL);
 	if (rs == NULL)
 		return (0);
 
 	return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs,
 	    msp->ms_allocatable));
 }
 
 /*
  * Return the maximum contiguous segment within the unflushed frees of this
  * metaslab.
  */
 static uint64_t
 metaslab_largest_unflushed_free(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if (msp->ms_unflushed_frees == NULL)
 		return (0);
 
 	if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0)
 		metaslab_size_tree_full_load(msp->ms_unflushed_frees);
 	range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size,
 	    NULL);
 	if (rs == NULL)
 		return (0);
 
 	/*
 	 * When a range is freed from the metaslab, that range is added to
 	 * both the unflushed frees and the deferred frees. While the block
 	 * will eventually be usable, if the metaslab were loaded the range
 	 * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE
 	 * txgs had passed.  As a result, when attempting to estimate an upper
 	 * bound for the largest currently-usable free segment in the
 	 * metaslab, we need to not consider any ranges currently in the defer
 	 * trees. This algorithm approximates the largest available chunk in
 	 * the largest range in the unflushed_frees tree by taking the first
 	 * chunk.  While this may be a poor estimate, it should only remain so
 	 * briefly and should eventually self-correct as frees are no longer
 	 * deferred. Similar logic applies to the ms_freed tree. See
 	 * metaslab_load() for more details.
 	 *
 	 * There are two primary sources of inaccuracy in this estimate. Both
 	 * are tolerated for performance reasons. The first source is that we
 	 * only check the largest segment for overlaps. Smaller segments may
 	 * have more favorable overlaps with the other trees, resulting in
 	 * larger usable chunks.  Second, we only look at the first chunk in
 	 * the largest segment; there may be other usable chunks in the
 	 * largest segment, but we ignore them.
 	 */
 	uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees);
 	uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart;
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		uint64_t start = 0;
 		uint64_t size = 0;
 		boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart,
 		    rsize, &start, &size);
 		if (found) {
 			if (rstart == start)
 				return (0);
 			rsize = start - rstart;
 		}
 	}
 
 	uint64_t start = 0;
 	uint64_t size = 0;
 	boolean_t found = range_tree_find_in(msp->ms_freed, rstart,
 	    rsize, &start, &size);
 	if (found)
 		rsize = start - rstart;
 
 	return (rsize);
 }
 
 static range_seg_t *
 metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start,
     uint64_t size, zfs_btree_index_t *where)
 {
 	range_seg_t *rs;
 	range_seg_max_t rsearch;
 
 	rs_set_start(&rsearch, rt, start);
 	rs_set_end(&rsearch, rt, start + size);
 
 	rs = zfs_btree_find(t, &rsearch, where);
 	if (rs == NULL) {
 		rs = zfs_btree_next(t, where, where);
 	}
 
 	return (rs);
 }
 
 /*
  * This is a helper function that can be used by the allocator to find a
  * suitable block to allocate. This will search the specified B-tree looking
  * for a block that matches the specified criteria.
  */
 static uint64_t
 metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size,
     uint64_t max_search)
 {
 	if (*cursor == 0)
 		*cursor = rt->rt_start;
 	zfs_btree_t *bt = &rt->rt_root;
 	zfs_btree_index_t where;
 	range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where);
 	uint64_t first_found;
 	int count_searched = 0;
 
 	if (rs != NULL)
 		first_found = rs_get_start(rs, rt);
 
 	while (rs != NULL && (rs_get_start(rs, rt) - first_found <=
 	    max_search || count_searched < metaslab_min_search_count)) {
 		uint64_t offset = rs_get_start(rs, rt);
 		if (offset + size <= rs_get_end(rs, rt)) {
 			*cursor = offset + size;
 			return (offset);
 		}
 		rs = zfs_btree_next(bt, &where, &where);
 		count_searched++;
 	}
 
 	*cursor = 0;
 	return (-1ULL);
 }
 
 static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size);
 static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size);
 static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size);
 metaslab_ops_t *metaslab_allocator(spa_t *spa);
 
 static metaslab_ops_t metaslab_allocators[] = {
 	{ "dynamic", metaslab_df_alloc },
 	{ "cursor", metaslab_cf_alloc },
 	{ "new-dynamic", metaslab_ndf_alloc },
 };
 
 static int
 spa_find_allocator_byname(const char *val)
 {
 	int a = ARRAY_SIZE(metaslab_allocators) - 1;
 	if (strcmp("new-dynamic", val) == 0)
 		return (-1); /* remove when ndf is working */
 	for (; a >= 0; a--) {
 		if (strcmp(val, metaslab_allocators[a].msop_name) == 0)
 			return (a);
 	}
 	return (-1);
 }
 
 void
 spa_set_allocator(spa_t *spa, const char *allocator)
 {
 	int a = spa_find_allocator_byname(allocator);
 	if (a < 0) a = 0;
 	spa->spa_active_allocator = a;
 	zfs_dbgmsg("spa allocator: %s\n", metaslab_allocators[a].msop_name);
 }
 
 int
 spa_get_allocator(spa_t *spa)
 {
 	return (spa->spa_active_allocator);
 }
 
 #if defined(_KERNEL)
 int
 param_set_active_allocator_common(const char *val)
 {
 	char *p;
 
 	if (val == NULL)
 		return (SET_ERROR(EINVAL));
 
 	if ((p = strchr(val, '\n')) != NULL)
 		*p = '\0';
 
 	int a = spa_find_allocator_byname(val);
 	if (a < 0)
 		return (SET_ERROR(EINVAL));
 
 	zfs_active_allocator = metaslab_allocators[a].msop_name;
 	return (0);
 }
 #endif
 
 metaslab_ops_t *
 metaslab_allocator(spa_t *spa)
 {
 	int allocator = spa_get_allocator(spa);
 	return (&metaslab_allocators[allocator]);
 }
 
 /*
  * ==========================================================================
  * Dynamic Fit (df) block allocator
  *
  * Search for a free chunk of at least this size, starting from the last
  * offset (for this alignment of block) looking for up to
  * metaslab_df_max_search bytes (16MB).  If a large enough free chunk is not
  * found within 16MB, then return a free chunk of exactly the requested size (or
  * larger).
  *
  * If it seems like searching from the last offset will be unproductive, skip
  * that and just return a free chunk of exactly the requested size (or larger).
  * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct.  This
  * mechanism is probably not very useful and may be removed in the future.
  *
  * The behavior when not searching can be changed to return the largest free
  * chunk, instead of a free chunk of exactly the requested size, by setting
  * metaslab_df_use_largest_segment.
  * ==========================================================================
  */
 static uint64_t
 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 {
 	/*
 	 * Find the largest power of 2 block size that evenly divides the
 	 * requested size. This is used to try to allocate blocks with similar
 	 * alignment from the same area of the metaslab (i.e. same cursor
 	 * bucket) but it does not guarantee that other allocations sizes
 	 * may exist in the same region.
 	 */
 	uint64_t align = size & -size;
 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
 	range_tree_t *rt = msp->ms_allocatable;
 	uint_t free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 	uint64_t offset;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * If we're running low on space, find a segment based on size,
 	 * rather than iterating based on offset.
 	 */
 	if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
 	    free_pct < metaslab_df_free_pct) {
 		offset = -1;
 	} else {
 		offset = metaslab_block_picker(rt,
 		    cursor, size, metaslab_df_max_search);
 	}
 
 	if (offset == -1) {
 		range_seg_t *rs;
 		if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0)
 			metaslab_size_tree_full_load(msp->ms_allocatable);
 
 		if (metaslab_df_use_largest_segment) {
 			/* use largest free segment */
 			rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL);
 		} else {
 			zfs_btree_index_t where;
 			/* use segment of this size, or next largest */
 			rs = metaslab_block_find(&msp->ms_allocatable_by_size,
 			    rt, msp->ms_start, size, &where);
 		}
 		if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs,
 		    rt)) {
 			offset = rs_get_start(rs, rt);
 			*cursor = offset + size;
 		}
 	}
 
 	return (offset);
 }
 
 /*
  * ==========================================================================
  * Cursor fit block allocator -
  * Select the largest region in the metaslab, set the cursor to the beginning
  * of the range and the cursor_end to the end of the range. As allocations
  * are made advance the cursor. Continue allocating from the cursor until
  * the range is exhausted and then find a new range.
  * ==========================================================================
  */
 static uint64_t
 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
 {
 	range_tree_t *rt = msp->ms_allocatable;
 	zfs_btree_t *t = &msp->ms_allocatable_by_size;
 	uint64_t *cursor = &msp->ms_lbas[0];
 	uint64_t *cursor_end = &msp->ms_lbas[1];
 	uint64_t offset = 0;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	ASSERT3U(*cursor_end, >=, *cursor);
 
 	if ((*cursor + size) > *cursor_end) {
 		range_seg_t *rs;
 
 		if (zfs_btree_numnodes(t) == 0)
 			metaslab_size_tree_full_load(msp->ms_allocatable);
 		rs = zfs_btree_last(t, NULL);
 		if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) <
 		    size)
 			return (-1ULL);
 
 		*cursor = rs_get_start(rs, rt);
 		*cursor_end = rs_get_end(rs, rt);
 	}
 
 	offset = *cursor;
 	*cursor += size;
 
 	return (offset);
 }
 
 /*
  * ==========================================================================
  * New dynamic fit allocator -
  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
  * contiguous blocks. If no region is found then just use the largest segment
  * that remains.
  * ==========================================================================
  */
 
 /*
  * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
  * to request from the allocator.
  */
 uint64_t metaslab_ndf_clump_shift = 4;
 
 static uint64_t
 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
 {
 	zfs_btree_t *t = &msp->ms_allocatable->rt_root;
 	range_tree_t *rt = msp->ms_allocatable;
 	zfs_btree_index_t where;
 	range_seg_t *rs;
 	range_seg_max_t rsearch;
 	uint64_t hbit = highbit64(size);
 	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
 	uint64_t max_size = metaslab_largest_allocatable(msp);
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if (max_size < size)
 		return (-1ULL);
 
 	rs_set_start(&rsearch, rt, *cursor);
 	rs_set_end(&rsearch, rt, *cursor + size);
 
 	rs = zfs_btree_find(t, &rsearch, &where);
 	if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) {
 		t = &msp->ms_allocatable_by_size;
 
 		rs_set_start(&rsearch, rt, 0);
 		rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit +
 		    metaslab_ndf_clump_shift)));
 
 		rs = zfs_btree_find(t, &rsearch, &where);
 		if (rs == NULL)
 			rs = zfs_btree_next(t, &where, &where);
 		ASSERT(rs != NULL);
 	}
 
 	if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) {
 		*cursor = rs_get_start(rs, rt) + size;
 		return (rs_get_start(rs, rt));
 	}
 	return (-1ULL);
 }
 
 /*
  * ==========================================================================
  * Metaslabs
  * ==========================================================================
  */
 
 /*
  * Wait for any in-progress metaslab loads to complete.
  */
 static void
 metaslab_load_wait(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	while (msp->ms_loading) {
 		ASSERT(!msp->ms_loaded);
 		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
 	}
 }
 
 /*
  * Wait for any in-progress flushing to complete.
  */
 static void
 metaslab_flush_wait(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	while (msp->ms_flushing)
 		cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
 }
 
 static unsigned int
 metaslab_idx_func(multilist_t *ml, void *arg)
 {
 	metaslab_t *msp = arg;
 
 	/*
 	 * ms_id values are allocated sequentially, so full 64bit
 	 * division would be a waste of time, so limit it to 32 bits.
 	 */
 	return ((unsigned int)msp->ms_id % multilist_get_num_sublists(ml));
 }
 
 uint64_t
 metaslab_allocated_space(metaslab_t *msp)
 {
 	return (msp->ms_allocated_space);
 }
 
 /*
  * Verify that the space accounting on disk matches the in-core range_trees.
  */
 static void
 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	uint64_t allocating = 0;
 	uint64_t sm_free_space, msp_free_space;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(!msp->ms_condensing);
 
 	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
 		return;
 
 	/*
 	 * We can only verify the metaslab space when we're called
 	 * from syncing context with a loaded metaslab that has an
 	 * allocated space map. Calling this in non-syncing context
 	 * does not provide a consistent view of the metaslab since
 	 * we're performing allocations in the future.
 	 */
 	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
 	    !msp->ms_loaded)
 		return;
 
 	/*
 	 * Even though the smp_alloc field can get negative,
 	 * when it comes to a metaslab's space map, that should
 	 * never be the case.
 	 */
 	ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
 
 	ASSERT3U(space_map_allocated(msp->ms_sm), >=,
 	    range_tree_space(msp->ms_unflushed_frees));
 
 	ASSERT3U(metaslab_allocated_space(msp), ==,
 	    space_map_allocated(msp->ms_sm) +
 	    range_tree_space(msp->ms_unflushed_allocs) -
 	    range_tree_space(msp->ms_unflushed_frees));
 
 	sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
 
 	/*
 	 * Account for future allocations since we would have
 	 * already deducted that space from the ms_allocatable.
 	 */
 	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
 		allocating +=
 		    range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
 	}
 	ASSERT3U(allocating + msp->ms_allocated_this_txg, ==,
 	    msp->ms_allocating_total);
 
 	ASSERT3U(msp->ms_deferspace, ==,
 	    range_tree_space(msp->ms_defer[0]) +
 	    range_tree_space(msp->ms_defer[1]));
 
 	msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
 	    msp->ms_deferspace + range_tree_space(msp->ms_freed);
 
 	VERIFY3U(sm_free_space, ==, msp_free_space);
 }
 
 static void
 metaslab_aux_histograms_clear(metaslab_t *msp)
 {
 	/*
 	 * Auxiliary histograms are only cleared when resetting them,
 	 * which can only happen while the metaslab is loaded.
 	 */
 	ASSERT(msp->ms_loaded);
 
 	memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
 	for (int t = 0; t < TXG_DEFER_SIZE; t++)
 		memset(msp->ms_deferhist[t], 0, sizeof (msp->ms_deferhist[t]));
 }
 
 static void
 metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
     range_tree_t *rt)
 {
 	/*
 	 * This is modeled after space_map_histogram_add(), so refer to that
 	 * function for implementation details. We want this to work like
 	 * the space map histogram, and not the range tree histogram, as we
 	 * are essentially constructing a delta that will be later subtracted
 	 * from the space map histogram.
 	 */
 	int idx = 0;
 	for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
 		ASSERT3U(i, >=, idx + shift);
 		histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
 
 		if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
 			ASSERT3U(idx + shift, ==, i);
 			idx++;
 			ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
 		}
 	}
 }
 
 /*
  * Called at every sync pass that the metaslab gets synced.
  *
  * The reason is that we want our auxiliary histograms to be updated
  * wherever the metaslab's space map histogram is updated. This way
  * we stay consistent on which parts of the metaslab space map's
  * histogram are currently not available for allocations (e.g because
  * they are in the defer, freed, and freeing trees).
  */
 static void
 metaslab_aux_histograms_update(metaslab_t *msp)
 {
 	space_map_t *sm = msp->ms_sm;
 	ASSERT(sm != NULL);
 
 	/*
 	 * This is similar to the metaslab's space map histogram updates
 	 * that take place in metaslab_sync(). The only difference is that
 	 * we only care about segments that haven't made it into the
 	 * ms_allocatable tree yet.
 	 */
 	if (msp->ms_loaded) {
 		metaslab_aux_histograms_clear(msp);
 
 		metaslab_aux_histogram_add(msp->ms_synchist,
 		    sm->sm_shift, msp->ms_freed);
 
 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 			metaslab_aux_histogram_add(msp->ms_deferhist[t],
 			    sm->sm_shift, msp->ms_defer[t]);
 		}
 	}
 
 	metaslab_aux_histogram_add(msp->ms_synchist,
 	    sm->sm_shift, msp->ms_freeing);
 }
 
 /*
  * Called every time we are done syncing (writing to) the metaslab,
  * i.e. at the end of each sync pass.
  * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
  */
 static void
 metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	space_map_t *sm = msp->ms_sm;
 
 	if (sm == NULL) {
 		/*
 		 * We came here from metaslab_init() when creating/opening a
 		 * pool, looking at a metaslab that hasn't had any allocations
 		 * yet.
 		 */
 		return;
 	}
 
 	/*
 	 * This is similar to the actions that we take for the ms_freed
 	 * and ms_defer trees in metaslab_sync_done().
 	 */
 	uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
 	if (defer_allowed) {
 		memcpy(msp->ms_deferhist[hist_index], msp->ms_synchist,
 		    sizeof (msp->ms_synchist));
 	} else {
 		memset(msp->ms_deferhist[hist_index], 0,
 		    sizeof (msp->ms_deferhist[hist_index]));
 	}
 	memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
 }
 
 /*
  * Ensure that the metaslab's weight and fragmentation are consistent
  * with the contents of the histogram (either the range tree's histogram
  * or the space map's depending whether the metaslab is loaded).
  */
 static void
 metaslab_verify_weight_and_frag(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
 		return;
 
 	/*
 	 * We can end up here from vdev_remove_complete(), in which case we
 	 * cannot do these assertions because we hold spa config locks and
 	 * thus we are not allowed to read from the DMU.
 	 *
 	 * We check if the metaslab group has been removed and if that's
 	 * the case we return immediately as that would mean that we are
 	 * here from the aforementioned code path.
 	 */
 	if (msp->ms_group == NULL)
 		return;
 
 	/*
 	 * Devices being removed always return a weight of 0 and leave
 	 * fragmentation and ms_max_size as is - there is nothing for
 	 * us to verify here.
 	 */
 	vdev_t *vd = msp->ms_group->mg_vd;
 	if (vd->vdev_removing)
 		return;
 
 	/*
 	 * If the metaslab is dirty it probably means that we've done
 	 * some allocations or frees that have changed our histograms
 	 * and thus the weight.
 	 */
 	for (int t = 0; t < TXG_SIZE; t++) {
 		if (txg_list_member(&vd->vdev_ms_list, msp, t))
 			return;
 	}
 
 	/*
 	 * This verification checks that our in-memory state is consistent
 	 * with what's on disk. If the pool is read-only then there aren't
 	 * any changes and we just have the initially-loaded state.
 	 */
 	if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
 		return;
 
 	/* some extra verification for in-core tree if you can */
 	if (msp->ms_loaded) {
 		range_tree_stat_verify(msp->ms_allocatable);
 		VERIFY(space_map_histogram_verify(msp->ms_sm,
 		    msp->ms_allocatable));
 	}
 
 	uint64_t weight = msp->ms_weight;
 	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
 	boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
 	uint64_t frag = msp->ms_fragmentation;
 	uint64_t max_segsize = msp->ms_max_size;
 
 	msp->ms_weight = 0;
 	msp->ms_fragmentation = 0;
 
 	/*
 	 * This function is used for verification purposes and thus should
 	 * not introduce any side-effects/mutations on the system's state.
 	 *
 	 * Regardless of whether metaslab_weight() thinks this metaslab
 	 * should be active or not, we want to ensure that the actual weight
 	 * (and therefore the value of ms_weight) would be the same if it
 	 * was to be recalculated at this point.
 	 *
 	 * In addition we set the nodirty flag so metaslab_weight() does
 	 * not dirty the metaslab for future TXGs (e.g. when trying to
 	 * force condensing to upgrade the metaslab spacemaps).
 	 */
 	msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active;
 
 	VERIFY3U(max_segsize, ==, msp->ms_max_size);
 
 	/*
 	 * If the weight type changed then there is no point in doing
 	 * verification. Revert fields to their original values.
 	 */
 	if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
 	    (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
 		msp->ms_fragmentation = frag;
 		msp->ms_weight = weight;
 		return;
 	}
 
 	VERIFY3U(msp->ms_fragmentation, ==, frag);
 	VERIFY3U(msp->ms_weight, ==, weight);
 }
 
 /*
  * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from
  * this class that was used longest ago, and attempt to unload it.  We don't
  * want to spend too much time in this loop to prevent performance
  * degradation, and we expect that most of the time this operation will
  * succeed. Between that and the normal unloading processing during txg sync,
  * we expect this to keep the metaslab memory usage under control.
  */
 static void
 metaslab_potentially_evict(metaslab_class_t *mc)
 {
 #ifdef _KERNEL
 	uint64_t allmem = arc_all_memory();
 	uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
 	uint64_t size =	spl_kmem_cache_entry_size(zfs_btree_leaf_cache);
 	uint_t tries = 0;
 	for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
 	    tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2;
 	    tries++) {
 		unsigned int idx = multilist_get_random_index(
 		    &mc->mc_metaslab_txg_list);
 		multilist_sublist_t *mls =
 		    multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx);
 		metaslab_t *msp = multilist_sublist_head(mls);
 		multilist_sublist_unlock(mls);
 		while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
 		    inuse * size) {
 			VERIFY3P(mls, ==, multilist_sublist_lock(
 			    &mc->mc_metaslab_txg_list, idx));
 			ASSERT3U(idx, ==,
 			    metaslab_idx_func(&mc->mc_metaslab_txg_list, msp));
 
 			if (!multilist_link_active(&msp->ms_class_txg_node)) {
 				multilist_sublist_unlock(mls);
 				break;
 			}
 			metaslab_t *next_msp = multilist_sublist_next(mls, msp);
 			multilist_sublist_unlock(mls);
 			/*
 			 * If the metaslab is currently loading there are two
 			 * cases. If it's the metaslab we're evicting, we
 			 * can't continue on or we'll panic when we attempt to
 			 * recursively lock the mutex. If it's another
 			 * metaslab that's loading, it can be safely skipped,
 			 * since we know it's very new and therefore not a
 			 * good eviction candidate. We check later once the
 			 * lock is held that the metaslab is fully loaded
 			 * before actually unloading it.
 			 */
 			if (msp->ms_loading) {
 				msp = next_msp;
 				inuse =
 				    spl_kmem_cache_inuse(zfs_btree_leaf_cache);
 				continue;
 			}
 			/*
 			 * We can't unload metaslabs with no spacemap because
 			 * they're not ready to be unloaded yet. We can't
 			 * unload metaslabs with outstanding allocations
 			 * because doing so could cause the metaslab's weight
 			 * to decrease while it's unloaded, which violates an
 			 * invariant that we use to prevent unnecessary
 			 * loading. We also don't unload metaslabs that are
 			 * currently active because they are high-weight
 			 * metaslabs that are likely to be used in the near
 			 * future.
 			 */
 			mutex_enter(&msp->ms_lock);
 			if (msp->ms_allocator == -1 && msp->ms_sm != NULL &&
 			    msp->ms_allocating_total == 0) {
 				metaslab_unload(msp);
 			}
 			mutex_exit(&msp->ms_lock);
 			msp = next_msp;
 			inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
 		}
 	}
 #else
 	(void) mc, (void) zfs_metaslab_mem_limit;
 #endif
 }
 
 static int
 metaslab_load_impl(metaslab_t *msp)
 {
 	int error = 0;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loading);
 	ASSERT(!msp->ms_condensing);
 
 	/*
 	 * We temporarily drop the lock to unblock other operations while we
 	 * are reading the space map. Therefore, metaslab_sync() and
 	 * metaslab_sync_done() can run at the same time as we do.
 	 *
 	 * If we are using the log space maps, metaslab_sync() can't write to
 	 * the metaslab's space map while we are loading as we only write to
 	 * it when we are flushing the metaslab, and that can't happen while
 	 * we are loading it.
 	 *
 	 * If we are not using log space maps though, metaslab_sync() can
 	 * append to the space map while we are loading. Therefore we load
 	 * only entries that existed when we started the load. Additionally,
 	 * metaslab_sync_done() has to wait for the load to complete because
 	 * there are potential races like metaslab_load() loading parts of the
 	 * space map that are currently being appended by metaslab_sync(). If
 	 * we didn't, the ms_allocatable would have entries that
 	 * metaslab_sync_done() would try to re-add later.
 	 *
 	 * That's why before dropping the lock we remember the synced length
 	 * of the metaslab and read up to that point of the space map,
 	 * ignoring entries appended by metaslab_sync() that happen after we
 	 * drop the lock.
 	 */
 	uint64_t length = msp->ms_synced_length;
 	mutex_exit(&msp->ms_lock);
 
 	hrtime_t load_start = gethrtime();
 	metaslab_rt_arg_t *mrap;
 	if (msp->ms_allocatable->rt_arg == NULL) {
 		mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
 	} else {
 		mrap = msp->ms_allocatable->rt_arg;
 		msp->ms_allocatable->rt_ops = NULL;
 		msp->ms_allocatable->rt_arg = NULL;
 	}
 	mrap->mra_bt = &msp->ms_allocatable_by_size;
 	mrap->mra_floor_shift = metaslab_by_size_min_shift;
 
 	if (msp->ms_sm != NULL) {
 		error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
 		    SM_FREE, length);
 
 		/* Now, populate the size-sorted tree. */
 		metaslab_rt_create(msp->ms_allocatable, mrap);
 		msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
 		msp->ms_allocatable->rt_arg = mrap;
 
 		struct mssa_arg arg = {0};
 		arg.rt = msp->ms_allocatable;
 		arg.mra = mrap;
 		range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add,
 		    &arg);
 	} else {
 		/*
 		 * Add the size-sorted tree first, since we don't need to load
 		 * the metaslab from the spacemap.
 		 */
 		metaslab_rt_create(msp->ms_allocatable, mrap);
 		msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
 		msp->ms_allocatable->rt_arg = mrap;
 		/*
 		 * The space map has not been allocated yet, so treat
 		 * all the space in the metaslab as free and add it to the
 		 * ms_allocatable tree.
 		 */
 		range_tree_add(msp->ms_allocatable,
 		    msp->ms_start, msp->ms_size);
 
 		if (msp->ms_new) {
 			/*
 			 * If the ms_sm doesn't exist, this means that this
 			 * metaslab hasn't gone through metaslab_sync() and
 			 * thus has never been dirtied. So we shouldn't
 			 * expect any unflushed allocs or frees from previous
 			 * TXGs.
 			 */
 			ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
 			ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
 		}
 	}
 
 	/*
 	 * We need to grab the ms_sync_lock to prevent metaslab_sync() from
 	 * changing the ms_sm (or log_sm) and the metaslab's range trees
 	 * while we are about to use them and populate the ms_allocatable.
 	 * The ms_lock is insufficient for this because metaslab_sync() doesn't
 	 * hold the ms_lock while writing the ms_checkpointing tree to disk.
 	 */
 	mutex_enter(&msp->ms_sync_lock);
 	mutex_enter(&msp->ms_lock);
 
 	ASSERT(!msp->ms_condensing);
 	ASSERT(!msp->ms_flushing);
 
 	if (error != 0) {
 		mutex_exit(&msp->ms_sync_lock);
 		return (error);
 	}
 
 	ASSERT3P(msp->ms_group, !=, NULL);
 	msp->ms_loaded = B_TRUE;
 
 	/*
 	 * Apply all the unflushed changes to ms_allocatable right
 	 * away so any manipulations we do below have a clear view
 	 * of what is allocated and what is free.
 	 */
 	range_tree_walk(msp->ms_unflushed_allocs,
 	    range_tree_remove, msp->ms_allocatable);
 	range_tree_walk(msp->ms_unflushed_frees,
 	    range_tree_add, msp->ms_allocatable);
 
 	ASSERT3P(msp->ms_group, !=, NULL);
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	if (spa_syncing_log_sm(spa) != NULL) {
 		ASSERT(spa_feature_is_enabled(spa,
 		    SPA_FEATURE_LOG_SPACEMAP));
 
 		/*
 		 * If we use a log space map we add all the segments
 		 * that are in ms_unflushed_frees so they are available
 		 * for allocation.
 		 *
 		 * ms_allocatable needs to contain all free segments
 		 * that are ready for allocations (thus not segments
 		 * from ms_freeing, ms_freed, and the ms_defer trees).
 		 * But if we grab the lock in this code path at a sync
 		 * pass later that 1, then it also contains the
 		 * segments of ms_freed (they were added to it earlier
 		 * in this path through ms_unflushed_frees). So we
 		 * need to remove all the segments that exist in
 		 * ms_freed from ms_allocatable as they will be added
 		 * later in metaslab_sync_done().
 		 *
 		 * When there's no log space map, the ms_allocatable
 		 * correctly doesn't contain any segments that exist
 		 * in ms_freed [see ms_synced_length].
 		 */
 		range_tree_walk(msp->ms_freed,
 		    range_tree_remove, msp->ms_allocatable);
 	}
 
 	/*
 	 * If we are not using the log space map, ms_allocatable
 	 * contains the segments that exist in the ms_defer trees
 	 * [see ms_synced_length]. Thus we need to remove them
 	 * from ms_allocatable as they will be added again in
 	 * metaslab_sync_done().
 	 *
 	 * If we are using the log space map, ms_allocatable still
 	 * contains the segments that exist in the ms_defer trees.
 	 * Not because it read them through the ms_sm though. But
 	 * because these segments are part of ms_unflushed_frees
 	 * whose segments we add to ms_allocatable earlier in this
 	 * code path.
 	 */
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		range_tree_walk(msp->ms_defer[t],
 		    range_tree_remove, msp->ms_allocatable);
 	}
 
 	/*
 	 * Call metaslab_recalculate_weight_and_sort() now that the
 	 * metaslab is loaded so we get the metaslab's real weight.
 	 *
 	 * Unless this metaslab was created with older software and
 	 * has not yet been converted to use segment-based weight, we
 	 * expect the new weight to be better or equal to the weight
 	 * that the metaslab had while it was not loaded. This is
 	 * because the old weight does not take into account the
 	 * consolidation of adjacent segments between TXGs. [see
 	 * comment for ms_synchist and ms_deferhist[] for more info]
 	 */
 	uint64_t weight = msp->ms_weight;
 	uint64_t max_size = msp->ms_max_size;
 	metaslab_recalculate_weight_and_sort(msp);
 	if (!WEIGHT_IS_SPACEBASED(weight))
 		ASSERT3U(weight, <=, msp->ms_weight);
 	msp->ms_max_size = metaslab_largest_allocatable(msp);
 	ASSERT3U(max_size, <=, msp->ms_max_size);
 	hrtime_t load_end = gethrtime();
 	msp->ms_load_time = load_end;
 	zfs_dbgmsg("metaslab_load: txg %llu, spa %s, vdev_id %llu, "
 	    "ms_id %llu, smp_length %llu, "
 	    "unflushed_allocs %llu, unflushed_frees %llu, "
 	    "freed %llu, defer %llu + %llu, unloaded time %llu ms, "
 	    "loading_time %lld ms, ms_max_size %llu, "
 	    "max size error %lld, "
 	    "old_weight %llx, new_weight %llx",
 	    (u_longlong_t)spa_syncing_txg(spa), spa_name(spa),
 	    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
 	    (u_longlong_t)msp->ms_id,
 	    (u_longlong_t)space_map_length(msp->ms_sm),
 	    (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs),
 	    (u_longlong_t)range_tree_space(msp->ms_unflushed_frees),
 	    (u_longlong_t)range_tree_space(msp->ms_freed),
 	    (u_longlong_t)range_tree_space(msp->ms_defer[0]),
 	    (u_longlong_t)range_tree_space(msp->ms_defer[1]),
 	    (longlong_t)((load_start - msp->ms_unload_time) / 1000000),
 	    (longlong_t)((load_end - load_start) / 1000000),
 	    (u_longlong_t)msp->ms_max_size,
 	    (u_longlong_t)msp->ms_max_size - max_size,
 	    (u_longlong_t)weight, (u_longlong_t)msp->ms_weight);
 
 	metaslab_verify_space(msp, spa_syncing_txg(spa));
 	mutex_exit(&msp->ms_sync_lock);
 	return (0);
 }
 
 int
 metaslab_load(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * There may be another thread loading the same metaslab, if that's
 	 * the case just wait until the other thread is done and return.
 	 */
 	metaslab_load_wait(msp);
 	if (msp->ms_loaded)
 		return (0);
 	VERIFY(!msp->ms_loading);
 	ASSERT(!msp->ms_condensing);
 
 	/*
 	 * We set the loading flag BEFORE potentially dropping the lock to
 	 * wait for an ongoing flush (see ms_flushing below). This way other
 	 * threads know that there is already a thread that is loading this
 	 * metaslab.
 	 */
 	msp->ms_loading = B_TRUE;
 
 	/*
 	 * Wait for any in-progress flushing to finish as we drop the ms_lock
 	 * both here (during space_map_load()) and in metaslab_flush() (when
 	 * we flush our changes to the ms_sm).
 	 */
 	if (msp->ms_flushing)
 		metaslab_flush_wait(msp);
 
 	/*
 	 * In the possibility that we were waiting for the metaslab to be
 	 * flushed (where we temporarily dropped the ms_lock), ensure that
 	 * no one else loaded the metaslab somehow.
 	 */
 	ASSERT(!msp->ms_loaded);
 
 	/*
 	 * If we're loading a metaslab in the normal class, consider evicting
 	 * another one to keep our memory usage under the limit defined by the
 	 * zfs_metaslab_mem_limit tunable.
 	 */
 	if (spa_normal_class(msp->ms_group->mg_class->mc_spa) ==
 	    msp->ms_group->mg_class) {
 		metaslab_potentially_evict(msp->ms_group->mg_class);
 	}
 
 	int error = metaslab_load_impl(msp);
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	msp->ms_loading = B_FALSE;
 	cv_broadcast(&msp->ms_load_cv);
 
 	return (error);
 }
 
 void
 metaslab_unload(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * This can happen if a metaslab is selected for eviction (in
 	 * metaslab_potentially_evict) and then unloaded during spa_sync (via
 	 * metaslab_class_evict_old).
 	 */
 	if (!msp->ms_loaded)
 		return;
 
 	range_tree_vacate(msp->ms_allocatable, NULL, NULL);
 	msp->ms_loaded = B_FALSE;
 	msp->ms_unload_time = gethrtime();
 
 	msp->ms_activation_weight = 0;
 	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
 
 	if (msp->ms_group != NULL) {
 		metaslab_class_t *mc = msp->ms_group->mg_class;
 		multilist_sublist_t *mls =
 		    multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
 		if (multilist_link_active(&msp->ms_class_txg_node))
 			multilist_sublist_remove(mls, msp);
 		multilist_sublist_unlock(mls);
 
 		spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 		zfs_dbgmsg("metaslab_unload: txg %llu, spa %s, vdev_id %llu, "
 		    "ms_id %llu, weight %llx, "
 		    "selected txg %llu (%llu ms ago), alloc_txg %llu, "
 		    "loaded %llu ms ago, max_size %llu",
 		    (u_longlong_t)spa_syncing_txg(spa), spa_name(spa),
 		    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
 		    (u_longlong_t)msp->ms_id,
 		    (u_longlong_t)msp->ms_weight,
 		    (u_longlong_t)msp->ms_selected_txg,
 		    (u_longlong_t)(msp->ms_unload_time -
 		    msp->ms_selected_time) / 1000 / 1000,
 		    (u_longlong_t)msp->ms_alloc_txg,
 		    (u_longlong_t)(msp->ms_unload_time -
 		    msp->ms_load_time) / 1000 / 1000,
 		    (u_longlong_t)msp->ms_max_size);
 	}
 
 	/*
 	 * We explicitly recalculate the metaslab's weight based on its space
 	 * map (as it is now not loaded). We want unload metaslabs to always
 	 * have their weights calculated from the space map histograms, while
 	 * loaded ones have it calculated from their in-core range tree
 	 * [see metaslab_load()]. This way, the weight reflects the information
 	 * available in-core, whether it is loaded or not.
 	 *
 	 * If ms_group == NULL means that we came here from metaslab_fini(),
 	 * at which point it doesn't make sense for us to do the recalculation
 	 * and the sorting.
 	 */
 	if (msp->ms_group != NULL)
 		metaslab_recalculate_weight_and_sort(msp);
 }
 
 /*
  * We want to optimize the memory use of the per-metaslab range
  * trees. To do this, we store the segments in the range trees in
  * units of sectors, zero-indexing from the start of the metaslab. If
  * the vdev_ms_shift - the vdev_ashift is less than 32, we can store
  * the ranges using two uint32_ts, rather than two uint64_ts.
  */
 range_seg_type_t
 metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp,
     uint64_t *start, uint64_t *shift)
 {
 	if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 &&
 	    !zfs_metaslab_force_large_segs) {
 		*shift = vdev->vdev_ashift;
 		*start = msp->ms_start;
 		return (RANGE_SEG32);
 	} else {
 		*shift = 0;
 		*start = 0;
 		return (RANGE_SEG64);
 	}
 }
 
 void
 metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	metaslab_class_t *mc = msp->ms_group->mg_class;
 	multilist_sublist_t *mls =
 	    multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
 	if (multilist_link_active(&msp->ms_class_txg_node))
 		multilist_sublist_remove(mls, msp);
 	msp->ms_selected_txg = txg;
 	msp->ms_selected_time = gethrtime();
 	multilist_sublist_insert_tail(mls, msp);
 	multilist_sublist_unlock(mls);
 }
 
 void
 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
     int64_t defer_delta, int64_t space_delta)
 {
 	vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
 
 	ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
 	ASSERT(vd->vdev_ms_count != 0);
 
 	metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
 	    vdev_deflated_space(vd, space_delta));
 }
 
 int
 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
     uint64_t txg, metaslab_t **msp)
 {
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	metaslab_t *ms;
 	int error;
 
 	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
 	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
 	multilist_link_init(&ms->ms_class_txg_node);
 
 	ms->ms_id = id;
 	ms->ms_start = id << vd->vdev_ms_shift;
 	ms->ms_size = 1ULL << vd->vdev_ms_shift;
 	ms->ms_allocator = -1;
 	ms->ms_new = B_TRUE;
 
 	vdev_ops_t *ops = vd->vdev_ops;
 	if (ops->vdev_op_metaslab_init != NULL)
 		ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size);
 
 	/*
 	 * We only open space map objects that already exist. All others
 	 * will be opened when we finally allocate an object for it. For
 	 * readonly pools there is no need to open the space map object.
 	 *
 	 * Note:
 	 * When called from vdev_expand(), we can't call into the DMU as
 	 * we are holding the spa_config_lock as a writer and we would
 	 * deadlock [see relevant comment in vdev_metaslab_init()]. in
 	 * that case, the object parameter is zero though, so we won't
 	 * call into the DMU.
 	 */
 	if (object != 0 && !(spa->spa_mode == SPA_MODE_READ &&
 	    !spa->spa_read_spacemaps)) {
 		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
 		    ms->ms_size, vd->vdev_ashift);
 
 		if (error != 0) {
 			kmem_free(ms, sizeof (metaslab_t));
 			return (error);
 		}
 
 		ASSERT(ms->ms_sm != NULL);
 		ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
 	}
 
 	uint64_t shift, start;
 	range_seg_type_t type =
 	    metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
 
 	ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift);
 	for (int t = 0; t < TXG_SIZE; t++) {
 		ms->ms_allocating[t] = range_tree_create(NULL, type,
 		    NULL, start, shift);
 	}
 	ms->ms_freeing = range_tree_create(NULL, type, NULL, start, shift);
 	ms->ms_freed = range_tree_create(NULL, type, NULL, start, shift);
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		ms->ms_defer[t] = range_tree_create(NULL, type, NULL,
 		    start, shift);
 	}
 	ms->ms_checkpointing =
 	    range_tree_create(NULL, type, NULL, start, shift);
 	ms->ms_unflushed_allocs =
 	    range_tree_create(NULL, type, NULL, start, shift);
 
 	metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
 	mrap->mra_bt = &ms->ms_unflushed_frees_by_size;
 	mrap->mra_floor_shift = metaslab_by_size_min_shift;
 	ms->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops,
 	    type, mrap, start, shift);
 
 	ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift);
 
 	metaslab_group_add(mg, ms);
 	metaslab_set_fragmentation(ms, B_FALSE);
 
 	/*
 	 * If we're opening an existing pool (txg == 0) or creating
 	 * a new one (txg == TXG_INITIAL), all space is available now.
 	 * If we're adding space to an existing pool, the new space
 	 * does not become available until after this txg has synced.
 	 * The metaslab's weight will also be initialized when we sync
 	 * out this txg. This ensures that we don't attempt to allocate
 	 * from it before we have initialized it completely.
 	 */
 	if (txg <= TXG_INITIAL) {
 		metaslab_sync_done(ms, 0);
 		metaslab_space_update(vd, mg->mg_class,
 		    metaslab_allocated_space(ms), 0, 0);
 	}
 
 	if (txg != 0) {
 		vdev_dirty(vd, 0, NULL, txg);
 		vdev_dirty(vd, VDD_METASLAB, ms, txg);
 	}
 
 	*msp = ms;
 
 	return (0);
 }
 
 static void
 metaslab_fini_flush_data(metaslab_t *msp)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	if (metaslab_unflushed_txg(msp) == 0) {
 		ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL),
 		    ==, NULL);
 		return;
 	}
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 	mutex_enter(&spa->spa_flushed_ms_lock);
 	avl_remove(&spa->spa_metaslabs_by_flushed, msp);
 	mutex_exit(&spa->spa_flushed_ms_lock);
 
 	spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp));
 	spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp),
 	    metaslab_unflushed_dirty(msp));
 }
 
 uint64_t
 metaslab_unflushed_changes_memused(metaslab_t *ms)
 {
 	return ((range_tree_numsegs(ms->ms_unflushed_allocs) +
 	    range_tree_numsegs(ms->ms_unflushed_frees)) *
 	    ms->ms_unflushed_allocs->rt_root.bt_elem_size);
 }
 
 void
 metaslab_fini(metaslab_t *msp)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 
 	metaslab_fini_flush_data(msp);
 
 	metaslab_group_remove(mg, msp);
 
 	mutex_enter(&msp->ms_lock);
 	VERIFY(msp->ms_group == NULL);
 
 	/*
 	 * If this metaslab hasn't been through metaslab_sync_done() yet its
 	 * space hasn't been accounted for in its vdev and doesn't need to be
 	 * subtracted.
 	 */
 	if (!msp->ms_new) {
 		metaslab_space_update(vd, mg->mg_class,
 		    -metaslab_allocated_space(msp), 0, -msp->ms_size);
 
 	}
 	space_map_close(msp->ms_sm);
 	msp->ms_sm = NULL;
 
 	metaslab_unload(msp);
 
 	range_tree_destroy(msp->ms_allocatable);
 	range_tree_destroy(msp->ms_freeing);
 	range_tree_destroy(msp->ms_freed);
 
 	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
 	    metaslab_unflushed_changes_memused(msp));
 	spa->spa_unflushed_stats.sus_memused -=
 	    metaslab_unflushed_changes_memused(msp);
 	range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
 	range_tree_destroy(msp->ms_unflushed_allocs);
 	range_tree_destroy(msp->ms_checkpointing);
 	range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
 	range_tree_destroy(msp->ms_unflushed_frees);
 
 	for (int t = 0; t < TXG_SIZE; t++) {
 		range_tree_destroy(msp->ms_allocating[t]);
 	}
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		range_tree_destroy(msp->ms_defer[t]);
 	}
 	ASSERT0(msp->ms_deferspace);
 
 	for (int t = 0; t < TXG_SIZE; t++)
 		ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
 
 	range_tree_vacate(msp->ms_trim, NULL, NULL);
 	range_tree_destroy(msp->ms_trim);
 
 	mutex_exit(&msp->ms_lock);
 	cv_destroy(&msp->ms_load_cv);
 	cv_destroy(&msp->ms_flush_cv);
 	mutex_destroy(&msp->ms_lock);
 	mutex_destroy(&msp->ms_sync_lock);
 	ASSERT3U(msp->ms_allocator, ==, -1);
 
 	kmem_free(msp, sizeof (metaslab_t));
 }
 
 #define	FRAGMENTATION_TABLE_SIZE	17
 
 /*
  * This table defines a segment size based fragmentation metric that will
  * allow each metaslab to derive its own fragmentation value. This is done
  * by calculating the space in each bucket of the spacemap histogram and
  * multiplying that by the fragmentation metric in this table. Doing
  * this for all buckets and dividing it by the total amount of free
  * space in this metaslab (i.e. the total free space in all buckets) gives
  * us the fragmentation metric. This means that a high fragmentation metric
  * equates to most of the free space being comprised of small segments.
  * Conversely, if the metric is low, then most of the free space is in
  * large segments. A 10% change in fragmentation equates to approximately
  * double the number of segments.
  *
  * This table defines 0% fragmented space using 16MB segments. Testing has
  * shown that segments that are greater than or equal to 16MB do not suffer
  * from drastic performance problems. Using this value, we derive the rest
  * of the table. Since the fragmentation value is never stored on disk, it
  * is possible to change these calculations in the future.
  */
 static const int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
 	100,	/* 512B	*/
 	100,	/* 1K	*/
 	98,	/* 2K	*/
 	95,	/* 4K	*/
 	90,	/* 8K	*/
 	80,	/* 16K	*/
 	70,	/* 32K	*/
 	60,	/* 64K	*/
 	50,	/* 128K	*/
 	40,	/* 256K	*/
 	30,	/* 512K	*/
 	20,	/* 1M	*/
 	15,	/* 2M	*/
 	10,	/* 4M	*/
 	5,	/* 8M	*/
 	0	/* 16M	*/
 };
 
 /*
  * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
  * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
  * been upgraded and does not support this metric. Otherwise, the return
  * value should be in the range [0, 100].
  */
 static void
 metaslab_set_fragmentation(metaslab_t *msp, boolean_t nodirty)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	uint64_t fragmentation = 0;
 	uint64_t total = 0;
 	boolean_t feature_enabled = spa_feature_is_enabled(spa,
 	    SPA_FEATURE_SPACEMAP_HISTOGRAM);
 
 	if (!feature_enabled) {
 		msp->ms_fragmentation = ZFS_FRAG_INVALID;
 		return;
 	}
 
 	/*
 	 * A null space map means that the entire metaslab is free
 	 * and thus is not fragmented.
 	 */
 	if (msp->ms_sm == NULL) {
 		msp->ms_fragmentation = 0;
 		return;
 	}
 
 	/*
 	 * If this metaslab's space map has not been upgraded, flag it
 	 * so that we upgrade next time we encounter it.
 	 */
 	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
 		uint64_t txg = spa_syncing_txg(spa);
 		vdev_t *vd = msp->ms_group->mg_vd;
 
 		/*
 		 * If we've reached the final dirty txg, then we must
 		 * be shutting down the pool. We don't want to dirty
 		 * any data past this point so skip setting the condense
 		 * flag. We can retry this action the next time the pool
 		 * is imported. We also skip marking this metaslab for
 		 * condensing if the caller has explicitly set nodirty.
 		 */
 		if (!nodirty &&
 		    spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
 			msp->ms_condense_wanted = B_TRUE;
 			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
 			zfs_dbgmsg("txg %llu, requesting force condense: "
 			    "ms_id %llu, vdev_id %llu", (u_longlong_t)txg,
 			    (u_longlong_t)msp->ms_id,
 			    (u_longlong_t)vd->vdev_id);
 		}
 		msp->ms_fragmentation = ZFS_FRAG_INVALID;
 		return;
 	}
 
 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 		uint64_t space = 0;
 		uint8_t shift = msp->ms_sm->sm_shift;
 
 		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
 		    FRAGMENTATION_TABLE_SIZE - 1);
 
 		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
 			continue;
 
 		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
 		total += space;
 
 		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
 		fragmentation += space * zfs_frag_table[idx];
 	}
 
 	if (total > 0)
 		fragmentation /= total;
 	ASSERT3U(fragmentation, <=, 100);
 
 	msp->ms_fragmentation = fragmentation;
 }
 
 /*
  * Compute a weight -- a selection preference value -- for the given metaslab.
  * This is based on the amount of free space, the level of fragmentation,
  * the LBA range, and whether the metaslab is loaded.
  */
 static uint64_t
 metaslab_space_weight(metaslab_t *msp)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	uint64_t weight, space;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * The baseline weight is the metaslab's free space.
 	 */
 	space = msp->ms_size - metaslab_allocated_space(msp);
 
 	if (metaslab_fragmentation_factor_enabled &&
 	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
 		/*
 		 * Use the fragmentation information to inversely scale
 		 * down the baseline weight. We need to ensure that we
 		 * don't exclude this metaslab completely when it's 100%
 		 * fragmented. To avoid this we reduce the fragmented value
 		 * by 1.
 		 */
 		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
 
 		/*
 		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
 		 * this metaslab again. The fragmentation metric may have
 		 * decreased the space to something smaller than
 		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
 		 * so that we can consume any remaining space.
 		 */
 		if (space > 0 && space < SPA_MINBLOCKSIZE)
 			space = SPA_MINBLOCKSIZE;
 	}
 	weight = space;
 
 	/*
 	 * Modern disks have uniform bit density and constant angular velocity.
 	 * Therefore, the outer recording zones are faster (higher bandwidth)
 	 * than the inner zones by the ratio of outer to inner track diameter,
 	 * which is typically around 2:1.  We account for this by assigning
 	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
 	 * In effect, this means that we'll select the metaslab with the most
 	 * free bandwidth rather than simply the one with the most free space.
 	 */
 	if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
 		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
 		ASSERT(weight >= space && weight <= 2 * space);
 	}
 
 	/*
 	 * If this metaslab is one we're actively using, adjust its
 	 * weight to make it preferable to any inactive metaslab so
 	 * we'll polish it off. If the fragmentation on this metaslab
 	 * has exceed our threshold, then don't mark it active.
 	 */
 	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
 	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
 		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
 	}
 
 	WEIGHT_SET_SPACEBASED(weight);
 	return (weight);
 }
 
 /*
  * Return the weight of the specified metaslab, according to the segment-based
  * weighting algorithm. The metaslab must be loaded. This function can
  * be called within a sync pass since it relies only on the metaslab's
  * range tree which is always accurate when the metaslab is loaded.
  */
 static uint64_t
 metaslab_weight_from_range_tree(metaslab_t *msp)
 {
 	uint64_t weight = 0;
 	uint32_t segments = 0;
 
 	ASSERT(msp->ms_loaded);
 
 	for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
 	    i--) {
 		uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
 		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
 
 		segments <<= 1;
 		segments += msp->ms_allocatable->rt_histogram[i];
 
 		/*
 		 * The range tree provides more precision than the space map
 		 * and must be downgraded so that all values fit within the
 		 * space map's histogram. This allows us to compare loaded
 		 * vs. unloaded metaslabs to determine which metaslab is
 		 * considered "best".
 		 */
 		if (i > max_idx)
 			continue;
 
 		if (segments != 0) {
 			WEIGHT_SET_COUNT(weight, segments);
 			WEIGHT_SET_INDEX(weight, i);
 			WEIGHT_SET_ACTIVE(weight, 0);
 			break;
 		}
 	}
 	return (weight);
 }
 
 /*
  * Calculate the weight based on the on-disk histogram. Should be applied
  * only to unloaded metaslabs  (i.e no incoming allocations) in-order to
  * give results consistent with the on-disk state
  */
 static uint64_t
 metaslab_weight_from_spacemap(metaslab_t *msp)
 {
 	space_map_t *sm = msp->ms_sm;
 	ASSERT(!msp->ms_loaded);
 	ASSERT(sm != NULL);
 	ASSERT3U(space_map_object(sm), !=, 0);
 	ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
 
 	/*
 	 * Create a joint histogram from all the segments that have made
 	 * it to the metaslab's space map histogram, that are not yet
 	 * available for allocation because they are still in the freeing
 	 * pipeline (e.g. freeing, freed, and defer trees). Then subtract
 	 * these segments from the space map's histogram to get a more
 	 * accurate weight.
 	 */
 	uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
 		deferspace_histogram[i] += msp->ms_synchist[i];
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 			deferspace_histogram[i] += msp->ms_deferhist[t][i];
 		}
 	}
 
 	uint64_t weight = 0;
 	for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
 		ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
 		    deferspace_histogram[i]);
 		uint64_t count =
 		    sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
 		if (count != 0) {
 			WEIGHT_SET_COUNT(weight, count);
 			WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
 			WEIGHT_SET_ACTIVE(weight, 0);
 			break;
 		}
 	}
 	return (weight);
 }
 
 /*
  * Compute a segment-based weight for the specified metaslab. The weight
  * is determined by highest bucket in the histogram. The information
  * for the highest bucket is encoded into the weight value.
  */
 static uint64_t
 metaslab_segment_weight(metaslab_t *msp)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	uint64_t weight = 0;
 	uint8_t shift = mg->mg_vd->vdev_ashift;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * The metaslab is completely free.
 	 */
 	if (metaslab_allocated_space(msp) == 0) {
 		int idx = highbit64(msp->ms_size) - 1;
 		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
 
 		if (idx < max_idx) {
 			WEIGHT_SET_COUNT(weight, 1ULL);
 			WEIGHT_SET_INDEX(weight, idx);
 		} else {
 			WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
 			WEIGHT_SET_INDEX(weight, max_idx);
 		}
 		WEIGHT_SET_ACTIVE(weight, 0);
 		ASSERT(!WEIGHT_IS_SPACEBASED(weight));
 		return (weight);
 	}
 
 	ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
 
 	/*
 	 * If the metaslab is fully allocated then just make the weight 0.
 	 */
 	if (metaslab_allocated_space(msp) == msp->ms_size)
 		return (0);
 	/*
 	 * If the metaslab is already loaded, then use the range tree to
 	 * determine the weight. Otherwise, we rely on the space map information
 	 * to generate the weight.
 	 */
 	if (msp->ms_loaded) {
 		weight = metaslab_weight_from_range_tree(msp);
 	} else {
 		weight = metaslab_weight_from_spacemap(msp);
 	}
 
 	/*
 	 * If the metaslab was active the last time we calculated its weight
 	 * then keep it active. We want to consume the entire region that
 	 * is associated with this weight.
 	 */
 	if (msp->ms_activation_weight != 0 && weight != 0)
 		WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
 	return (weight);
 }
 
 /*
  * Determine if we should attempt to allocate from this metaslab. If the
  * metaslab is loaded, then we can determine if the desired allocation
  * can be satisfied by looking at the size of the maximum free segment
  * on that metaslab. Otherwise, we make our decision based on the metaslab's
  * weight. For segment-based weighting we can determine the maximum
  * allocation based on the index encoded in its value. For space-based
  * weights we rely on the entire weight (excluding the weight-type bit).
  */
 static boolean_t
 metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
 {
 	/*
 	 * This case will usually but not always get caught by the checks below;
 	 * metaslabs can be loaded by various means, including the trim and
 	 * initialize code. Once that happens, without this check they are
 	 * allocatable even before they finish their first txg sync.
 	 */
 	if (unlikely(msp->ms_new))
 		return (B_FALSE);
 
 	/*
 	 * If the metaslab is loaded, ms_max_size is definitive and we can use
 	 * the fast check. If it's not, the ms_max_size is a lower bound (once
 	 * set), and we should use the fast check as long as we're not in
 	 * try_hard and it's been less than zfs_metaslab_max_size_cache_sec
 	 * seconds since the metaslab was unloaded.
 	 */
 	if (msp->ms_loaded ||
 	    (msp->ms_max_size != 0 && !try_hard && gethrtime() <
 	    msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec)))
 		return (msp->ms_max_size >= asize);
 
 	boolean_t should_allocate;
 	if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
 		/*
 		 * The metaslab segment weight indicates segments in the
 		 * range [2^i, 2^(i+1)), where i is the index in the weight.
 		 * Since the asize might be in the middle of the range, we
 		 * should attempt the allocation if asize < 2^(i+1).
 		 */
 		should_allocate = (asize <
 		    1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
 	} else {
 		should_allocate = (asize <=
 		    (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
 	}
 
 	return (should_allocate);
 }
 
 static uint64_t
 metaslab_weight(metaslab_t *msp, boolean_t nodirty)
 {
 	vdev_t *vd = msp->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	uint64_t weight;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	metaslab_set_fragmentation(msp, nodirty);
 
 	/*
 	 * Update the maximum size. If the metaslab is loaded, this will
 	 * ensure that we get an accurate maximum size if newly freed space
 	 * has been added back into the free tree. If the metaslab is
 	 * unloaded, we check if there's a larger free segment in the
 	 * unflushed frees. This is a lower bound on the largest allocatable
 	 * segment size. Coalescing of adjacent entries may reveal larger
 	 * allocatable segments, but we aren't aware of those until loading
 	 * the space map into a range tree.
 	 */
 	if (msp->ms_loaded) {
 		msp->ms_max_size = metaslab_largest_allocatable(msp);
 	} else {
 		msp->ms_max_size = MAX(msp->ms_max_size,
 		    metaslab_largest_unflushed_free(msp));
 	}
 
 	/*
 	 * Segment-based weighting requires space map histogram support.
 	 */
 	if (zfs_metaslab_segment_weight_enabled &&
 	    spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
 	    (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
 	    sizeof (space_map_phys_t))) {
 		weight = metaslab_segment_weight(msp);
 	} else {
 		weight = metaslab_space_weight(msp);
 	}
 	return (weight);
 }
 
 void
 metaslab_recalculate_weight_and_sort(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/* note: we preserve the mask (e.g. indication of primary, etc..) */
 	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
 	metaslab_group_sort(msp->ms_group, msp,
 	    metaslab_weight(msp, B_FALSE) | was_active);
 }
 
 static int
 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
     int allocator, uint64_t activation_weight)
 {
 	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * If we're activating for the claim code, we don't want to actually
 	 * set the metaslab up for a specific allocator.
 	 */
 	if (activation_weight == METASLAB_WEIGHT_CLAIM) {
 		ASSERT0(msp->ms_activation_weight);
 		msp->ms_activation_weight = msp->ms_weight;
 		metaslab_group_sort(mg, msp, msp->ms_weight |
 		    activation_weight);
 		return (0);
 	}
 
 	metaslab_t **mspp = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
 	    &mga->mga_primary : &mga->mga_secondary);
 
 	mutex_enter(&mg->mg_lock);
 	if (*mspp != NULL) {
 		mutex_exit(&mg->mg_lock);
 		return (EEXIST);
 	}
 
 	*mspp = msp;
 	ASSERT3S(msp->ms_allocator, ==, -1);
 	msp->ms_allocator = allocator;
 	msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
 
 	ASSERT0(msp->ms_activation_weight);
 	msp->ms_activation_weight = msp->ms_weight;
 	metaslab_group_sort_impl(mg, msp,
 	    msp->ms_weight | activation_weight);
 	mutex_exit(&mg->mg_lock);
 
 	return (0);
 }
 
 static int
 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * The current metaslab is already activated for us so there
 	 * is nothing to do. Already activated though, doesn't mean
 	 * that this metaslab is activated for our allocator nor our
 	 * requested activation weight. The metaslab could have started
 	 * as an active one for our allocator but changed allocators
 	 * while we were waiting to grab its ms_lock or we stole it
 	 * [see find_valid_metaslab()]. This means that there is a
 	 * possibility of passivating a metaslab of another allocator
 	 * or from a different activation mask, from this thread.
 	 */
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
 		ASSERT(msp->ms_loaded);
 		return (0);
 	}
 
 	int error = metaslab_load(msp);
 	if (error != 0) {
 		metaslab_group_sort(msp->ms_group, msp, 0);
 		return (error);
 	}
 
 	/*
 	 * When entering metaslab_load() we may have dropped the
 	 * ms_lock because we were loading this metaslab, or we
 	 * were waiting for another thread to load it for us. In
 	 * that scenario, we recheck the weight of the metaslab
 	 * to see if it was activated by another thread.
 	 *
 	 * If the metaslab was activated for another allocator or
 	 * it was activated with a different activation weight (e.g.
 	 * we wanted to make it a primary but it was activated as
 	 * secondary) we return error (EBUSY).
 	 *
 	 * If the metaslab was activated for the same allocator
 	 * and requested activation mask, skip activating it.
 	 */
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
 		if (msp->ms_allocator != allocator)
 			return (EBUSY);
 
 		if ((msp->ms_weight & activation_weight) == 0)
 			return (SET_ERROR(EBUSY));
 
 		EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY),
 		    msp->ms_primary);
 		return (0);
 	}
 
 	/*
 	 * If the metaslab has literally 0 space, it will have weight 0. In
 	 * that case, don't bother activating it. This can happen if the
 	 * metaslab had space during find_valid_metaslab, but another thread
 	 * loaded it and used all that space while we were waiting to grab the
 	 * lock.
 	 */
 	if (msp->ms_weight == 0) {
 		ASSERT0(range_tree_space(msp->ms_allocatable));
 		return (SET_ERROR(ENOSPC));
 	}
 
 	if ((error = metaslab_activate_allocator(msp->ms_group, msp,
 	    allocator, activation_weight)) != 0) {
 		return (error);
 	}
 
 	ASSERT(msp->ms_loaded);
 	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 
 	return (0);
 }
 
 static void
 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
     uint64_t weight)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loaded);
 
 	if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
 		metaslab_group_sort(mg, msp, weight);
 		return;
 	}
 
 	mutex_enter(&mg->mg_lock);
 	ASSERT3P(msp->ms_group, ==, mg);
 	ASSERT3S(0, <=, msp->ms_allocator);
 	ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
 
 	metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator];
 	if (msp->ms_primary) {
 		ASSERT3P(mga->mga_primary, ==, msp);
 		ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
 		mga->mga_primary = NULL;
 	} else {
 		ASSERT3P(mga->mga_secondary, ==, msp);
 		ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
 		mga->mga_secondary = NULL;
 	}
 	msp->ms_allocator = -1;
 	metaslab_group_sort_impl(mg, msp, weight);
 	mutex_exit(&mg->mg_lock);
 }
 
 static void
 metaslab_passivate(metaslab_t *msp, uint64_t weight)
 {
 	uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_TYPE;
 
 	/*
 	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
 	 * this metaslab again.  In that case, it had better be empty,
 	 * or we would be leaving space on the table.
 	 */
 	ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
 	    size >= SPA_MINBLOCKSIZE ||
 	    range_tree_space(msp->ms_allocatable) == 0);
 	ASSERT0(weight & METASLAB_ACTIVE_MASK);
 
 	ASSERT(msp->ms_activation_weight != 0);
 	msp->ms_activation_weight = 0;
 	metaslab_passivate_allocator(msp->ms_group, msp, weight);
 	ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK);
 }
 
 /*
  * Segment-based metaslabs are activated once and remain active until
  * we either fail an allocation attempt (similar to space-based metaslabs)
  * or have exhausted the free space in zfs_metaslab_switch_threshold
  * buckets since the metaslab was activated. This function checks to see
  * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
  * metaslab and passivates it proactively. This will allow us to select a
  * metaslab with a larger contiguous region, if any, remaining within this
  * metaslab group. If we're in sync pass > 1, then we continue using this
  * metaslab so that we don't dirty more block and cause more sync passes.
  */
 static void
 metaslab_segment_may_passivate(metaslab_t *msp)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
 		return;
 
 	/*
 	 * Since we are in the middle of a sync pass, the most accurate
 	 * information that is accessible to us is the in-core range tree
 	 * histogram; calculate the new weight based on that information.
 	 */
 	uint64_t weight = metaslab_weight_from_range_tree(msp);
 	int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
 	int current_idx = WEIGHT_GET_INDEX(weight);
 
 	if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
 		metaslab_passivate(msp, weight);
 }
 
 static void
 metaslab_preload(void *arg)
 {
 	metaslab_t *msp = arg;
 	metaslab_class_t *mc = msp->ms_group->mg_class;
 	spa_t *spa = mc->mc_spa;
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
 
 	mutex_enter(&msp->ms_lock);
 	(void) metaslab_load(msp);
 	metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
 	mutex_exit(&msp->ms_lock);
 	spl_fstrans_unmark(cookie);
 }
 
 static void
 metaslab_group_preload(metaslab_group_t *mg)
 {
 	spa_t *spa = mg->mg_vd->vdev_spa;
 	metaslab_t *msp;
 	avl_tree_t *t = &mg->mg_metaslab_tree;
 	int m = 0;
 
 	if (spa_shutting_down(spa) || !metaslab_preload_enabled)
 		return;
 
 	mutex_enter(&mg->mg_lock);
 
 	/*
 	 * Load the next potential metaslabs
 	 */
 	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
 		ASSERT3P(msp->ms_group, ==, mg);
 
 		/*
 		 * We preload only the maximum number of metaslabs specified
 		 * by metaslab_preload_limit. If a metaslab is being forced
 		 * to condense then we preload it too. This will ensure
 		 * that force condensing happens in the next txg.
 		 */
 		if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
 			continue;
 		}
 
 		VERIFY(taskq_dispatch(spa->spa_metaslab_taskq, metaslab_preload,
 		    msp, TQ_SLEEP | (m <= mg->mg_allocators ? TQ_FRONT : 0))
 		    != TASKQID_INVALID);
 	}
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Determine if the space map's on-disk footprint is past our tolerance for
  * inefficiency. We would like to use the following criteria to make our
  * decision:
  *
  * 1. Do not condense if the size of the space map object would dramatically
  *    increase as a result of writing out the free space range tree.
  *
  * 2. Condense if the on on-disk space map representation is at least
  *    zfs_condense_pct/100 times the size of the optimal representation
  *    (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB).
  *
  * 3. Do not condense if the on-disk size of the space map does not actually
  *    decrease.
  *
  * Unfortunately, we cannot compute the on-disk size of the space map in this
  * context because we cannot accurately compute the effects of compression, etc.
  * Instead, we apply the heuristic described in the block comment for
  * zfs_metaslab_condense_block_threshold - we only condense if the space used
  * is greater than a threshold number of blocks.
  */
 static boolean_t
 metaslab_should_condense(metaslab_t *msp)
 {
 	space_map_t *sm = msp->ms_sm;
 	vdev_t *vd = msp->ms_group->mg_vd;
 	uint64_t vdev_blocksize = 1ULL << vd->vdev_ashift;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loaded);
 	ASSERT(sm != NULL);
 	ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1);
 
 	/*
 	 * We always condense metaslabs that are empty and metaslabs for
 	 * which a condense request has been made.
 	 */
 	if (range_tree_numsegs(msp->ms_allocatable) == 0 ||
 	    msp->ms_condense_wanted)
 		return (B_TRUE);
 
 	uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize);
 	uint64_t object_size = space_map_length(sm);
 	uint64_t optimal_size = space_map_estimate_optimal_size(sm,
 	    msp->ms_allocatable, SM_NO_VDEVID);
 
 	return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
 	    object_size > zfs_metaslab_condense_block_threshold * record_size);
 }
 
 /*
  * Condense the on-disk space map representation to its minimized form.
  * The minimized form consists of a small number of allocations followed
  * by the entries of the free range tree (ms_allocatable). The condensed
  * spacemap contains all the entries of previous TXGs (including those in
  * the pool-wide log spacemaps; thus this is effectively a superset of
  * metaslab_flush()), but this TXG's entries still need to be written.
  */
 static void
 metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
 {
 	range_tree_t *condense_tree;
 	space_map_t *sm = msp->ms_sm;
 	uint64_t txg = dmu_tx_get_txg(tx);
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loaded);
 	ASSERT(msp->ms_sm != NULL);
 
 	/*
 	 * In order to condense the space map, we need to change it so it
 	 * only describes which segments are currently allocated and free.
 	 *
 	 * All the current free space resides in the ms_allocatable, all
 	 * the ms_defer trees, and all the ms_allocating trees. We ignore
 	 * ms_freed because it is empty because we're in sync pass 1. We
 	 * ignore ms_freeing because these changes are not yet reflected
 	 * in the spacemap (they will be written later this txg).
 	 *
 	 * So to truncate the space map to represent all the entries of
 	 * previous TXGs we do the following:
 	 *
 	 * 1] We create a range tree (condense tree) that is 100% empty.
 	 * 2] We add to it all segments found in the ms_defer trees
 	 *    as those segments are marked as free in the original space
 	 *    map. We do the same with the ms_allocating trees for the same
 	 *    reason. Adding these segments should be a relatively
 	 *    inexpensive operation since we expect these trees to have a
 	 *    small number of nodes.
 	 * 3] We vacate any unflushed allocs, since they are not frees we
 	 *    need to add to the condense tree. Then we vacate any
 	 *    unflushed frees as they should already be part of ms_allocatable.
 	 * 4] At this point, we would ideally like to add all segments
 	 *    in the ms_allocatable tree from the condense tree. This way
 	 *    we would write all the entries of the condense tree as the
 	 *    condensed space map, which would only contain freed
 	 *    segments with everything else assumed to be allocated.
 	 *
 	 *    Doing so can be prohibitively expensive as ms_allocatable can
 	 *    be large, and therefore computationally expensive to add to
 	 *    the condense_tree. Instead we first sync out an entry marking
 	 *    everything as allocated, then the condense_tree and then the
 	 *    ms_allocatable, in the condensed space map. While this is not
 	 *    optimal, it is typically close to optimal and more importantly
 	 *    much cheaper to compute.
 	 *
 	 * 5] Finally, as both of the unflushed trees were written to our
 	 *    new and condensed metaslab space map, we basically flushed
 	 *    all the unflushed changes to disk, thus we call
 	 *    metaslab_flush_update().
 	 */
 	ASSERT3U(spa_sync_pass(spa), ==, 1);
 	ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */
 
 	zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, "
 	    "spa %s, smp size %llu, segments %llu, forcing condense=%s",
 	    (u_longlong_t)txg, (u_longlong_t)msp->ms_id, msp,
 	    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
 	    spa->spa_name, (u_longlong_t)space_map_length(msp->ms_sm),
 	    (u_longlong_t)range_tree_numsegs(msp->ms_allocatable),
 	    msp->ms_condense_wanted ? "TRUE" : "FALSE");
 
 	msp->ms_condense_wanted = B_FALSE;
 
 	range_seg_type_t type;
 	uint64_t shift, start;
 	type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
 	    &start, &shift);
 
 	condense_tree = range_tree_create(NULL, type, NULL, start, shift);
 
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		range_tree_walk(msp->ms_defer[t],
 		    range_tree_add, condense_tree);
 	}
 
 	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
 		range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
 		    range_tree_add, condense_tree);
 	}
 
 	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
 	    metaslab_unflushed_changes_memused(msp));
 	spa->spa_unflushed_stats.sus_memused -=
 	    metaslab_unflushed_changes_memused(msp);
 	range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
 	range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
 
 	/*
 	 * We're about to drop the metaslab's lock thus allowing other
 	 * consumers to change it's content. Set the metaslab's ms_condensing
 	 * flag to ensure that allocations on this metaslab do not occur
 	 * while we're in the middle of committing it to disk. This is only
 	 * critical for ms_allocatable as all other range trees use per TXG
 	 * views of their content.
 	 */
 	msp->ms_condensing = B_TRUE;
 
 	mutex_exit(&msp->ms_lock);
 	uint64_t object = space_map_object(msp->ms_sm);
 	space_map_truncate(sm,
 	    spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
 	    zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx);
 
 	/*
 	 * space_map_truncate() may have reallocated the spacemap object.
 	 * If so, update the vdev_ms_array.
 	 */
 	if (space_map_object(msp->ms_sm) != object) {
 		object = space_map_object(msp->ms_sm);
 		dmu_write(spa->spa_meta_objset,
 		    msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) *
 		    msp->ms_id, sizeof (uint64_t), &object, tx);
 	}
 
 	/*
 	 * Note:
 	 * When the log space map feature is enabled, each space map will
 	 * always have ALLOCS followed by FREES for each sync pass. This is
 	 * typically true even when the log space map feature is disabled,
 	 * except from the case where a metaslab goes through metaslab_sync()
 	 * and gets condensed. In that case the metaslab's space map will have
 	 * ALLOCS followed by FREES (due to condensing) followed by ALLOCS
 	 * followed by FREES (due to space_map_write() in metaslab_sync()) for
 	 * sync pass 1.
 	 */
 	range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start,
 	    shift);
 	range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
 	space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
 	space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
 	space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx);
 
 	range_tree_vacate(condense_tree, NULL, NULL);
 	range_tree_destroy(condense_tree);
 	range_tree_vacate(tmp_tree, NULL, NULL);
 	range_tree_destroy(tmp_tree);
 	mutex_enter(&msp->ms_lock);
 
 	msp->ms_condensing = B_FALSE;
 	metaslab_flush_update(msp, tx);
 }
 
 static void
 metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	ASSERT(spa_syncing_log_sm(spa) != NULL);
 	ASSERT(msp->ms_sm != NULL);
 	ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
 	ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
 
 	mutex_enter(&spa->spa_flushed_ms_lock);
 	metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
 	metaslab_set_unflushed_dirty(msp, B_TRUE);
 	avl_add(&spa->spa_metaslabs_by_flushed, msp);
 	mutex_exit(&spa->spa_flushed_ms_lock);
 
 	spa_log_sm_increment_current_mscount(spa);
 	spa_log_summary_add_flushed_metaslab(spa, B_TRUE);
 }
 
 void
 metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	ASSERT(spa_syncing_log_sm(spa) != NULL);
 	ASSERT(msp->ms_sm != NULL);
 	ASSERT(metaslab_unflushed_txg(msp) != 0);
 	ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
 	ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
 	ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
 
 	VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
 
 	/* update metaslab's position in our flushing tree */
 	uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
 	boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp);
 	mutex_enter(&spa->spa_flushed_ms_lock);
 	avl_remove(&spa->spa_metaslabs_by_flushed, msp);
 	metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
 	metaslab_set_unflushed_dirty(msp, dirty);
 	avl_add(&spa->spa_metaslabs_by_flushed, msp);
 	mutex_exit(&spa->spa_flushed_ms_lock);
 
 	/* update metaslab counts of spa_log_sm_t nodes */
 	spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
 	spa_log_sm_increment_current_mscount(spa);
 
 	/* update log space map summary */
 	spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg,
 	    ms_prev_flushed_dirty);
 	spa_log_summary_add_flushed_metaslab(spa, dirty);
 
 	/* cleanup obsolete logs if any */
 	spa_cleanup_old_sm_logs(spa, tx);
 }
 
 /*
  * Called when the metaslab has been flushed (its own spacemap now reflects
  * all the contents of the pool-wide spacemap log). Updates the metaslab's
  * metadata and any pool-wide related log space map data (e.g. summary,
  * obsolete logs, etc..) to reflect that.
  */
 static void
 metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	spa_t *spa = mg->mg_vd->vdev_spa;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	ASSERT3U(spa_sync_pass(spa), ==, 1);
 
 	/*
 	 * Just because a metaslab got flushed, that doesn't mean that
 	 * it will pass through metaslab_sync_done(). Thus, make sure to
 	 * update ms_synced_length here in case it doesn't.
 	 */
 	msp->ms_synced_length = space_map_length(msp->ms_sm);
 
 	/*
 	 * We may end up here from metaslab_condense() without the
 	 * feature being active. In that case this is a no-op.
 	 */
 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) ||
 	    metaslab_unflushed_txg(msp) == 0)
 		return;
 
 	metaslab_unflushed_bump(msp, tx, B_FALSE);
 }
 
 boolean_t
 metaslab_flush(metaslab_t *msp, dmu_tx_t *tx)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT3U(spa_sync_pass(spa), ==, 1);
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 	ASSERT(msp->ms_sm != NULL);
 	ASSERT(metaslab_unflushed_txg(msp) != 0);
 	ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL);
 
 	/*
 	 * There is nothing wrong with flushing the same metaslab twice, as
 	 * this codepath should work on that case. However, the current
 	 * flushing scheme makes sure to avoid this situation as we would be
 	 * making all these calls without having anything meaningful to write
 	 * to disk. We assert this behavior here.
 	 */
 	ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx));
 
 	/*
 	 * We can not flush while loading, because then we would
 	 * not load the ms_unflushed_{allocs,frees}.
 	 */
 	if (msp->ms_loading)
 		return (B_FALSE);
 
 	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
 	metaslab_verify_weight_and_frag(msp);
 
 	/*
 	 * Metaslab condensing is effectively flushing. Therefore if the
 	 * metaslab can be condensed we can just condense it instead of
 	 * flushing it.
 	 *
 	 * Note that metaslab_condense() does call metaslab_flush_update()
 	 * so we can just return immediately after condensing. We also
 	 * don't need to care about setting ms_flushing or broadcasting
 	 * ms_flush_cv, even if we temporarily drop the ms_lock in
 	 * metaslab_condense(), as the metaslab is already loaded.
 	 */
 	if (msp->ms_loaded && metaslab_should_condense(msp)) {
 		metaslab_group_t *mg = msp->ms_group;
 
 		/*
 		 * For all histogram operations below refer to the
 		 * comments of metaslab_sync() where we follow a
 		 * similar procedure.
 		 */
 		metaslab_group_histogram_verify(mg);
 		metaslab_class_histogram_verify(mg->mg_class);
 		metaslab_group_histogram_remove(mg, msp);
 
 		metaslab_condense(msp, tx);
 
 		space_map_histogram_clear(msp->ms_sm);
 		space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
 		ASSERT(range_tree_is_empty(msp->ms_freed));
 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 			space_map_histogram_add(msp->ms_sm,
 			    msp->ms_defer[t], tx);
 		}
 		metaslab_aux_histograms_update(msp);
 
 		metaslab_group_histogram_add(mg, msp);
 		metaslab_group_histogram_verify(mg);
 		metaslab_class_histogram_verify(mg->mg_class);
 
 		metaslab_verify_space(msp, dmu_tx_get_txg(tx));
 
 		/*
 		 * Since we recreated the histogram (and potentially
 		 * the ms_sm too while condensing) ensure that the
 		 * weight is updated too because we are not guaranteed
 		 * that this metaslab is dirty and will go through
 		 * metaslab_sync_done().
 		 */
 		metaslab_recalculate_weight_and_sort(msp);
 		return (B_TRUE);
 	}
 
 	msp->ms_flushing = B_TRUE;
 	uint64_t sm_len_before = space_map_length(msp->ms_sm);
 
 	mutex_exit(&msp->ms_lock);
 	space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC,
 	    SM_NO_VDEVID, tx);
 	space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE,
 	    SM_NO_VDEVID, tx);
 	mutex_enter(&msp->ms_lock);
 
 	uint64_t sm_len_after = space_map_length(msp->ms_sm);
 	if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
 		zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, "
 		    "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, "
 		    "appended %llu bytes", (u_longlong_t)dmu_tx_get_txg(tx),
 		    spa_name(spa),
 		    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
 		    (u_longlong_t)msp->ms_id,
 		    (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs),
 		    (u_longlong_t)range_tree_space(msp->ms_unflushed_frees),
 		    (u_longlong_t)(sm_len_after - sm_len_before));
 	}
 
 	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
 	    metaslab_unflushed_changes_memused(msp));
 	spa->spa_unflushed_stats.sus_memused -=
 	    metaslab_unflushed_changes_memused(msp);
 	range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
 	range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
 
 	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
 	metaslab_verify_weight_and_frag(msp);
 
 	metaslab_flush_update(msp, tx);
 
 	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
 	metaslab_verify_weight_and_frag(msp);
 
 	msp->ms_flushing = B_FALSE;
 	cv_broadcast(&msp->ms_flush_cv);
 	return (B_TRUE);
 }
 
 /*
  * Write a metaslab to disk in the context of the specified transaction group.
  */
 void
 metaslab_sync(metaslab_t *msp, uint64_t txg)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa_meta_objset(spa);
 	range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
 	dmu_tx_t *tx;
 
 	ASSERT(!vd->vdev_ishole);
 
 	/*
 	 * This metaslab has just been added so there's no work to do now.
 	 */
 	if (msp->ms_new) {
 		ASSERT0(range_tree_space(alloctree));
 		ASSERT0(range_tree_space(msp->ms_freeing));
 		ASSERT0(range_tree_space(msp->ms_freed));
 		ASSERT0(range_tree_space(msp->ms_checkpointing));
 		ASSERT0(range_tree_space(msp->ms_trim));
 		return;
 	}
 
 	/*
 	 * Normally, we don't want to process a metaslab if there are no
 	 * allocations or frees to perform. However, if the metaslab is being
 	 * forced to condense, it's loaded and we're not beyond the final
 	 * dirty txg, we need to let it through. Not condensing beyond the
 	 * final dirty txg prevents an issue where metaslabs that need to be
 	 * condensed but were loaded for other reasons could cause a panic
 	 * here. By only checking the txg in that branch of the conditional,
 	 * we preserve the utility of the VERIFY statements in all other
 	 * cases.
 	 */
 	if (range_tree_is_empty(alloctree) &&
 	    range_tree_is_empty(msp->ms_freeing) &&
 	    range_tree_is_empty(msp->ms_checkpointing) &&
 	    !(msp->ms_loaded && msp->ms_condense_wanted &&
 	    txg <= spa_final_dirty_txg(spa)))
 		return;
 
 
 	VERIFY3U(txg, <=, spa_final_dirty_txg(spa));
 
 	/*
 	 * The only state that can actually be changing concurrently
 	 * with metaslab_sync() is the metaslab's ms_allocatable. No
 	 * other thread can be modifying this txg's alloc, freeing,
 	 * freed, or space_map_phys_t.  We drop ms_lock whenever we
 	 * could call into the DMU, because the DMU can call down to
 	 * us (e.g. via zio_free()) at any time.
 	 *
 	 * The spa_vdev_remove_thread() can be reading metaslab state
 	 * concurrently, and it is locked out by the ms_sync_lock.
 	 * Note that the ms_lock is insufficient for this, because it
 	 * is dropped by space_map_write().
 	 */
 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 
 	/*
 	 * Generate a log space map if one doesn't exist already.
 	 */
 	spa_generate_syncing_log_sm(spa, tx);
 
 	if (msp->ms_sm == NULL) {
 		uint64_t new_object = space_map_alloc(mos,
 		    spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
 		    zfs_metaslab_sm_blksz_with_log :
 		    zfs_metaslab_sm_blksz_no_log, tx);
 		VERIFY3U(new_object, !=, 0);
 
 		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
 		    msp->ms_id, sizeof (uint64_t), &new_object, tx);
 
 		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
 		    msp->ms_start, msp->ms_size, vd->vdev_ashift));
 		ASSERT(msp->ms_sm != NULL);
 
 		ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
 		ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
 		ASSERT0(metaslab_allocated_space(msp));
 	}
 
 	if (!range_tree_is_empty(msp->ms_checkpointing) &&
 	    vd->vdev_checkpoint_sm == NULL) {
 		ASSERT(spa_has_checkpoint(spa));
 
 		uint64_t new_object = space_map_alloc(mos,
 		    zfs_vdev_standard_sm_blksz, tx);
 		VERIFY3U(new_object, !=, 0);
 
 		VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
 		    mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
 		ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
 
 		/*
 		 * We save the space map object as an entry in vdev_top_zap
 		 * so it can be retrieved when the pool is reopened after an
 		 * export or through zdb.
 		 */
 		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
 		    sizeof (new_object), 1, &new_object, tx));
 	}
 
 	mutex_enter(&msp->ms_sync_lock);
 	mutex_enter(&msp->ms_lock);
 
 	/*
 	 * Note: metaslab_condense() clears the space map's histogram.
 	 * Therefore we must verify and remove this histogram before
 	 * condensing.
 	 */
 	metaslab_group_histogram_verify(mg);
 	metaslab_class_histogram_verify(mg->mg_class);
 	metaslab_group_histogram_remove(mg, msp);
 
 	if (spa->spa_sync_pass == 1 && msp->ms_loaded &&
 	    metaslab_should_condense(msp))
 		metaslab_condense(msp, tx);
 
 	/*
 	 * We'll be going to disk to sync our space accounting, thus we
 	 * drop the ms_lock during that time so allocations coming from
 	 * open-context (ZIL) for future TXGs do not block.
 	 */
 	mutex_exit(&msp->ms_lock);
 	space_map_t *log_sm = spa_syncing_log_sm(spa);
 	if (log_sm != NULL) {
 		ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
 		if (metaslab_unflushed_txg(msp) == 0)
 			metaslab_unflushed_add(msp, tx);
 		else if (!metaslab_unflushed_dirty(msp))
 			metaslab_unflushed_bump(msp, tx, B_TRUE);
 
 		space_map_write(log_sm, alloctree, SM_ALLOC,
 		    vd->vdev_id, tx);
 		space_map_write(log_sm, msp->ms_freeing, SM_FREE,
 		    vd->vdev_id, tx);
 		mutex_enter(&msp->ms_lock);
 
 		ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
 		    metaslab_unflushed_changes_memused(msp));
 		spa->spa_unflushed_stats.sus_memused -=
 		    metaslab_unflushed_changes_memused(msp);
 		range_tree_remove_xor_add(alloctree,
 		    msp->ms_unflushed_frees, msp->ms_unflushed_allocs);
 		range_tree_remove_xor_add(msp->ms_freeing,
 		    msp->ms_unflushed_allocs, msp->ms_unflushed_frees);
 		spa->spa_unflushed_stats.sus_memused +=
 		    metaslab_unflushed_changes_memused(msp);
 	} else {
 		ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 		space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
 		    SM_NO_VDEVID, tx);
 		space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
 		    SM_NO_VDEVID, tx);
 		mutex_enter(&msp->ms_lock);
 	}
 
 	msp->ms_allocated_space += range_tree_space(alloctree);
 	ASSERT3U(msp->ms_allocated_space, >=,
 	    range_tree_space(msp->ms_freeing));
 	msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
 
 	if (!range_tree_is_empty(msp->ms_checkpointing)) {
 		ASSERT(spa_has_checkpoint(spa));
 		ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
 
 		/*
 		 * Since we are doing writes to disk and the ms_checkpointing
 		 * tree won't be changing during that time, we drop the
 		 * ms_lock while writing to the checkpoint space map, for the
 		 * same reason mentioned above.
 		 */
 		mutex_exit(&msp->ms_lock);
 		space_map_write(vd->vdev_checkpoint_sm,
 		    msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
 		mutex_enter(&msp->ms_lock);
 
 		spa->spa_checkpoint_info.sci_dspace +=
 		    range_tree_space(msp->ms_checkpointing);
 		vd->vdev_stat.vs_checkpoint_space +=
 		    range_tree_space(msp->ms_checkpointing);
 		ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
 		    -space_map_allocated(vd->vdev_checkpoint_sm));
 
 		range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
 	}
 
 	if (msp->ms_loaded) {
 		/*
 		 * When the space map is loaded, we have an accurate
 		 * histogram in the range tree. This gives us an opportunity
 		 * to bring the space map's histogram up-to-date so we clear
 		 * it first before updating it.
 		 */
 		space_map_histogram_clear(msp->ms_sm);
 		space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
 
 		/*
 		 * Since we've cleared the histogram we need to add back
 		 * any free space that has already been processed, plus
 		 * any deferred space. This allows the on-disk histogram
 		 * to accurately reflect all free space even if some space
 		 * is not yet available for allocation (i.e. deferred).
 		 */
 		space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
 
 		/*
 		 * Add back any deferred free space that has not been
 		 * added back into the in-core free tree yet. This will
 		 * ensure that we don't end up with a space map histogram
 		 * that is completely empty unless the metaslab is fully
 		 * allocated.
 		 */
 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 			space_map_histogram_add(msp->ms_sm,
 			    msp->ms_defer[t], tx);
 		}
 	}
 
 	/*
 	 * Always add the free space from this sync pass to the space
 	 * map histogram. We want to make sure that the on-disk histogram
 	 * accounts for all free space. If the space map is not loaded,
 	 * then we will lose some accuracy but will correct it the next
 	 * time we load the space map.
 	 */
 	space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
 	metaslab_aux_histograms_update(msp);
 
 	metaslab_group_histogram_add(mg, msp);
 	metaslab_group_histogram_verify(mg);
 	metaslab_class_histogram_verify(mg->mg_class);
 
 	/*
 	 * For sync pass 1, we avoid traversing this txg's free range tree
 	 * and instead will just swap the pointers for freeing and freed.
 	 * We can safely do this since the freed_tree is guaranteed to be
 	 * empty on the initial pass.
 	 *
 	 * Keep in mind that even if we are currently using a log spacemap
 	 * we want current frees to end up in the ms_allocatable (but not
 	 * get appended to the ms_sm) so their ranges can be reused as usual.
 	 */
 	if (spa_sync_pass(spa) == 1) {
 		range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
 		ASSERT0(msp->ms_allocated_this_txg);
 	} else {
 		range_tree_vacate(msp->ms_freeing,
 		    range_tree_add, msp->ms_freed);
 	}
 	msp->ms_allocated_this_txg += range_tree_space(alloctree);
 	range_tree_vacate(alloctree, NULL, NULL);
 
 	ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
 	ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
 	    & TXG_MASK]));
 	ASSERT0(range_tree_space(msp->ms_freeing));
 	ASSERT0(range_tree_space(msp->ms_checkpointing));
 
 	mutex_exit(&msp->ms_lock);
 
 	/*
 	 * Verify that the space map object ID has been recorded in the
 	 * vdev_ms_array.
 	 */
 	uint64_t object;
 	VERIFY0(dmu_read(mos, vd->vdev_ms_array,
 	    msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0));
 	VERIFY3U(object, ==, space_map_object(msp->ms_sm));
 
 	mutex_exit(&msp->ms_sync_lock);
 	dmu_tx_commit(tx);
 }
 
 static void
 metaslab_evict(metaslab_t *msp, uint64_t txg)
 {
 	if (!msp->ms_loaded || msp->ms_disabled != 0)
 		return;
 
 	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
 		VERIFY0(range_tree_space(
 		    msp->ms_allocating[(txg + t) & TXG_MASK]));
 	}
 	if (msp->ms_allocator != -1)
 		metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK);
 
 	if (!metaslab_debug_unload)
 		metaslab_unload(msp);
 }
 
 /*
  * Called after a transaction group has completely synced to mark
  * all of the metaslab's free space as usable.
  */
 void
 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	range_tree_t **defer_tree;
 	int64_t alloc_delta, defer_delta;
 	boolean_t defer_allowed = B_TRUE;
 
 	ASSERT(!vd->vdev_ishole);
 
 	mutex_enter(&msp->ms_lock);
 
 	if (msp->ms_new) {
 		/* this is a new metaslab, add its capacity to the vdev */
 		metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
 
 		/* there should be no allocations nor frees at this point */
 		VERIFY0(msp->ms_allocated_this_txg);
 		VERIFY0(range_tree_space(msp->ms_freed));
 	}
 
 	ASSERT0(range_tree_space(msp->ms_freeing));
 	ASSERT0(range_tree_space(msp->ms_checkpointing));
 
 	defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
 
 	uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
 	    metaslab_class_get_alloc(spa_normal_class(spa));
 	if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing ||
 	    vd->vdev_rz_expanding) {
 		defer_allowed = B_FALSE;
 	}
 
 	defer_delta = 0;
 	alloc_delta = msp->ms_allocated_this_txg -
 	    range_tree_space(msp->ms_freed);
 
 	if (defer_allowed) {
 		defer_delta = range_tree_space(msp->ms_freed) -
 		    range_tree_space(*defer_tree);
 	} else {
 		defer_delta -= range_tree_space(*defer_tree);
 	}
 	metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
 	    defer_delta, 0);
 
 	if (spa_syncing_log_sm(spa) == NULL) {
 		/*
 		 * If there's a metaslab_load() in progress and we don't have
 		 * a log space map, it means that we probably wrote to the
 		 * metaslab's space map. If this is the case, we need to
 		 * make sure that we wait for the load to complete so that we
 		 * have a consistent view at the in-core side of the metaslab.
 		 */
 		metaslab_load_wait(msp);
 	} else {
 		ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 	}
 
 	/*
 	 * When auto-trimming is enabled, free ranges which are added to
 	 * ms_allocatable are also be added to ms_trim.  The ms_trim tree is
 	 * periodically consumed by the vdev_autotrim_thread() which issues
 	 * trims for all ranges and then vacates the tree.  The ms_trim tree
 	 * can be discarded at any time with the sole consequence of recent
 	 * frees not being trimmed.
 	 */
 	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
 		range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim);
 		if (!defer_allowed) {
 			range_tree_walk(msp->ms_freed, range_tree_add,
 			    msp->ms_trim);
 		}
 	} else {
 		range_tree_vacate(msp->ms_trim, NULL, NULL);
 	}
 
 	/*
 	 * Move the frees from the defer_tree back to the free
 	 * range tree (if it's loaded). Swap the freed_tree and
 	 * the defer_tree -- this is safe to do because we've
 	 * just emptied out the defer_tree.
 	 */
 	range_tree_vacate(*defer_tree,
 	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
 	if (defer_allowed) {
 		range_tree_swap(&msp->ms_freed, defer_tree);
 	} else {
 		range_tree_vacate(msp->ms_freed,
 		    msp->ms_loaded ? range_tree_add : NULL,
 		    msp->ms_allocatable);
 	}
 
 	msp->ms_synced_length = space_map_length(msp->ms_sm);
 
 	msp->ms_deferspace += defer_delta;
 	ASSERT3S(msp->ms_deferspace, >=, 0);
 	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
 	if (msp->ms_deferspace != 0) {
 		/*
 		 * Keep syncing this metaslab until all deferred frees
 		 * are back in circulation.
 		 */
 		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
 	}
 	metaslab_aux_histograms_update_done(msp, defer_allowed);
 
 	if (msp->ms_new) {
 		msp->ms_new = B_FALSE;
 		mutex_enter(&mg->mg_lock);
 		mg->mg_ms_ready++;
 		mutex_exit(&mg->mg_lock);
 	}
 
 	/*
 	 * Re-sort metaslab within its group now that we've adjusted
 	 * its allocatable space.
 	 */
 	metaslab_recalculate_weight_and_sort(msp);
 
 	ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
 	ASSERT0(range_tree_space(msp->ms_freeing));
 	ASSERT0(range_tree_space(msp->ms_freed));
 	ASSERT0(range_tree_space(msp->ms_checkpointing));
 	msp->ms_allocating_total -= msp->ms_allocated_this_txg;
 	msp->ms_allocated_this_txg = 0;
 	mutex_exit(&msp->ms_lock);
 }
 
 void
 metaslab_sync_reassess(metaslab_group_t *mg)
 {
 	spa_t *spa = mg->mg_class->mc_spa;
 
 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 	metaslab_group_alloc_update(mg);
 	mg->mg_fragmentation = metaslab_group_fragmentation(mg);
 
 	/*
 	 * Preload the next potential metaslabs but only on active
 	 * metaslab groups. We can get into a state where the metaslab
 	 * is no longer active since we dirty metaslabs as we remove a
 	 * a device, thus potentially making the metaslab group eligible
 	 * for preloading.
 	 */
 	if (mg->mg_activation_count > 0) {
 		metaslab_group_preload(mg);
 	}
 	spa_config_exit(spa, SCL_ALLOC, FTAG);
 }
 
 /*
  * When writing a ditto block (i.e. more than one DVA for a given BP) on
  * the same vdev as an existing DVA of this BP, then try to allocate it
  * on a different metaslab than existing DVAs (i.e. a unique metaslab).
  */
 static boolean_t
 metaslab_is_unique(metaslab_t *msp, dva_t *dva)
 {
 	uint64_t dva_ms_id;
 
 	if (DVA_GET_ASIZE(dva) == 0)
 		return (B_TRUE);
 
 	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
 		return (B_TRUE);
 
 	dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
 
 	return (msp->ms_id != dva_ms_id);
 }
 
 /*
  * ==========================================================================
  * Metaslab allocation tracing facility
  * ==========================================================================
  */
 
 /*
  * Add an allocation trace element to the allocation tracing list.
  */
 static void
 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
     metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
     int allocator)
 {
 	metaslab_alloc_trace_t *mat;
 
 	if (!metaslab_trace_enabled)
 		return;
 
 	/*
 	 * When the tracing list reaches its maximum we remove
 	 * the second element in the list before adding a new one.
 	 * By removing the second element we preserve the original
 	 * entry as a clue to what allocations steps have already been
 	 * performed.
 	 */
 	if (zal->zal_size == metaslab_trace_max_entries) {
 		metaslab_alloc_trace_t *mat_next;
 #ifdef ZFS_DEBUG
 		panic("too many entries in allocation list");
 #endif
 		METASLABSTAT_BUMP(metaslabstat_trace_over_limit);
 		zal->zal_size--;
 		mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
 		list_remove(&zal->zal_list, mat_next);
 		kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
 	}
 
 	mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
 	list_link_init(&mat->mat_list_node);
 	mat->mat_mg = mg;
 	mat->mat_msp = msp;
 	mat->mat_size = psize;
 	mat->mat_dva_id = dva_id;
 	mat->mat_offset = offset;
 	mat->mat_weight = 0;
 	mat->mat_allocator = allocator;
 
 	if (msp != NULL)
 		mat->mat_weight = msp->ms_weight;
 
 	/*
 	 * The list is part of the zio so locking is not required. Only
 	 * a single thread will perform allocations for a given zio.
 	 */
 	list_insert_tail(&zal->zal_list, mat);
 	zal->zal_size++;
 
 	ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
 }
 
 void
 metaslab_trace_init(zio_alloc_list_t *zal)
 {
 	list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
 	    offsetof(metaslab_alloc_trace_t, mat_list_node));
 	zal->zal_size = 0;
 }
 
 void
 metaslab_trace_fini(zio_alloc_list_t *zal)
 {
 	metaslab_alloc_trace_t *mat;
 
 	while ((mat = list_remove_head(&zal->zal_list)) != NULL)
 		kmem_cache_free(metaslab_alloc_trace_cache, mat);
 	list_destroy(&zal->zal_list);
 	zal->zal_size = 0;
 }
 
 /*
  * ==========================================================================
  * Metaslab block operations
  * ==========================================================================
  */
 
 static void
 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, const void *tag,
     int flags, int allocator)
 {
 	if (!(flags & METASLAB_ASYNC_ALLOC) ||
 	    (flags & METASLAB_DONT_THROTTLE))
 		return;
 
 	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
 	if (!mg->mg_class->mc_alloc_throttle_enabled)
 		return;
 
 	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 	(void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag);
 }
 
 static void
 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
 {
 	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 	metaslab_class_allocator_t *mca =
 	    &mg->mg_class->mc_allocator[allocator];
 	uint64_t max = mg->mg_max_alloc_queue_depth;
 	uint64_t cur = mga->mga_cur_max_alloc_queue_depth;
 	while (cur < max) {
 		if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth,
 		    cur, cur + 1) == cur) {
 			atomic_inc_64(&mca->mca_alloc_max_slots);
 			return;
 		}
 		cur = mga->mga_cur_max_alloc_queue_depth;
 	}
 }
 
 void
 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, const void *tag,
     int flags, int allocator, boolean_t io_complete)
 {
 	if (!(flags & METASLAB_ASYNC_ALLOC) ||
 	    (flags & METASLAB_DONT_THROTTLE))
 		return;
 
 	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
 	if (!mg->mg_class->mc_alloc_throttle_enabled)
 		return;
 
 	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 	(void) zfs_refcount_remove(&mga->mga_alloc_queue_depth, tag);
 	if (io_complete)
 		metaslab_group_increment_qdepth(mg, allocator);
 }
 
 void
 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, const void *tag,
     int allocator)
 {
 #ifdef ZFS_DEBUG
 	const dva_t *dva = bp->blk_dva;
 	int ndvas = BP_GET_NDVAS(bp);
 
 	for (int d = 0; d < ndvas; d++) {
 		uint64_t vdev = DVA_GET_VDEV(&dva[d]);
 		metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
 		metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 		VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag));
 	}
 #endif
 }
 
 static uint64_t
 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
 {
 	uint64_t start;
 	range_tree_t *rt = msp->ms_allocatable;
 	metaslab_class_t *mc = msp->ms_group->mg_class;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	VERIFY(!msp->ms_condensing);
 	VERIFY0(msp->ms_disabled);
 	VERIFY0(msp->ms_new);
 
 	start = mc->mc_ops->msop_alloc(msp, size);
 	if (start != -1ULL) {
 		metaslab_group_t *mg = msp->ms_group;
 		vdev_t *vd = mg->mg_vd;
 
 		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
 		range_tree_remove(rt, start, size);
 		range_tree_clear(msp->ms_trim, start, size);
 
 		if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
 			vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
 
 		range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
 		msp->ms_allocating_total += size;
 
 		/* Track the last successful allocation */
 		msp->ms_alloc_txg = txg;
 		metaslab_verify_space(msp, txg);
 	}
 
 	/*
 	 * Now that we've attempted the allocation we need to update the
 	 * metaslab's maximum block size since it may have changed.
 	 */
 	msp->ms_max_size = metaslab_largest_allocatable(msp);
 	return (start);
 }
 
 /*
  * Find the metaslab with the highest weight that is less than what we've
  * already tried.  In the common case, this means that we will examine each
  * metaslab at most once. Note that concurrent callers could reorder metaslabs
  * by activation/passivation once we have dropped the mg_lock. If a metaslab is
  * activated by another thread, and we fail to allocate from the metaslab we
  * have selected, we may not try the newly-activated metaslab, and instead
  * activate another metaslab.  This is not optimal, but generally does not cause
  * any problems (a possible exception being if every metaslab is completely full
  * except for the newly-activated metaslab which we fail to examine).
  */
 static metaslab_t *
 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
     dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
     boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
     boolean_t *was_active)
 {
 	avl_index_t idx;
 	avl_tree_t *t = &mg->mg_metaslab_tree;
 	metaslab_t *msp = avl_find(t, search, &idx);
 	if (msp == NULL)
 		msp = avl_nearest(t, idx, AVL_AFTER);
 
 	uint_t tries = 0;
 	for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
 		int i;
 
 		if (!try_hard && tries > zfs_metaslab_find_max_tries) {
 			METASLABSTAT_BUMP(metaslabstat_too_many_tries);
 			return (NULL);
 		}
 		tries++;
 
 		if (!metaslab_should_allocate(msp, asize, try_hard)) {
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_TOO_SMALL, allocator);
 			continue;
 		}
 
 		/*
 		 * If the selected metaslab is condensing or disabled, or
 		 * hasn't gone through a metaslab_sync_done(), then skip it.
 		 */
 		if (msp->ms_condensing || msp->ms_disabled > 0 || msp->ms_new)
 			continue;
 
 		*was_active = msp->ms_allocator != -1;
 		/*
 		 * If we're activating as primary, this is our first allocation
 		 * from this disk, so we don't need to check how close we are.
 		 * If the metaslab under consideration was already active,
 		 * we're getting desperate enough to steal another allocator's
 		 * metaslab, so we still don't care about distances.
 		 */
 		if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
 			break;
 
 		for (i = 0; i < d; i++) {
 			if (want_unique &&
 			    !metaslab_is_unique(msp, &dva[i]))
 				break;  /* try another metaslab */
 		}
 		if (i == d)
 			break;
 	}
 
 	if (msp != NULL) {
 		search->ms_weight = msp->ms_weight;
 		search->ms_start = msp->ms_start + 1;
 		search->ms_allocator = msp->ms_allocator;
 		search->ms_primary = msp->ms_primary;
 	}
 	return (msp);
 }
 
 static void
 metaslab_active_mask_verify(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
 		return;
 
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0)
 		return;
 
 	if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) {
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
 		VERIFY3S(msp->ms_allocator, !=, -1);
 		VERIFY(msp->ms_primary);
 		return;
 	}
 
 	if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) {
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
 		VERIFY3S(msp->ms_allocator, !=, -1);
 		VERIFY(!msp->ms_primary);
 		return;
 	}
 
 	if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
 		VERIFY3S(msp->ms_allocator, ==, -1);
 		return;
 	}
 }
 
 static uint64_t
 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
     uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
     int allocator, boolean_t try_hard)
 {
 	metaslab_t *msp = NULL;
 	uint64_t offset = -1ULL;
 
 	uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY;
 	for (int i = 0; i < d; i++) {
 		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
 		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
 			activation_weight = METASLAB_WEIGHT_SECONDARY;
 		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
 		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
 			activation_weight = METASLAB_WEIGHT_CLAIM;
 			break;
 		}
 	}
 
 	/*
 	 * If we don't have enough metaslabs active to fill the entire array, we
 	 * just use the 0th slot.
 	 */
 	if (mg->mg_ms_ready < mg->mg_allocators * 3)
 		allocator = 0;
 	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 
 	ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
 
 	metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
 	search->ms_weight = UINT64_MAX;
 	search->ms_start = 0;
 	/*
 	 * At the end of the metaslab tree are the already-active metaslabs,
 	 * first the primaries, then the secondaries. When we resume searching
 	 * through the tree, we need to consider ms_allocator and ms_primary so
 	 * we start in the location right after where we left off, and don't
 	 * accidentally loop forever considering the same metaslabs.
 	 */
 	search->ms_allocator = -1;
 	search->ms_primary = B_TRUE;
 	for (;;) {
 		boolean_t was_active = B_FALSE;
 
 		mutex_enter(&mg->mg_lock);
 
 		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
 		    mga->mga_primary != NULL) {
 			msp = mga->mga_primary;
 
 			/*
 			 * Even though we don't hold the ms_lock for the
 			 * primary metaslab, those fields should not
 			 * change while we hold the mg_lock. Thus it is
 			 * safe to make assertions on them.
 			 */
 			ASSERT(msp->ms_primary);
 			ASSERT3S(msp->ms_allocator, ==, allocator);
 			ASSERT(msp->ms_loaded);
 
 			was_active = B_TRUE;
 			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
 		    mga->mga_secondary != NULL) {
 			msp = mga->mga_secondary;
 
 			/*
 			 * See comment above about the similar assertions
 			 * for the primary metaslab.
 			 */
 			ASSERT(!msp->ms_primary);
 			ASSERT3S(msp->ms_allocator, ==, allocator);
 			ASSERT(msp->ms_loaded);
 
 			was_active = B_TRUE;
 			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 		} else {
 			msp = find_valid_metaslab(mg, activation_weight, dva, d,
 			    want_unique, asize, allocator, try_hard, zal,
 			    search, &was_active);
 		}
 
 		mutex_exit(&mg->mg_lock);
 		if (msp == NULL) {
 			kmem_free(search, sizeof (*search));
 			return (-1ULL);
 		}
 		mutex_enter(&msp->ms_lock);
 
 		metaslab_active_mask_verify(msp);
 
 		/*
 		 * This code is disabled out because of issues with
 		 * tracepoints in non-gpl kernel modules.
 		 */
 #if 0
 		DTRACE_PROBE3(ms__activation__attempt,
 		    metaslab_t *, msp, uint64_t, activation_weight,
 		    boolean_t, was_active);
 #endif
 
 		/*
 		 * Ensure that the metaslab we have selected is still
 		 * capable of handling our request. It's possible that
 		 * another thread may have changed the weight while we
 		 * were blocked on the metaslab lock. We check the
 		 * active status first to see if we need to set_selected_txg
 		 * a new metaslab.
 		 */
 		if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
 			ASSERT3S(msp->ms_allocator, ==, -1);
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		/*
 		 * If the metaslab was activated for another allocator
 		 * while we were waiting in the ms_lock above, or it's
 		 * a primary and we're seeking a secondary (or vice versa),
 		 * we go back and select a new metaslab.
 		 */
 		if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
 		    (msp->ms_allocator != -1) &&
 		    (msp->ms_allocator != allocator || ((activation_weight ==
 		    METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
 			ASSERT(msp->ms_loaded);
 			ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) ||
 			    msp->ms_allocator != -1);
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		/*
 		 * This metaslab was used for claiming regions allocated
 		 * by the ZIL during pool import. Once these regions are
 		 * claimed we don't need to keep the CLAIM bit set
 		 * anymore. Passivate this metaslab to zero its activation
 		 * mask.
 		 */
 		if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
 		    activation_weight != METASLAB_WEIGHT_CLAIM) {
 			ASSERT(msp->ms_loaded);
 			ASSERT3S(msp->ms_allocator, ==, -1);
 			metaslab_passivate(msp, msp->ms_weight &
 			    ~METASLAB_WEIGHT_CLAIM);
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		metaslab_set_selected_txg(msp, txg);
 
 		int activation_error =
 		    metaslab_activate(msp, allocator, activation_weight);
 		metaslab_active_mask_verify(msp);
 
 		/*
 		 * If the metaslab was activated by another thread for
 		 * another allocator or activation_weight (EBUSY), or it
 		 * failed because another metaslab was assigned as primary
 		 * for this allocator (EEXIST) we continue using this
 		 * metaslab for our allocation, rather than going on to a
 		 * worse metaslab (we waited for that metaslab to be loaded
 		 * after all).
 		 *
 		 * If the activation failed due to an I/O error or ENOSPC we
 		 * skip to the next metaslab.
 		 */
 		boolean_t activated;
 		if (activation_error == 0) {
 			activated = B_TRUE;
 		} else if (activation_error == EBUSY ||
 		    activation_error == EEXIST) {
 			activated = B_FALSE;
 		} else {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 		ASSERT(msp->ms_loaded);
 
 		/*
 		 * Now that we have the lock, recheck to see if we should
 		 * continue to use this metaslab for this allocation. The
 		 * the metaslab is now loaded so metaslab_should_allocate()
 		 * can accurately determine if the allocation attempt should
 		 * proceed.
 		 */
 		if (!metaslab_should_allocate(msp, asize, try_hard)) {
 			/* Passivate this metaslab and select a new one. */
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_TOO_SMALL, allocator);
 			goto next;
 		}
 
 		/*
 		 * If this metaslab is currently condensing then pick again
 		 * as we can't manipulate this metaslab until it's committed
 		 * to disk. If this metaslab is being initialized, we shouldn't
 		 * allocate from it since the allocated region might be
 		 * overwritten after allocation.
 		 */
 		if (msp->ms_condensing) {
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_CONDENSING, allocator);
 			if (activated) {
 				metaslab_passivate(msp, msp->ms_weight &
 				    ~METASLAB_ACTIVE_MASK);
 			}
 			mutex_exit(&msp->ms_lock);
 			continue;
 		} else if (msp->ms_disabled > 0) {
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_DISABLED, allocator);
 			if (activated) {
 				metaslab_passivate(msp, msp->ms_weight &
 				    ~METASLAB_ACTIVE_MASK);
 			}
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		offset = metaslab_block_alloc(msp, asize, txg);
 		metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
 
 		if (offset != -1ULL) {
 			/* Proactively passivate the metaslab, if needed */
 			if (activated)
 				metaslab_segment_may_passivate(msp);
 			break;
 		}
 next:
 		ASSERT(msp->ms_loaded);
 
 		/*
 		 * This code is disabled out because of issues with
 		 * tracepoints in non-gpl kernel modules.
 		 */
 #if 0
 		DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp,
 		    uint64_t, asize);
 #endif
 
 		/*
 		 * We were unable to allocate from this metaslab so determine
 		 * a new weight for this metaslab. Now that we have loaded
 		 * the metaslab we can provide a better hint to the metaslab
 		 * selector.
 		 *
 		 * For space-based metaslabs, we use the maximum block size.
 		 * This information is only available when the metaslab
 		 * is loaded and is more accurate than the generic free
 		 * space weight that was calculated by metaslab_weight().
 		 * This information allows us to quickly compare the maximum
 		 * available allocation in the metaslab to the allocation
 		 * size being requested.
 		 *
 		 * For segment-based metaslabs, determine the new weight
 		 * based on the highest bucket in the range tree. We
 		 * explicitly use the loaded segment weight (i.e. the range
 		 * tree histogram) since it contains the space that is
 		 * currently available for allocation and is accurate
 		 * even within a sync pass.
 		 */
 		uint64_t weight;
 		if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
 			weight = metaslab_largest_allocatable(msp);
 			WEIGHT_SET_SPACEBASED(weight);
 		} else {
 			weight = metaslab_weight_from_range_tree(msp);
 		}
 
 		if (activated) {
 			metaslab_passivate(msp, weight);
 		} else {
 			/*
 			 * For the case where we use the metaslab that is
 			 * active for another allocator we want to make
 			 * sure that we retain the activation mask.
 			 *
 			 * Note that we could attempt to use something like
 			 * metaslab_recalculate_weight_and_sort() that
 			 * retains the activation mask here. That function
 			 * uses metaslab_weight() to set the weight though
 			 * which is not as accurate as the calculations
 			 * above.
 			 */
 			weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
 			metaslab_group_sort(mg, msp, weight);
 		}
 		metaslab_active_mask_verify(msp);
 
 		/*
 		 * We have just failed an allocation attempt, check
 		 * that metaslab_should_allocate() agrees. Otherwise,
 		 * we may end up in an infinite loop retrying the same
 		 * metaslab.
 		 */
 		ASSERT(!metaslab_should_allocate(msp, asize, try_hard));
 
 		mutex_exit(&msp->ms_lock);
 	}
 	mutex_exit(&msp->ms_lock);
 	kmem_free(search, sizeof (*search));
 	return (offset);
 }
 
 static uint64_t
 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
     uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
     int allocator, boolean_t try_hard)
 {
 	uint64_t offset;
 
 	offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
 	    dva, d, allocator, try_hard);
 
 	mutex_enter(&mg->mg_lock);
 	if (offset == -1ULL) {
 		mg->mg_failed_allocations++;
 		metaslab_trace_add(zal, mg, NULL, asize, d,
 		    TRACE_GROUP_FAILURE, allocator);
 		if (asize == SPA_GANGBLOCKSIZE) {
 			/*
 			 * This metaslab group was unable to allocate
 			 * the minimum gang block size so it must be out of
 			 * space. We must notify the allocation throttle
 			 * to start skipping allocation attempts to this
 			 * metaslab group until more space becomes available.
 			 * Note: this failure cannot be caused by the
 			 * allocation throttle since the allocation throttle
 			 * is only responsible for skipping devices and
 			 * not failing block allocations.
 			 */
 			mg->mg_no_free_space = B_TRUE;
 		}
 	}
 	mg->mg_allocations++;
 	mutex_exit(&mg->mg_lock);
 	return (offset);
 }
 
 /*
  * Allocate a block for the specified i/o.
  */
 int
 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
     zio_alloc_list_t *zal, int allocator)
 {
 	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
 	metaslab_group_t *mg, *rotor;
 	vdev_t *vd;
 	boolean_t try_hard = B_FALSE;
 
 	ASSERT(!DVA_IS_VALID(&dva[d]));
 
 	/*
 	 * For testing, make some blocks above a certain size be gang blocks.
 	 * This will result in more split blocks when using device removal,
 	 * and a large number of split blocks coupled with ztest-induced
 	 * damage can result in extremely long reconstruction times.  This
 	 * will also test spilling from special to normal.
 	 */
 	if (psize >= metaslab_force_ganging &&
 	    metaslab_force_ganging_pct > 0 &&
 	    (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) {
 		metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
 		    allocator);
 		return (SET_ERROR(ENOSPC));
 	}
 
 	/*
 	 * Start at the rotor and loop through all mgs until we find something.
 	 * Note that there's no locking on mca_rotor or mca_aliquot because
 	 * nothing actually breaks if we miss a few updates -- we just won't
 	 * allocate quite as evenly.  It all balances out over time.
 	 *
 	 * If we are doing ditto or log blocks, try to spread them across
 	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
 	 * allocated all of our ditto blocks, then try and spread them out on
 	 * that vdev as much as possible.  If it turns out to not be possible,
 	 * gradually lower our standards until anything becomes acceptable.
 	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
 	 * gives us hope of containing our fault domains to something we're
 	 * able to reason about.  Otherwise, any two top-level vdev failures
 	 * will guarantee the loss of data.  With consecutive allocation,
 	 * only two adjacent top-level vdev failures will result in data loss.
 	 *
 	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
 	 * ourselves on the same vdev as our gang block header.  That
 	 * way, we can hope for locality in vdev_cache, plus it makes our
 	 * fault domains something tractable.
 	 */
 	if (hintdva) {
 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
 
 		/*
 		 * It's possible the vdev we're using as the hint no
 		 * longer exists or its mg has been closed (e.g. by
 		 * device removal).  Consult the rotor when
 		 * all else fails.
 		 */
 		if (vd != NULL && vd->vdev_mg != NULL) {
 			mg = vdev_get_mg(vd, mc);
 
 			if (flags & METASLAB_HINTBP_AVOID)
 				mg = mg->mg_next;
 		} else {
 			mg = mca->mca_rotor;
 		}
 	} else if (d != 0) {
 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
 		mg = vd->vdev_mg->mg_next;
 	} else {
 		ASSERT(mca->mca_rotor != NULL);
 		mg = mca->mca_rotor;
 	}
 
 	/*
 	 * If the hint put us into the wrong metaslab class, or into a
 	 * metaslab group that has been passivated, just follow the rotor.
 	 */
 	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
 		mg = mca->mca_rotor;
 
 	rotor = mg;
 top:
 	do {
 		boolean_t allocatable;
 
 		ASSERT(mg->mg_activation_count == 1);
 		vd = mg->mg_vd;
 
 		/*
 		 * Don't allocate from faulted devices.
 		 */
 		if (try_hard) {
 			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
 			allocatable = vdev_allocatable(vd);
 			spa_config_exit(spa, SCL_ZIO, FTAG);
 		} else {
 			allocatable = vdev_allocatable(vd);
 		}
 
 		/*
 		 * Determine if the selected metaslab group is eligible
 		 * for allocations. If we're ganging then don't allow
 		 * this metaslab group to skip allocations since that would
 		 * inadvertently return ENOSPC and suspend the pool
 		 * even though space is still available.
 		 */
 		if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
 			allocatable = metaslab_group_allocatable(mg, rotor,
 			    flags, psize, allocator, d);
 		}
 
 		if (!allocatable) {
 			metaslab_trace_add(zal, mg, NULL, psize, d,
 			    TRACE_NOT_ALLOCATABLE, allocator);
 			goto next;
 		}
 
 		/*
 		 * Avoid writing single-copy data to an unhealthy,
 		 * non-redundant vdev, unless we've already tried all
 		 * other vdevs.
 		 */
 		if (vd->vdev_state < VDEV_STATE_HEALTHY &&
 		    d == 0 && !try_hard && vd->vdev_children == 0) {
 			metaslab_trace_add(zal, mg, NULL, psize, d,
 			    TRACE_VDEV_ERROR, allocator);
 			goto next;
 		}
 
 		ASSERT(mg->mg_class == mc);
 
 		uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg);
 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
 
 		/*
 		 * If we don't need to try hard, then require that the
 		 * block be on a different metaslab from any other DVAs
 		 * in this BP (unique=true).  If we are trying hard, then
 		 * allow any metaslab to be used (unique=false).
 		 */
 		uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
 		    !try_hard, dva, d, allocator, try_hard);
 
 		if (offset != -1ULL) {
 			/*
 			 * If we've just selected this metaslab group,
 			 * figure out whether the corresponding vdev is
 			 * over- or under-used relative to the pool,
 			 * and set an allocation bias to even it out.
 			 *
 			 * Bias is also used to compensate for unequally
 			 * sized vdevs so that space is allocated fairly.
 			 */
 			if (mca->mca_aliquot == 0 && metaslab_bias_enabled) {
 				vdev_stat_t *vs = &vd->vdev_stat;
 				int64_t vs_free = vs->vs_space - vs->vs_alloc;
 				int64_t mc_free = mc->mc_space - mc->mc_alloc;
 				int64_t ratio;
 
 				/*
 				 * Calculate how much more or less we should
 				 * try to allocate from this device during
 				 * this iteration around the rotor.
 				 *
 				 * This basically introduces a zero-centered
 				 * bias towards the devices with the most
 				 * free space, while compensating for vdev
 				 * size differences.
 				 *
 				 * Examples:
 				 *  vdev V1 = 16M/128M
 				 *  vdev V2 = 16M/128M
 				 *  ratio(V1) = 100% ratio(V2) = 100%
 				 *
 				 *  vdev V1 = 16M/128M
 				 *  vdev V2 = 64M/128M
 				 *  ratio(V1) = 127% ratio(V2) =  72%
 				 *
 				 *  vdev V1 = 16M/128M
 				 *  vdev V2 = 64M/512M
 				 *  ratio(V1) =  40% ratio(V2) = 160%
 				 */
 				ratio = (vs_free * mc->mc_alloc_groups * 100) /
 				    (mc_free + 1);
 				mg->mg_bias = ((ratio - 100) *
 				    (int64_t)mg->mg_aliquot) / 100;
 			} else if (!metaslab_bias_enabled) {
 				mg->mg_bias = 0;
 			}
 
 			if ((flags & METASLAB_ZIL) ||
 			    atomic_add_64_nv(&mca->mca_aliquot, asize) >=
 			    mg->mg_aliquot + mg->mg_bias) {
 				mca->mca_rotor = mg->mg_next;
 				mca->mca_aliquot = 0;
 			}
 
 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
 			DVA_SET_OFFSET(&dva[d], offset);
 			DVA_SET_GANG(&dva[d],
 			    ((flags & METASLAB_GANG_HEADER) ? 1 : 0));
 			DVA_SET_ASIZE(&dva[d], asize);
 
 			return (0);
 		}
 next:
 		mca->mca_rotor = mg->mg_next;
 		mca->mca_aliquot = 0;
 	} while ((mg = mg->mg_next) != rotor);
 
 	/*
 	 * If we haven't tried hard, perhaps do so now.
 	 */
 	if (!try_hard && (zfs_metaslab_try_hard_before_gang ||
 	    GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 ||
 	    psize <= 1 << spa->spa_min_ashift)) {
 		METASLABSTAT_BUMP(metaslabstat_try_hard);
 		try_hard = B_TRUE;
 		goto top;
 	}
 
 	memset(&dva[d], 0, sizeof (dva_t));
 
 	metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
 	return (SET_ERROR(ENOSPC));
 }
 
 void
 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
     boolean_t checkpoint)
 {
 	metaslab_t *msp;
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
 
 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	VERIFY(!msp->ms_condensing);
 	VERIFY3U(offset, >=, msp->ms_start);
 	VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 	VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
 
 	metaslab_check_free_impl(vd, offset, asize);
 
 	mutex_enter(&msp->ms_lock);
 	if (range_tree_is_empty(msp->ms_freeing) &&
 	    range_tree_is_empty(msp->ms_checkpointing)) {
 		vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
 	}
 
 	if (checkpoint) {
 		ASSERT(spa_has_checkpoint(spa));
 		range_tree_add(msp->ms_checkpointing, offset, asize);
 	} else {
 		range_tree_add(msp->ms_freeing, offset, asize);
 	}
 	mutex_exit(&msp->ms_lock);
 }
 
 void
 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	(void) inner_offset;
 	boolean_t *checkpoint = arg;
 
 	ASSERT3P(checkpoint, !=, NULL);
 
 	if (vd->vdev_ops->vdev_op_remap != NULL)
 		vdev_indirect_mark_obsolete(vd, offset, size);
 	else
 		metaslab_free_impl(vd, offset, size, *checkpoint);
 }
 
 static void
 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
     boolean_t checkpoint)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
 	if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
 		return;
 
 	if (spa->spa_vdev_removal != NULL &&
 	    spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
 	    vdev_is_concrete(vd)) {
 		/*
 		 * Note: we check if the vdev is concrete because when
 		 * we complete the removal, we first change the vdev to be
 		 * an indirect vdev (in open context), and then (in syncing
 		 * context) clear spa_vdev_removal.
 		 */
 		free_from_removing_vdev(vd, offset, size);
 	} else if (vd->vdev_ops->vdev_op_remap != NULL) {
 		vdev_indirect_mark_obsolete(vd, offset, size);
 		vd->vdev_ops->vdev_op_remap(vd, offset, size,
 		    metaslab_free_impl_cb, &checkpoint);
 	} else {
 		metaslab_free_concrete(vd, offset, size, checkpoint);
 	}
 }
 
 typedef struct remap_blkptr_cb_arg {
 	blkptr_t *rbca_bp;
 	spa_remap_cb_t rbca_cb;
 	vdev_t *rbca_remap_vd;
 	uint64_t rbca_remap_offset;
 	void *rbca_cb_arg;
 } remap_blkptr_cb_arg_t;
 
 static void
 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	remap_blkptr_cb_arg_t *rbca = arg;
 	blkptr_t *bp = rbca->rbca_bp;
 
 	/* We can not remap split blocks. */
 	if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
 		return;
 	ASSERT0(inner_offset);
 
 	if (rbca->rbca_cb != NULL) {
 		/*
 		 * At this point we know that we are not handling split
 		 * blocks and we invoke the callback on the previous
 		 * vdev which must be indirect.
 		 */
 		ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
 
 		rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
 		    rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
 
 		/* set up remap_blkptr_cb_arg for the next call */
 		rbca->rbca_remap_vd = vd;
 		rbca->rbca_remap_offset = offset;
 	}
 
 	/*
 	 * The phys birth time is that of dva[0].  This ensures that we know
 	 * when each dva was written, so that resilver can determine which
 	 * blocks need to be scrubbed (i.e. those written during the time
 	 * the vdev was offline).  It also ensures that the key used in
 	 * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
 	 * we didn't change the phys_birth, a lookup in the ARC for a
 	 * remapped BP could find the data that was previously stored at
 	 * this vdev + offset.
 	 */
 	vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
 	    DVA_GET_VDEV(&bp->blk_dva[0]));
 	vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
-	bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
+	uint64_t physical_birth = vdev_indirect_births_physbirth(vib,
 	    DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
+	BP_SET_PHYSICAL_BIRTH(bp, physical_birth);
 
 	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
 	DVA_SET_OFFSET(&bp->blk_dva[0], offset);
 }
 
 /*
  * If the block pointer contains any indirect DVAs, modify them to refer to
  * concrete DVAs.  Note that this will sometimes not be possible, leaving
  * the indirect DVA in place.  This happens if the indirect DVA spans multiple
  * segments in the mapping (i.e. it is a "split block").
  *
  * If the BP was remapped, calls the callback on the original dva (note the
  * callback can be called multiple times if the original indirect DVA refers
  * to another indirect DVA, etc).
  *
  * Returns TRUE if the BP was remapped.
  */
 boolean_t
 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
 {
 	remap_blkptr_cb_arg_t rbca;
 
 	if (!zfs_remap_blkptr_enable)
 		return (B_FALSE);
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
 		return (B_FALSE);
 
 	/*
 	 * Dedup BP's can not be remapped, because ddt_phys_select() depends
 	 * on DVA[0] being the same in the BP as in the DDT (dedup table).
 	 */
 	if (BP_GET_DEDUP(bp))
 		return (B_FALSE);
 
 	/*
 	 * Gang blocks can not be remapped, because
 	 * zio_checksum_gang_verifier() depends on the DVA[0] that's in
 	 * the BP used to read the gang block header (GBH) being the same
 	 * as the DVA[0] that we allocated for the GBH.
 	 */
 	if (BP_IS_GANG(bp))
 		return (B_FALSE);
 
 	/*
 	 * Embedded BP's have no DVA to remap.
 	 */
 	if (BP_GET_NDVAS(bp) < 1)
 		return (B_FALSE);
 
 	/*
 	 * Note: we only remap dva[0].  If we remapped other dvas, we
 	 * would no longer know what their phys birth txg is.
 	 */
 	dva_t *dva = &bp->blk_dva[0];
 
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t size = DVA_GET_ASIZE(dva);
 	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
 
 	if (vd->vdev_ops->vdev_op_remap == NULL)
 		return (B_FALSE);
 
 	rbca.rbca_bp = bp;
 	rbca.rbca_cb = callback;
 	rbca.rbca_remap_vd = vd;
 	rbca.rbca_remap_offset = offset;
 	rbca.rbca_cb_arg = arg;
 
 	/*
 	 * remap_blkptr_cb() will be called in order for each level of
 	 * indirection, until a concrete vdev is reached or a split block is
 	 * encountered. old_vd and old_offset are updated within the callback
 	 * as we go from the one indirect vdev to the next one (either concrete
 	 * or indirect again) in that order.
 	 */
 	vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
 
 	/* Check if the DVA wasn't remapped because it is a split block */
 	if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Undo the allocation of a DVA which happened in the given transaction group.
  */
 void
 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 {
 	metaslab_t *msp;
 	vdev_t *vd;
 	uint64_t vdev = DVA_GET_VDEV(dva);
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t size = DVA_GET_ASIZE(dva);
 
 	ASSERT(DVA_IS_VALID(dva));
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
 	if (txg > spa_freeze_txg(spa))
 		return;
 
 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) ||
 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
 		zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu",
 		    (u_longlong_t)vdev, (u_longlong_t)offset,
 		    (u_longlong_t)size);
 		return;
 	}
 
 	ASSERT(!vd->vdev_removing);
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
 	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
 
 	if (DVA_GET_GANG(dva))
 		size = vdev_gang_header_asize(vd);
 
 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	mutex_enter(&msp->ms_lock);
 	range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
 	    offset, size);
 	msp->ms_allocating_total -= size;
 
 	VERIFY(!msp->ms_condensing);
 	VERIFY3U(offset, >=, msp->ms_start);
 	VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
 	VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
 	    msp->ms_size);
 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 	range_tree_add(msp->ms_allocatable, offset, size);
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * Free the block represented by the given DVA.
  */
 void
 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
 {
 	uint64_t vdev = DVA_GET_VDEV(dva);
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t size = DVA_GET_ASIZE(dva);
 	vdev_t *vd = vdev_lookup_top(spa, vdev);
 
 	ASSERT(DVA_IS_VALID(dva));
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
 	if (DVA_GET_GANG(dva)) {
 		size = vdev_gang_header_asize(vd);
 	}
 
 	metaslab_free_impl(vd, offset, size, checkpoint);
 }
 
 /*
  * Reserve some allocation slots. The reservation system must be called
  * before we call into the allocator. If there aren't any available slots
  * then the I/O will be throttled until an I/O completes and its slots are
  * freed up. The function returns true if it was successful in placing
  * the reservation.
  */
 boolean_t
 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
     zio_t *zio, int flags)
 {
 	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
 	uint64_t max = mca->mca_alloc_max_slots;
 
 	ASSERT(mc->mc_alloc_throttle_enabled);
 	if (GANG_ALLOCATION(flags) || (flags & METASLAB_MUST_RESERVE) ||
 	    zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) {
 		/*
 		 * The potential race between _count() and _add() is covered
 		 * by the allocator lock in most cases, or irrelevant due to
 		 * GANG_ALLOCATION() or METASLAB_MUST_RESERVE set in others.
 		 * But even if we assume some other non-existing scenario, the
 		 * worst that can happen is few more I/Os get to allocation
 		 * earlier, that is not a problem.
 		 *
 		 * We reserve the slots individually so that we can unreserve
 		 * them individually when an I/O completes.
 		 */
 		zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio);
 		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
 		return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 void
 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
     int allocator, zio_t *zio)
 {
 	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
 
 	ASSERT(mc->mc_alloc_throttle_enabled);
 	zfs_refcount_remove_few(&mca->mca_alloc_slots, slots, zio);
 }
 
 static int
 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
     uint64_t txg)
 {
 	metaslab_t *msp;
 	spa_t *spa = vd->vdev_spa;
 	int error = 0;
 
 	if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
 		return (SET_ERROR(ENXIO));
 
 	ASSERT3P(vd->vdev_ms, !=, NULL);
 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	mutex_enter(&msp->ms_lock);
 
 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) {
 		error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
 		if (error == EBUSY) {
 			ASSERT(msp->ms_loaded);
 			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 			error = 0;
 		}
 	}
 
 	if (error == 0 &&
 	    !range_tree_contains(msp->ms_allocatable, offset, size))
 		error = SET_ERROR(ENOENT);
 
 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
 		mutex_exit(&msp->ms_lock);
 		return (error);
 	}
 
 	VERIFY(!msp->ms_condensing);
 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 	VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
 	    msp->ms_size);
 	range_tree_remove(msp->ms_allocatable, offset, size);
 	range_tree_clear(msp->ms_trim, offset, size);
 
 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(8) */
 		metaslab_class_t *mc = msp->ms_group->mg_class;
 		multilist_sublist_t *mls =
 		    multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
 		if (!multilist_link_active(&msp->ms_class_txg_node)) {
 			msp->ms_selected_txg = txg;
 			multilist_sublist_insert_head(mls, msp);
 		}
 		multilist_sublist_unlock(mls);
 
 		if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
 		range_tree_add(msp->ms_allocating[txg & TXG_MASK],
 		    offset, size);
 		msp->ms_allocating_total += size;
 	}
 
 	mutex_exit(&msp->ms_lock);
 
 	return (0);
 }
 
 typedef struct metaslab_claim_cb_arg_t {
 	uint64_t	mcca_txg;
 	int		mcca_error;
 } metaslab_claim_cb_arg_t;
 
 static void
 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	(void) inner_offset;
 	metaslab_claim_cb_arg_t *mcca_arg = arg;
 
 	if (mcca_arg->mcca_error == 0) {
 		mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
 		    size, mcca_arg->mcca_txg);
 	}
 }
 
 int
 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
 {
 	if (vd->vdev_ops->vdev_op_remap != NULL) {
 		metaslab_claim_cb_arg_t arg;
 
 		/*
 		 * Only zdb(8) can claim on indirect vdevs.  This is used
 		 * to detect leaks of mapped space (that are not accounted
 		 * for in the obsolete counts, spacemap, or bpobj).
 		 */
 		ASSERT(!spa_writeable(vd->vdev_spa));
 		arg.mcca_error = 0;
 		arg.mcca_txg = txg;
 
 		vd->vdev_ops->vdev_op_remap(vd, offset, size,
 		    metaslab_claim_impl_cb, &arg);
 
 		if (arg.mcca_error == 0) {
 			arg.mcca_error = metaslab_claim_concrete(vd,
 			    offset, size, txg);
 		}
 		return (arg.mcca_error);
 	} else {
 		return (metaslab_claim_concrete(vd, offset, size, txg));
 	}
 }
 
 /*
  * Intent log support: upon opening the pool after a crash, notify the SPA
  * of blocks that the intent log has allocated for immediate write, but
  * which are still considered free by the SPA because the last transaction
  * group didn't commit yet.
  */
 static int
 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 {
 	uint64_t vdev = DVA_GET_VDEV(dva);
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t size = DVA_GET_ASIZE(dva);
 	vdev_t *vd;
 
 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
 		return (SET_ERROR(ENXIO));
 	}
 
 	ASSERT(DVA_IS_VALID(dva));
 
 	if (DVA_GET_GANG(dva))
 		size = vdev_gang_header_asize(vd);
 
 	return (metaslab_claim_impl(vd, offset, size, txg));
 }
 
 int
 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
     zio_alloc_list_t *zal, zio_t *zio, int allocator)
 {
 	dva_t *dva = bp->blk_dva;
 	dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
 	int error = 0;
 
-	ASSERT(bp->blk_birth == 0);
-	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
+	ASSERT0(BP_GET_LOGICAL_BIRTH(bp));
+	ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
 
 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 
 	if (mc->mc_allocator[allocator].mca_rotor == NULL) {
 		/* no vdevs in this class */
 		spa_config_exit(spa, SCL_ALLOC, FTAG);
 		return (SET_ERROR(ENOSPC));
 	}
 
 	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
 	ASSERT(BP_GET_NDVAS(bp) == 0);
 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
 	ASSERT3P(zal, !=, NULL);
 
 	for (int d = 0; d < ndvas; d++) {
 		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
 		    txg, flags, zal, allocator);
 		if (error != 0) {
 			for (d--; d >= 0; d--) {
 				metaslab_unalloc_dva(spa, &dva[d], txg);
 				metaslab_group_alloc_decrement(spa,
 				    DVA_GET_VDEV(&dva[d]), zio, flags,
 				    allocator, B_FALSE);
 				memset(&dva[d], 0, sizeof (dva_t));
 			}
 			spa_config_exit(spa, SCL_ALLOC, FTAG);
 			return (error);
 		} else {
 			/*
 			 * Update the metaslab group's queue depth
 			 * based on the newly allocated dva.
 			 */
 			metaslab_group_alloc_increment(spa,
 			    DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
 		}
 	}
 	ASSERT(error == 0);
 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
 
 	spa_config_exit(spa, SCL_ALLOC, FTAG);
 
 	BP_SET_BIRTH(bp, txg, 0);
 
 	return (0);
 }
 
 void
 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
 {
 	const dva_t *dva = bp->blk_dva;
 	int ndvas = BP_GET_NDVAS(bp);
 
 	ASSERT(!BP_IS_HOLE(bp));
-	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
+	ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa));
 
 	/*
 	 * If we have a checkpoint for the pool we need to make sure that
 	 * the blocks that we free that are part of the checkpoint won't be
 	 * reused until the checkpoint is discarded or we revert to it.
 	 *
 	 * The checkpoint flag is passed down the metaslab_free code path
 	 * and is set whenever we want to add a block to the checkpoint's
 	 * accounting. That is, we "checkpoint" blocks that existed at the
 	 * time the checkpoint was created and are therefore referenced by
 	 * the checkpointed uberblock.
 	 *
 	 * Note that, we don't checkpoint any blocks if the current
 	 * syncing txg <= spa_checkpoint_txg. We want these frees to sync
 	 * normally as they will be referenced by the checkpointed uberblock.
 	 */
 	boolean_t checkpoint = B_FALSE;
-	if (bp->blk_birth <= spa->spa_checkpoint_txg &&
+	if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg &&
 	    spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
 		/*
 		 * At this point, if the block is part of the checkpoint
 		 * there is no way it was created in the current txg.
 		 */
 		ASSERT(!now);
 		ASSERT3U(spa_syncing_txg(spa), ==, txg);
 		checkpoint = B_TRUE;
 	}
 
 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
 
 	for (int d = 0; d < ndvas; d++) {
 		if (now) {
 			metaslab_unalloc_dva(spa, &dva[d], txg);
 		} else {
 			ASSERT3U(txg, ==, spa_syncing_txg(spa));
 			metaslab_free_dva(spa, &dva[d], checkpoint);
 		}
 	}
 
 	spa_config_exit(spa, SCL_FREE, FTAG);
 }
 
 int
 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
 {
 	const dva_t *dva = bp->blk_dva;
 	int ndvas = BP_GET_NDVAS(bp);
 	int error = 0;
 
 	ASSERT(!BP_IS_HOLE(bp));
 
 	if (txg != 0) {
 		/*
 		 * First do a dry run to make sure all DVAs are claimable,
 		 * so we don't have to unwind from partial failures below.
 		 */
 		if ((error = metaslab_claim(spa, bp, 0)) != 0)
 			return (error);
 	}
 
 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 
 	for (int d = 0; d < ndvas; d++) {
 		error = metaslab_claim_dva(spa, &dva[d], txg);
 		if (error != 0)
 			break;
 	}
 
 	spa_config_exit(spa, SCL_ALLOC, FTAG);
 
 	ASSERT(error == 0 || txg == 0);
 
 	return (error);
 }
 
 static void
 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	(void) inner, (void) arg;
 
 	if (vd->vdev_ops == &vdev_indirect_ops)
 		return;
 
 	metaslab_check_free_impl(vd, offset, size);
 }
 
 static void
 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
 {
 	metaslab_t *msp;
 	spa_t *spa __maybe_unused = vd->vdev_spa;
 
 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
 		return;
 
 	if (vd->vdev_ops->vdev_op_remap != NULL) {
 		vd->vdev_ops->vdev_op_remap(vd, offset, size,
 		    metaslab_check_free_impl_cb, NULL);
 		return;
 	}
 
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	mutex_enter(&msp->ms_lock);
 	if (msp->ms_loaded) {
 		range_tree_verify_not_present(msp->ms_allocatable,
 		    offset, size);
 	}
 
 	/*
 	 * Check all segments that currently exist in the freeing pipeline.
 	 *
 	 * It would intuitively make sense to also check the current allocating
 	 * tree since metaslab_unalloc_dva() exists for extents that are
 	 * allocated and freed in the same sync pass within the same txg.
 	 * Unfortunately there are places (e.g. the ZIL) where we allocate a
 	 * segment but then we free part of it within the same txg
 	 * [see zil_sync()]. Thus, we don't call range_tree_verify() in the
 	 * current allocating tree.
 	 */
 	range_tree_verify_not_present(msp->ms_freeing, offset, size);
 	range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
 	range_tree_verify_not_present(msp->ms_freed, offset, size);
 	for (int j = 0; j < TXG_DEFER_SIZE; j++)
 		range_tree_verify_not_present(msp->ms_defer[j], offset, size);
 	range_tree_verify_not_present(msp->ms_trim, offset, size);
 	mutex_exit(&msp->ms_lock);
 }
 
 void
 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
 {
 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
 		return;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
 		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
 		vdev_t *vd = vdev_lookup_top(spa, vdev);
 		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
 		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
 
 		if (DVA_GET_GANG(&bp->blk_dva[i]))
 			size = vdev_gang_header_asize(vd);
 
 		ASSERT3P(vd, !=, NULL);
 
 		metaslab_check_free_impl(vd, offset, size);
 	}
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 }
 
 static void
 metaslab_group_disable_wait(metaslab_group_t *mg)
 {
 	ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
 	while (mg->mg_disabled_updating) {
 		cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
 	}
 }
 
 static void
 metaslab_group_disabled_increment(metaslab_group_t *mg)
 {
 	ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
 	ASSERT(mg->mg_disabled_updating);
 
 	while (mg->mg_ms_disabled >= max_disabled_ms) {
 		cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
 	}
 	mg->mg_ms_disabled++;
 	ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
 }
 
 /*
  * Mark the metaslab as disabled to prevent any allocations on this metaslab.
  * We must also track how many metaslabs are currently disabled within a
  * metaslab group and limit them to prevent allocation failures from
  * occurring because all metaslabs are disabled.
  */
 void
 metaslab_disable(metaslab_t *msp)
 {
 	ASSERT(!MUTEX_HELD(&msp->ms_lock));
 	metaslab_group_t *mg = msp->ms_group;
 
 	mutex_enter(&mg->mg_ms_disabled_lock);
 
 	/*
 	 * To keep an accurate count of how many threads have disabled
 	 * a specific metaslab group, we only allow one thread to mark
 	 * the metaslab group at a time. This ensures that the value of
 	 * ms_disabled will be accurate when we decide to mark a metaslab
 	 * group as disabled. To do this we force all other threads
 	 * to wait till the metaslab's mg_disabled_updating flag is no
 	 * longer set.
 	 */
 	metaslab_group_disable_wait(mg);
 	mg->mg_disabled_updating = B_TRUE;
 	if (msp->ms_disabled == 0) {
 		metaslab_group_disabled_increment(mg);
 	}
 	mutex_enter(&msp->ms_lock);
 	msp->ms_disabled++;
 	mutex_exit(&msp->ms_lock);
 
 	mg->mg_disabled_updating = B_FALSE;
 	cv_broadcast(&mg->mg_ms_disabled_cv);
 	mutex_exit(&mg->mg_ms_disabled_lock);
 }
 
 void
 metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	spa_t *spa = mg->mg_vd->vdev_spa;
 
 	/*
 	 * Wait for the outstanding IO to be synced to prevent newly
 	 * allocated blocks from being overwritten.  This used by
 	 * initialize and TRIM which are modifying unallocated space.
 	 */
 	if (sync)
 		txg_wait_synced(spa_get_dsl(spa), 0);
 
 	mutex_enter(&mg->mg_ms_disabled_lock);
 	mutex_enter(&msp->ms_lock);
 	if (--msp->ms_disabled == 0) {
 		mg->mg_ms_disabled--;
 		cv_broadcast(&mg->mg_ms_disabled_cv);
 		if (unload)
 			metaslab_unload(msp);
 	}
 	mutex_exit(&msp->ms_lock);
 	mutex_exit(&mg->mg_ms_disabled_lock);
 }
 
 void
 metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty)
 {
 	ms->ms_unflushed_dirty = dirty;
 }
 
 static void
 metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
 {
 	vdev_t *vd = ms->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa_meta_objset(spa);
 
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 	metaslab_unflushed_phys_t entry = {
 		.msp_unflushed_txg = metaslab_unflushed_txg(ms),
 	};
 	uint64_t entry_size = sizeof (entry);
 	uint64_t entry_offset = ms->ms_id * entry_size;
 
 	uint64_t object = 0;
 	int err = zap_lookup(mos, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
 	    &object);
 	if (err == ENOENT) {
 		object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA,
 		    SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
 		VERIFY0(zap_add(mos, vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
 		    &object, tx));
 	} else {
 		VERIFY0(err);
 	}
 
 	dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size,
 	    &entry, tx);
 }
 
 void
 metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
 {
 	ms->ms_unflushed_txg = txg;
 	metaslab_update_ondisk_flush_data(ms, tx);
 }
 
 boolean_t
 metaslab_unflushed_dirty(metaslab_t *ms)
 {
 	return (ms->ms_unflushed_dirty);
 }
 
 uint64_t
 metaslab_unflushed_txg(metaslab_t *ms)
 {
 	return (ms->ms_unflushed_txg);
 }
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, U64, ZMOD_RW,
 	"Allocation granularity (a.k.a. stripe size)");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW,
 	"Load all metaslabs when pool is first opened");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW,
 	"Prevent metaslabs from being unloaded");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW,
 	"Preload potential metaslabs during reassessment");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_limit, UINT, ZMOD_RW,
 	"Max number of metaslabs per group to preload");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW,
 	"Delay in txgs after metaslab was last used before unloading");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, UINT, ZMOD_RW,
 	"Delay in milliseconds after metaslab was last used before unloading");
 
 /* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, UINT, ZMOD_RW,
 	"Percentage of metaslab group size that should be free to make it "
 	"eligible for allocation");
 
 ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, UINT, ZMOD_RW,
 	"Percentage of metaslab group size that should be considered eligible "
 	"for allocations unless all metaslab groups within the metaslab class "
 	"have also crossed this threshold");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT,
 	ZMOD_RW,
 	"Use the fragmentation metric to prefer less fragmented metaslabs");
 /* END CSTYLED */
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, UINT,
 	ZMOD_RW, "Fragmentation for metaslab to allow allocation");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW,
 	"Prefer metaslabs with lower LBAs");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW,
 	"Enable metaslab group biasing");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT,
 	ZMOD_RW, "Enable segment-based metaslab selection");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW,
 	"Segment-based metaslab selection maximum buckets before switching");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW,
 	"Blocks larger than this size are sometimes forced to be gang blocks");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW,
 	"Percentage of large blocks that will be forced to be gang blocks");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW,
 	"Max distance (bytes) to search forward before using size tree");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW,
 	"When looking in size tree, use largest segment instead of exact fit");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, U64,
 	ZMOD_RW, "How long to trust the cached max chunk size of a metaslab");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, UINT, ZMOD_RW,
 	"Percentage of memory that can be used to store metaslab range trees");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT,
 	ZMOD_RW, "Try hard to allocate before ganging");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW,
 	"Normally only consider this many of the best metaslabs in each vdev");
 
 /* BEGIN CSTYLED */
 ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_allocator,
 	param_set_active_allocator, param_get_charp, ZMOD_RW,
 	"SPA active allocator");
 /* END CSTYLED */
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index b144d0652930..30c528a53049 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1,10824 +1,10825 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2018, Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2016 Toomas Soome <tsoome@me.com>
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
  * Copyright 2018 Joyent, Inc.
  * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
  * Copyright 2017 Joyent, Inc.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
  * Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
  */
 
 /*
  * SPA: Storage Pool Allocator
  *
  * This file contains all the routines used when modifying on-disk SPA state.
  * This includes opening, importing, destroying, exporting a pool, and syncing a
  * pool.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/zap.h>
 #include <sys/zil.h>
 #include <sys/brt.h>
 #include <sys/ddt.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_removal.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/vdev_indirect_births.h>
 #include <sys/vdev_initialize.h>
 #include <sys/vdev_rebuild.h>
 #include <sys/vdev_trim.h>
 #include <sys/vdev_disk.h>
 #include <sys/vdev_raidz.h>
 #include <sys/vdev_draid.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/mmp.h>
 #include <sys/uberblock_impl.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
 #include <sys/bpobj.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dmu_objset.h>
 #include <sys/unique.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/fs/zfs.h>
 #include <sys/arc.h>
 #include <sys/callb.h>
 #include <sys/systeminfo.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/dsl_scan.h>
 #include <sys/zfeature.h>
 #include <sys/dsl_destroy.h>
 #include <sys/zvol.h>
 
 #ifdef	_KERNEL
 #include <sys/fm/protocol.h>
 #include <sys/fm/util.h>
 #include <sys/callb.h>
 #include <sys/zone.h>
 #include <sys/vmsystm.h>
 #endif	/* _KERNEL */
 
 #include "zfs_prop.h"
 #include "zfs_comutil.h"
 #include <cityhash.h>
 
 /*
  * spa_thread() existed on Illumos as a parent thread for the various worker
  * threads that actually run the pool, as a way to both reference the entire
  * pool work as a single object, and to share properties like scheduling
  * options. It has not yet been adapted to Linux or FreeBSD. This define is
  * used to mark related parts of the code to make things easier for the reader,
  * and to compile this code out. It can be removed when someone implements it,
  * moves it to some Illumos-specific place, or removes it entirely.
  */
 #undef HAVE_SPA_THREAD
 
 /*
  * The "System Duty Cycle" scheduling class is an Illumos feature to help
  * prevent CPU-intensive kernel threads from affecting latency on interactive
  * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is
  * gated behind a define. On Illumos SDC depends on spa_thread(), but
  * spa_thread() also has other uses, so this is a separate define.
  */
 #undef HAVE_SYSDC
 
 /*
  * The interval, in seconds, at which failed configuration cache file writes
  * should be retried.
  */
 int zfs_ccw_retry_interval = 300;
 
 typedef enum zti_modes {
 	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
 	ZTI_MODE_SCALE,			/* Taskqs scale with CPUs. */
 	ZTI_MODE_SYNC,			/* sync thread assigned */
 	ZTI_MODE_NULL,			/* don't create a taskq */
 	ZTI_NMODES
 } zti_modes_t;
 
 #define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
 #define	ZTI_PCT(n)	{ ZTI_MODE_ONLINE_PERCENT, (n), 1 }
 #define	ZTI_SCALE	{ ZTI_MODE_SCALE, 0, 1 }
 #define	ZTI_SYNC	{ ZTI_MODE_SYNC, 0, 1 }
 #define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }
 
 #define	ZTI_N(n)	ZTI_P(n, 1)
 #define	ZTI_ONE		ZTI_N(1)
 
 typedef struct zio_taskq_info {
 	zti_modes_t zti_mode;
 	uint_t zti_value;
 	uint_t zti_count;
 } zio_taskq_info_t;
 
 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
 	"iss", "iss_h", "int", "int_h"
 };
 
 /*
  * This table defines the taskq settings for each ZFS I/O type. When
  * initializing a pool, we use this table to create an appropriately sized
  * taskq. Some operations are low volume and therefore have a small, static
  * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
  * macros. Other operations process a large amount of data; the ZTI_SCALE
  * macro causes us to create a taskq oriented for throughput. Some operations
  * are so high frequency and short-lived that the taskq itself can become a
  * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
  * additional degree of parallelism specified by the number of threads per-
  * taskq and the number of taskqs; when dispatching an event in this case, the
  * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs
  * that scales with the number of CPUs.
  *
  * The different taskq priorities are to handle the different contexts (issue
  * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
  * need to be handled with minimum delay.
  */
 static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
 	{ ZTI_N(8),	ZTI_NULL,	ZTI_SCALE,	ZTI_NULL }, /* READ */
 	{ ZTI_SYNC,	ZTI_N(5),	ZTI_SCALE,	ZTI_N(5) }, /* WRITE */
 	{ ZTI_SCALE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
 	{ ZTI_N(4),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* TRIM */
 };
 
 static void spa_sync_version(void *arg, dmu_tx_t *tx);
 static void spa_sync_props(void *arg, dmu_tx_t *tx);
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 static int spa_load_impl(spa_t *spa, spa_import_type_t type,
     const char **ereport);
 static void spa_vdev_resilver_done(spa_t *spa);
 
 /*
  * Percentage of all CPUs that can be used by the metaslab preload taskq.
  */
 static uint_t metaslab_preload_pct = 50;
 
 static uint_t	zio_taskq_batch_pct = 80;	  /* 1 thread per cpu in pset */
 static uint_t	zio_taskq_batch_tpq;		  /* threads per taskq */
 
 #ifdef HAVE_SYSDC
 static const boolean_t	zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
 static const uint_t	zio_taskq_basedc = 80;	  /* base duty cycle */
 #endif
 
 #ifdef HAVE_SPA_THREAD
 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
 #endif
 
 static uint_t	zio_taskq_wr_iss_ncpus = 0;
 
 /*
  * Report any spa_load_verify errors found, but do not fail spa_load.
  * This is used by zdb to analyze non-idle pools.
  */
 boolean_t	spa_load_verify_dryrun = B_FALSE;
 
 /*
  * Allow read spacemaps in case of readonly import (spa_mode == SPA_MODE_READ).
  * This is used by zdb for spacemaps verification.
  */
 boolean_t	spa_mode_readable_spacemaps = B_FALSE;
 
 /*
  * This (illegal) pool name is used when temporarily importing a spa_t in order
  * to get the vdev stats associated with the imported devices.
  */
 #define	TRYIMPORT_NAME	"$import"
 
 /*
  * For debugging purposes: print out vdev tree during pool import.
  */
 static int		spa_load_print_vdev_tree = B_FALSE;
 
 /*
  * A non-zero value for zfs_max_missing_tvds means that we allow importing
  * pools with missing top-level vdevs. This is strictly intended for advanced
  * pool recovery cases since missing data is almost inevitable. Pools with
  * missing devices can only be imported read-only for safety reasons, and their
  * fail-mode will be automatically set to "continue".
  *
  * With 1 missing vdev we should be able to import the pool and mount all
  * datasets. User data that was not modified after the missing device has been
  * added should be recoverable. This means that snapshots created prior to the
  * addition of that device should be completely intact.
  *
  * With 2 missing vdevs, some datasets may fail to mount since there are
  * dataset statistics that are stored as regular metadata. Some data might be
  * recoverable if those vdevs were added recently.
  *
  * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
  * may be missing entirely. Chances of data recovery are very low. Note that
  * there are also risks of performing an inadvertent rewind as we might be
  * missing all the vdevs with the latest uberblocks.
  */
 uint64_t	zfs_max_missing_tvds = 0;
 
 /*
  * The parameters below are similar to zfs_max_missing_tvds but are only
  * intended for a preliminary open of the pool with an untrusted config which
  * might be incomplete or out-dated.
  *
  * We are more tolerant for pools opened from a cachefile since we could have
  * an out-dated cachefile where a device removal was not registered.
  * We could have set the limit arbitrarily high but in the case where devices
  * are really missing we would want to return the proper error codes; we chose
  * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
  * and we get a chance to retrieve the trusted config.
  */
 uint64_t	zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
 
 /*
  * In the case where config was assembled by scanning device paths (/dev/dsks
  * by default) we are less tolerant since all the existing devices should have
  * been detected and we want spa_load to return the right error codes.
  */
 uint64_t	zfs_max_missing_tvds_scan = 0;
 
 /*
  * Debugging aid that pauses spa_sync() towards the end.
  */
 static const boolean_t	zfs_pause_spa_sync = B_FALSE;
 
 /*
  * Variables to indicate the livelist condense zthr func should wait at certain
  * points for the livelist to be removed - used to test condense/destroy races
  */
 static int zfs_livelist_condense_zthr_pause = 0;
 static int zfs_livelist_condense_sync_pause = 0;
 
 /*
  * Variables to track whether or not condense cancellation has been
  * triggered in testing.
  */
 static int zfs_livelist_condense_sync_cancel = 0;
 static int zfs_livelist_condense_zthr_cancel = 0;
 
 /*
  * Variable to track whether or not extra ALLOC blkptrs were added to a
  * livelist entry while it was being condensed (caused by the way we track
  * remapped blkptrs in dbuf_remap_impl)
  */
 static int zfs_livelist_condense_new_alloc = 0;
 
 /*
  * ==========================================================================
  * SPA properties routines
  * ==========================================================================
  */
 
 /*
  * Add a (source=src, propname=propval) list to an nvlist.
  */
 static void
 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval,
     uint64_t intval, zprop_source_t src)
 {
 	const char *propname = zpool_prop_to_name(prop);
 	nvlist_t *propval;
 
 	propval = fnvlist_alloc();
 	fnvlist_add_uint64(propval, ZPROP_SOURCE, src);
 
 	if (strval != NULL)
 		fnvlist_add_string(propval, ZPROP_VALUE, strval);
 	else
 		fnvlist_add_uint64(propval, ZPROP_VALUE, intval);
 
 	fnvlist_add_nvlist(nvl, propname, propval);
 	nvlist_free(propval);
 }
 
 /*
  * Add a user property (source=src, propname=propval) to an nvlist.
  */
 static void
 spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval,
     zprop_source_t src)
 {
 	nvlist_t *propval;
 
 	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
 	VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
 	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
 	nvlist_free(propval);
 }
 
 /*
  * Get property values from the spa configuration.
  */
 static void
 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	dsl_pool_t *pool = spa->spa_dsl_pool;
 	uint64_t size, alloc, cap, version;
 	const zprop_source_t src = ZPROP_SRC_NONE;
 	spa_config_dirent_t *dp;
 	metaslab_class_t *mc = spa_normal_class(spa);
 
 	ASSERT(MUTEX_HELD(&spa->spa_props_lock));
 
 	if (rvd != NULL) {
 		alloc = metaslab_class_get_alloc(mc);
 		alloc += metaslab_class_get_alloc(spa_special_class(spa));
 		alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
 		alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa));
 
 		size = metaslab_class_get_space(mc);
 		size += metaslab_class_get_space(spa_special_class(spa));
 		size += metaslab_class_get_space(spa_dedup_class(spa));
 		size += metaslab_class_get_space(spa_embedded_log_class(spa));
 
 		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
 		    size - alloc, src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL,
 		    spa->spa_checkpoint_info.sci_dspace, src);
 
 		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
 		    metaslab_class_fragmentation(mc), src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
 		    metaslab_class_expandable_space(mc), src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
 		    (spa_mode(spa) == SPA_MODE_READ), src);
 
 		cap = (size == 0) ? 0 : (alloc * 100 / size);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
 
 		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
 		    ddt_get_pool_dedup_ratio(spa), src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONEUSED, NULL,
 		    brt_get_used(spa), src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONESAVED, NULL,
 		    brt_get_saved(spa), src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL,
 		    brt_get_ratio(spa), src);
 
 		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
 		    rvd->vdev_state, src);
 
 		version = spa_version(spa);
 		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
 			    version, ZPROP_SRC_DEFAULT);
 		} else {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
 			    version, ZPROP_SRC_LOCAL);
 		}
 		spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID,
 		    NULL, spa_load_guid(spa), src);
 	}
 
 	if (pool != NULL) {
 		/*
 		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
 		 * when opening pools before this version freedir will be NULL.
 		 */
 		if (pool->dp_free_dir != NULL) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
 			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
 			    src);
 		} else {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
 			    NULL, 0, src);
 		}
 
 		if (pool->dp_leak_dir != NULL) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
 			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
 			    src);
 		} else {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
 			    NULL, 0, src);
 		}
 	}
 
 	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
 
 	if (spa->spa_comment != NULL) {
 		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
 		    0, ZPROP_SRC_LOCAL);
 	}
 
 	if (spa->spa_compatibility != NULL) {
 		spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY,
 		    spa->spa_compatibility, 0, ZPROP_SRC_LOCAL);
 	}
 
 	if (spa->spa_root != NULL)
 		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
 		    0, ZPROP_SRC_LOCAL);
 
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
 		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
 		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
 	} else {
 		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
 		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
 	}
 
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) {
 		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
 		    DNODE_MAX_SIZE, ZPROP_SRC_NONE);
 	} else {
 		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
 		    DNODE_MIN_SIZE, ZPROP_SRC_NONE);
 	}
 
 	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
 		if (dp->scd_path == NULL) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 			    "none", 0, ZPROP_SRC_LOCAL);
 		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
 		}
 	}
 }
 
 /*
  * Get zpool property values.
  */
 int
 spa_prop_get(spa_t *spa, nvlist_t **nvp)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	dsl_pool_t *dp;
 	int err;
 
 	err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP);
 	if (err)
 		return (err);
 
 	dp = spa_get_dsl(spa);
 	dsl_pool_config_enter(dp, FTAG);
 	mutex_enter(&spa->spa_props_lock);
 
 	/*
 	 * Get properties from the spa config.
 	 */
 	spa_prop_get_config(spa, nvp);
 
 	/* If no pool property object, no more prop to get. */
 	if (mos == NULL || spa->spa_pool_props_object == 0)
 		goto out;
 
 	/*
 	 * Get properties from the MOS pool property object.
 	 */
 	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
 	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
 	    zap_cursor_advance(&zc)) {
 		uint64_t intval = 0;
 		char *strval = NULL;
 		zprop_source_t src = ZPROP_SRC_DEFAULT;
 		zpool_prop_t prop;
 
 		if ((prop = zpool_name_to_prop(za.za_name)) ==
 		    ZPOOL_PROP_INVAL && !zfs_prop_user(za.za_name))
 			continue;
 
 		switch (za.za_integer_length) {
 		case 8:
 			/* integer property */
 			if (za.za_first_integer !=
 			    zpool_prop_default_numeric(prop))
 				src = ZPROP_SRC_LOCAL;
 
 			if (prop == ZPOOL_PROP_BOOTFS) {
 				dsl_dataset_t *ds = NULL;
 
 				err = dsl_dataset_hold_obj(dp,
 				    za.za_first_integer, FTAG, &ds);
 				if (err != 0)
 					break;
 
 				strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
 				    KM_SLEEP);
 				dsl_dataset_name(ds, strval);
 				dsl_dataset_rele(ds, FTAG);
 			} else {
 				strval = NULL;
 				intval = za.za_first_integer;
 			}
 
 			spa_prop_add_list(*nvp, prop, strval, intval, src);
 
 			if (strval != NULL)
 				kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
 
 			break;
 
 		case 1:
 			/* string property */
 			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
 			err = zap_lookup(mos, spa->spa_pool_props_object,
 			    za.za_name, 1, za.za_num_integers, strval);
 			if (err) {
 				kmem_free(strval, za.za_num_integers);
 				break;
 			}
 			if (prop != ZPOOL_PROP_INVAL) {
 				spa_prop_add_list(*nvp, prop, strval, 0, src);
 			} else {
 				src = ZPROP_SRC_LOCAL;
 				spa_prop_add_user(*nvp, za.za_name, strval,
 				    src);
 			}
 			kmem_free(strval, za.za_num_integers);
 			break;
 
 		default:
 			break;
 		}
 	}
 	zap_cursor_fini(&zc);
 out:
 	mutex_exit(&spa->spa_props_lock);
 	dsl_pool_config_exit(dp, FTAG);
 	if (err && err != ENOENT) {
 		nvlist_free(*nvp);
 		*nvp = NULL;
 		return (err);
 	}
 
 	return (0);
 }
 
 /*
  * Validate the given pool properties nvlist and modify the list
  * for the property values to be set.
  */
 static int
 spa_prop_validate(spa_t *spa, nvlist_t *props)
 {
 	nvpair_t *elem;
 	int error = 0, reset_bootfs = 0;
 	uint64_t objnum = 0;
 	boolean_t has_feature = B_FALSE;
 
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
 		uint64_t intval;
 		const char *strval, *slash, *check, *fname;
 		const char *propname = nvpair_name(elem);
 		zpool_prop_t prop = zpool_name_to_prop(propname);
 
 		switch (prop) {
 		case ZPOOL_PROP_INVAL:
 			/*
 			 * Sanitize the input.
 			 */
 			if (zfs_prop_user(propname)) {
 				if (strlen(propname) >= ZAP_MAXNAMELEN) {
 					error = SET_ERROR(ENAMETOOLONG);
 					break;
 				}
 
 				if (strlen(fnvpair_value_string(elem)) >=
 				    ZAP_MAXVALUELEN) {
 					error = SET_ERROR(E2BIG);
 					break;
 				}
 			} else if (zpool_prop_feature(propname)) {
 				if (nvpair_type(elem) != DATA_TYPE_UINT64) {
 					error = SET_ERROR(EINVAL);
 					break;
 				}
 
 				if (nvpair_value_uint64(elem, &intval) != 0) {
 					error = SET_ERROR(EINVAL);
 					break;
 				}
 
 				if (intval != 0) {
 					error = SET_ERROR(EINVAL);
 					break;
 				}
 
 				fname = strchr(propname, '@') + 1;
 				if (zfeature_lookup_name(fname, NULL) != 0) {
 					error = SET_ERROR(EINVAL);
 					break;
 				}
 
 				has_feature = B_TRUE;
 			} else {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 			break;
 
 		case ZPOOL_PROP_VERSION:
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error &&
 			    (intval < spa_version(spa) ||
 			    intval > SPA_VERSION_BEFORE_FEATURES ||
 			    has_feature))
 				error = SET_ERROR(EINVAL);
 			break;
 
 		case ZPOOL_PROP_DELEGATION:
 		case ZPOOL_PROP_AUTOREPLACE:
 		case ZPOOL_PROP_LISTSNAPS:
 		case ZPOOL_PROP_AUTOEXPAND:
 		case ZPOOL_PROP_AUTOTRIM:
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error && intval > 1)
 				error = SET_ERROR(EINVAL);
 			break;
 
 		case ZPOOL_PROP_MULTIHOST:
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error && intval > 1)
 				error = SET_ERROR(EINVAL);
 
 			if (!error) {
 				uint32_t hostid = zone_get_hostid(NULL);
 				if (hostid)
 					spa->spa_hostid = hostid;
 				else
 					error = SET_ERROR(ENOTSUP);
 			}
 
 			break;
 
 		case ZPOOL_PROP_BOOTFS:
 			/*
 			 * If the pool version is less than SPA_VERSION_BOOTFS,
 			 * or the pool is still being created (version == 0),
 			 * the bootfs property cannot be set.
 			 */
 			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
 				error = SET_ERROR(ENOTSUP);
 				break;
 			}
 
 			/*
 			 * Make sure the vdev config is bootable
 			 */
 			if (!vdev_is_bootable(spa->spa_root_vdev)) {
 				error = SET_ERROR(ENOTSUP);
 				break;
 			}
 
 			reset_bootfs = 1;
 
 			error = nvpair_value_string(elem, &strval);
 
 			if (!error) {
 				objset_t *os;
 
 				if (strval == NULL || strval[0] == '\0') {
 					objnum = zpool_prop_default_numeric(
 					    ZPOOL_PROP_BOOTFS);
 					break;
 				}
 
 				error = dmu_objset_hold(strval, FTAG, &os);
 				if (error != 0)
 					break;
 
 				/* Must be ZPL. */
 				if (dmu_objset_type(os) != DMU_OST_ZFS) {
 					error = SET_ERROR(ENOTSUP);
 				} else {
 					objnum = dmu_objset_id(os);
 				}
 				dmu_objset_rele(os, FTAG);
 			}
 			break;
 
 		case ZPOOL_PROP_FAILUREMODE:
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error && intval > ZIO_FAILURE_MODE_PANIC)
 				error = SET_ERROR(EINVAL);
 
 			/*
 			 * This is a special case which only occurs when
 			 * the pool has completely failed. This allows
 			 * the user to change the in-core failmode property
 			 * without syncing it out to disk (I/Os might
 			 * currently be blocked). We do this by returning
 			 * EIO to the caller (spa_prop_set) to trick it
 			 * into thinking we encountered a property validation
 			 * error.
 			 */
 			if (!error && spa_suspended(spa)) {
 				spa->spa_failmode = intval;
 				error = SET_ERROR(EIO);
 			}
 			break;
 
 		case ZPOOL_PROP_CACHEFILE:
 			if ((error = nvpair_value_string(elem, &strval)) != 0)
 				break;
 
 			if (strval[0] == '\0')
 				break;
 
 			if (strcmp(strval, "none") == 0)
 				break;
 
 			if (strval[0] != '/') {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 
 			slash = strrchr(strval, '/');
 			ASSERT(slash != NULL);
 
 			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
 			    strcmp(slash, "/..") == 0)
 				error = SET_ERROR(EINVAL);
 			break;
 
 		case ZPOOL_PROP_COMMENT:
 			if ((error = nvpair_value_string(elem, &strval)) != 0)
 				break;
 			for (check = strval; *check != '\0'; check++) {
 				if (!isprint(*check)) {
 					error = SET_ERROR(EINVAL);
 					break;
 				}
 			}
 			if (strlen(strval) > ZPROP_MAX_COMMENT)
 				error = SET_ERROR(E2BIG);
 			break;
 
 		default:
 			break;
 		}
 
 		if (error)
 			break;
 	}
 
 	(void) nvlist_remove_all(props,
 	    zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO));
 
 	if (!error && reset_bootfs) {
 		error = nvlist_remove(props,
 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
 
 		if (!error) {
 			error = nvlist_add_uint64(props,
 			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
 		}
 	}
 
 	return (error);
 }
 
 void
 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
 {
 	const char *cachefile;
 	spa_config_dirent_t *dp;
 
 	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
 	    &cachefile) != 0)
 		return;
 
 	dp = kmem_alloc(sizeof (spa_config_dirent_t),
 	    KM_SLEEP);
 
 	if (cachefile[0] == '\0')
 		dp->scd_path = spa_strdup(spa_config_path);
 	else if (strcmp(cachefile, "none") == 0)
 		dp->scd_path = NULL;
 	else
 		dp->scd_path = spa_strdup(cachefile);
 
 	list_insert_head(&spa->spa_config_list, dp);
 	if (need_sync)
 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 }
 
 int
 spa_prop_set(spa_t *spa, nvlist_t *nvp)
 {
 	int error;
 	nvpair_t *elem = NULL;
 	boolean_t need_sync = B_FALSE;
 
 	if ((error = spa_prop_validate(spa, nvp)) != 0)
 		return (error);
 
 	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
 		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
 
 		if (prop == ZPOOL_PROP_CACHEFILE ||
 		    prop == ZPOOL_PROP_ALTROOT ||
 		    prop == ZPOOL_PROP_READONLY)
 			continue;
 
 		if (prop == ZPOOL_PROP_INVAL &&
 		    zfs_prop_user(nvpair_name(elem))) {
 			need_sync = B_TRUE;
 			break;
 		}
 
 		if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
 			uint64_t ver = 0;
 
 			if (prop == ZPOOL_PROP_VERSION) {
 				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
 			} else {
 				ASSERT(zpool_prop_feature(nvpair_name(elem)));
 				ver = SPA_VERSION_FEATURES;
 				need_sync = B_TRUE;
 			}
 
 			/* Save time if the version is already set. */
 			if (ver == spa_version(spa))
 				continue;
 
 			/*
 			 * In addition to the pool directory object, we might
 			 * create the pool properties object, the features for
 			 * read object, the features for write object, or the
 			 * feature descriptions object.
 			 */
 			error = dsl_sync_task(spa->spa_name, NULL,
 			    spa_sync_version, &ver,
 			    6, ZFS_SPACE_CHECK_RESERVED);
 			if (error)
 				return (error);
 			continue;
 		}
 
 		need_sync = B_TRUE;
 		break;
 	}
 
 	if (need_sync) {
 		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
 		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
 	}
 
 	return (0);
 }
 
 /*
  * If the bootfs property value is dsobj, clear it.
  */
 void
 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
 {
 	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
 		VERIFY(zap_remove(spa->spa_meta_objset,
 		    spa->spa_pool_props_object,
 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
 		spa->spa_bootfs = 0;
 	}
 }
 
 static int
 spa_change_guid_check(void *arg, dmu_tx_t *tx)
 {
 	uint64_t *newguid __maybe_unused = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t vdev_state;
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 		int error = (spa_has_checkpoint(spa)) ?
 		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
 		return (SET_ERROR(error));
 	}
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	vdev_state = rvd->vdev_state;
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	if (vdev_state != VDEV_STATE_HEALTHY)
 		return (SET_ERROR(ENXIO));
 
 	ASSERT3U(spa_guid(spa), !=, *newguid);
 
 	return (0);
 }
 
 static void
 spa_change_guid_sync(void *arg, dmu_tx_t *tx)
 {
 	uint64_t *newguid = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	uint64_t oldguid;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	oldguid = spa_guid(spa);
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	rvd->vdev_guid = *newguid;
 	rvd->vdev_guid_sum += (*newguid - oldguid);
 	vdev_config_dirty(rvd);
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
 	    (u_longlong_t)oldguid, (u_longlong_t)*newguid);
 }
 
 /*
  * Change the GUID for the pool.  This is done so that we can later
  * re-import a pool built from a clone of our own vdevs.  We will modify
  * the root vdev's guid, our own pool guid, and then mark all of our
  * vdevs dirty.  Note that we must make sure that all our vdevs are
  * online when we do this, or else any vdevs that weren't present
  * would be orphaned from our pool.  We are also going to issue a
  * sysevent to update any watchers.
  */
 int
 spa_change_guid(spa_t *spa)
 {
 	int error;
 	uint64_t guid;
 
 	mutex_enter(&spa->spa_vdev_top_lock);
 	mutex_enter(&spa_namespace_lock);
 	guid = spa_generate_guid(NULL);
 
 	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
 	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
 
 	if (error == 0) {
 		/*
 		 * Clear the kobj flag from all the vdevs to allow
 		 * vdev_cache_process_kobj_evt() to post events to all the
 		 * vdevs since GUID is updated.
 		 */
 		vdev_clear_kobj_evt(spa->spa_root_vdev);
 		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
 			vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]);
 
 		spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
 	}
 
 	mutex_exit(&spa_namespace_lock);
 	mutex_exit(&spa->spa_vdev_top_lock);
 
 	return (error);
 }
 
 /*
  * ==========================================================================
  * SPA state manipulation (open/create/destroy/import/export)
  * ==========================================================================
  */
 
 static int
 spa_error_entry_compare(const void *a, const void *b)
 {
 	const spa_error_entry_t *sa = (const spa_error_entry_t *)a;
 	const spa_error_entry_t *sb = (const spa_error_entry_t *)b;
 	int ret;
 
 	ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
 	    sizeof (zbookmark_phys_t));
 
 	return (TREE_ISIGN(ret));
 }
 
 /*
  * Utility function which retrieves copies of the current logs and
  * re-initializes them in the process.
  */
 void
 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
 {
 	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
 
 	memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t));
 	memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t));
 
 	avl_create(&spa->spa_errlist_scrub,
 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
 	    offsetof(spa_error_entry_t, se_avl));
 	avl_create(&spa->spa_errlist_last,
 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
 	    offsetof(spa_error_entry_t, se_avl));
 }
 
 static void
 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 {
 	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
 	enum zti_modes mode = ztip->zti_mode;
 	uint_t value = ztip->zti_value;
 	uint_t count = ztip->zti_count;
 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 	uint_t cpus, flags = TASKQ_DYNAMIC;
 
 	switch (mode) {
 	case ZTI_MODE_FIXED:
 		ASSERT3U(value, >, 0);
 		break;
 
 	case ZTI_MODE_SYNC:
 
 		/*
 		 * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus',
 		 * not to exceed the number of spa allocators.
 		 */
 		if (zio_taskq_wr_iss_ncpus == 0) {
 			count = MAX(boot_ncpus / spa->spa_alloc_count, 1);
 		} else {
 			count = MAX(1,
 			    boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus));
 		}
 		count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
 		count = MIN(count, spa->spa_alloc_count);
 
 		/*
 		 * zio_taskq_batch_pct is unbounded and may exceed 100%, but no
 		 * single taskq may have more threads than 100% of online cpus.
 		 */
 		value = (zio_taskq_batch_pct + count / 2) / count;
 		value = MIN(value, 100);
 		flags |= TASKQ_THREADS_CPU_PCT;
 		break;
 
 	case ZTI_MODE_SCALE:
 		flags |= TASKQ_THREADS_CPU_PCT;
 		/*
 		 * We want more taskqs to reduce lock contention, but we want
 		 * less for better request ordering and CPU utilization.
 		 */
 		cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
 		if (zio_taskq_batch_tpq > 0) {
 			count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) /
 			    zio_taskq_batch_tpq);
 		} else {
 			/*
 			 * Prefer 6 threads per taskq, but no more taskqs
 			 * than threads in them on large systems. For 80%:
 			 *
 			 *                 taskq   taskq   total
 			 * cpus    taskqs  percent threads threads
 			 * ------- ------- ------- ------- -------
 			 * 1       1       80%     1       1
 			 * 2       1       80%     1       1
 			 * 4       1       80%     3       3
 			 * 8       2       40%     3       6
 			 * 16      3       27%     4       12
 			 * 32      5       16%     5       25
 			 * 64      7       11%     7       49
 			 * 128     10      8%      10      100
 			 * 256     14      6%      15      210
 			 */
 			count = 1 + cpus / 6;
 			while (count * count > cpus)
 				count--;
 		}
 		/* Limit each taskq within 100% to not trigger assertion. */
 		count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
 		value = (zio_taskq_batch_pct + count / 2) / count;
 		break;
 
 	case ZTI_MODE_NULL:
 		tqs->stqs_count = 0;
 		tqs->stqs_taskq = NULL;
 		return;
 
 	default:
 		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
 		    "spa_taskqs_init()",
 		    zio_type_name[t], zio_taskq_types[q], mode, value);
 		break;
 	}
 
 	ASSERT3U(count, >, 0);
 	tqs->stqs_count = count;
 	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
 
 	for (uint_t i = 0; i < count; i++) {
 		taskq_t *tq;
 		char name[32];
 
 		if (count > 1)
 			(void) snprintf(name, sizeof (name), "%s_%s_%u",
 			    zio_type_name[t], zio_taskq_types[q], i);
 		else
 			(void) snprintf(name, sizeof (name), "%s_%s",
 			    zio_type_name[t], zio_taskq_types[q]);
 
 #ifdef HAVE_SYSDC
 		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
 			(void) zio_taskq_basedc;
 			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
 			    spa->spa_proc, zio_taskq_basedc, flags);
 		} else {
 #endif
 			pri_t pri = maxclsyspri;
 			/*
 			 * The write issue taskq can be extremely CPU
 			 * intensive.  Run it at slightly less important
 			 * priority than the other taskqs.
 			 *
 			 * Under Linux and FreeBSD this means incrementing
 			 * the priority value as opposed to platforms like
 			 * illumos where it should be decremented.
 			 *
 			 * On FreeBSD, if priorities divided by four (RQ_PPQ)
 			 * are equal then a difference between them is
 			 * insignificant.
 			 */
 			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) {
 #if defined(__linux__)
 				pri++;
 #elif defined(__FreeBSD__)
 				pri += 4;
 #else
 #error "unknown OS"
 #endif
 			}
 			tq = taskq_create_proc(name, value, pri, 50,
 			    INT_MAX, spa->spa_proc, flags);
 #ifdef HAVE_SYSDC
 		}
 #endif
 
 		tqs->stqs_taskq[i] = tq;
 	}
 }
 
 static void
 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 {
 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 
 	if (tqs->stqs_taskq == NULL) {
 		ASSERT3U(tqs->stqs_count, ==, 0);
 		return;
 	}
 
 	for (uint_t i = 0; i < tqs->stqs_count; i++) {
 		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
 		taskq_destroy(tqs->stqs_taskq[i]);
 	}
 
 	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
 	tqs->stqs_taskq = NULL;
 }
 
 #ifdef _KERNEL
 /*
  * The READ and WRITE rows of zio_taskqs are configurable at module load time
  * by setting zio_taskq_read or zio_taskq_write.
  *
  * Example (the defaults for READ and WRITE)
  *   zio_taskq_read='fixed,1,8 null scale null'
  *   zio_taskq_write='sync fixed,1,5 scale fixed,1,5'
  *
  * Each sets the entire row at a time.
  *
  * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number
  * of threads per taskq.
  *
  * 'null' can only be set on the high-priority queues (queue selection for
  * high-priority queues will fall back to the regular queue if the high-pri
  * is NULL.
  */
 static const char *const modes[ZTI_NMODES] = {
 	"fixed", "scale", "sync", "null"
 };
 
 /* Parse the incoming config string. Modifies cfg */
 static int
 spa_taskq_param_set(zio_type_t t, char *cfg)
 {
 	int err = 0;
 
 	zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}};
 
 	char *next = cfg, *tok, *c;
 
 	/*
 	 * Parse out each element from the string and fill `row`. The entire
 	 * row has to be set at once, so any errors are flagged by just
 	 * breaking out of this loop early.
 	 */
 	uint_t q;
 	for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
 		/* `next` is the start of the config */
 		if (next == NULL)
 			break;
 
 		/* Eat up leading space */
 		while (isspace(*next))
 			next++;
 		if (*next == '\0')
 			break;
 
 		/* Mode ends at space or end of string */
 		tok = next;
 		next = strchr(tok, ' ');
 		if (next != NULL) *next++ = '\0';
 
 		/* Parameters start after a comma */
 		c = strchr(tok, ',');
 		if (c != NULL) *c++ = '\0';
 
 		/* Match mode string */
 		uint_t mode;
 		for (mode = 0; mode < ZTI_NMODES; mode++)
 			if (strcmp(tok, modes[mode]) == 0)
 				break;
 		if (mode == ZTI_NMODES)
 			break;
 
 		/* Invalid canary */
 		row[q].zti_mode = ZTI_NMODES;
 
 		/* Per-mode setup */
 		switch (mode) {
 
 		/*
 		 * FIXED is parameterised: number of queues, and number of
 		 * threads per queue.
 		 */
 		case ZTI_MODE_FIXED: {
 			/* No parameters? */
 			if (c == NULL || *c == '\0')
 				break;
 
 			/* Find next parameter */
 			tok = c;
 			c = strchr(tok, ',');
 			if (c == NULL)
 				break;
 
 			/* Take digits and convert */
 			unsigned long long nq;
 			if (!(isdigit(*tok)))
 				break;
 			err = ddi_strtoull(tok, &tok, 10, &nq);
 			/* Must succeed and also end at the next param sep */
 			if (err != 0 || tok != c)
 				break;
 
 			/* Move past the comma */
 			tok++;
 			/* Need another number */
 			if (!(isdigit(*tok)))
 				break;
 			/* Remember start to make sure we moved */
 			c = tok;
 
 			/* Take digits */
 			unsigned long long ntpq;
 			err = ddi_strtoull(tok, &tok, 10, &ntpq);
 			/* Must succeed, and moved forward */
 			if (err != 0 || tok == c || *tok != '\0')
 				break;
 
 			/*
 			 * sanity; zero queues/threads make no sense, and
 			 * 16K is almost certainly more than anyone will ever
 			 * need and avoids silly numbers like UINT32_MAX
 			 */
 			if (nq == 0 || nq >= 16384 ||
 			    ntpq == 0 || ntpq >= 16384)
 				break;
 
 			const zio_taskq_info_t zti = ZTI_P(ntpq, nq);
 			row[q] = zti;
 			break;
 		}
 
 		case ZTI_MODE_SCALE: {
 			const zio_taskq_info_t zti = ZTI_SCALE;
 			row[q] = zti;
 			break;
 		}
 
 		case ZTI_MODE_SYNC: {
 			const zio_taskq_info_t zti = ZTI_SYNC;
 			row[q] = zti;
 			break;
 		}
 
 		case ZTI_MODE_NULL: {
 			/*
 			 * Can only null the high-priority queues; the general-
 			 * purpose ones have to exist.
 			 */
 			if (q != ZIO_TASKQ_ISSUE_HIGH &&
 			    q != ZIO_TASKQ_INTERRUPT_HIGH)
 				break;
 
 			const zio_taskq_info_t zti = ZTI_NULL;
 			row[q] = zti;
 			break;
 		}
 
 		default:
 			break;
 		}
 
 		/* Ensure we set a mode */
 		if (row[q].zti_mode == ZTI_NMODES)
 			break;
 	}
 
 	/* Didn't get a full row, fail */
 	if (q < ZIO_TASKQ_TYPES)
 		return (SET_ERROR(EINVAL));
 
 	/* Eat trailing space */
 	if (next != NULL)
 		while (isspace(*next))
 			next++;
 
 	/* If there's anything left over then fail */
 	if (next != NULL && *next != '\0')
 		return (SET_ERROR(EINVAL));
 
 	/* Success! Copy it into the real config */
 	for (q = 0; q < ZIO_TASKQ_TYPES; q++)
 		zio_taskqs[t][q] = row[q];
 
 	return (0);
 }
 
 static int
 spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline)
 {
 	int pos = 0;
 
 	/* Build paramater string from live config */
 	const char *sep = "";
 	for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) {
 		const zio_taskq_info_t *zti = &zio_taskqs[t][q];
 		if (zti->zti_mode == ZTI_MODE_FIXED)
 			pos += sprintf(&buf[pos], "%s%s,%u,%u", sep,
 			    modes[zti->zti_mode], zti->zti_count,
 			    zti->zti_value);
 		else
 			pos += sprintf(&buf[pos], "%s%s", sep,
 			    modes[zti->zti_mode]);
 		sep = " ";
 	}
 
 	if (add_newline)
 		buf[pos++] = '\n';
 	buf[pos] = '\0';
 
 	return (pos);
 }
 
 #ifdef __linux__
 static int
 spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp)
 {
 	char *cfg = kmem_strdup(val);
 	int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg);
 	kmem_free(cfg, strlen(val)+1);
 	return (-err);
 }
 static int
 spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp)
 {
 	return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE));
 }
 
 static int
 spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp)
 {
 	char *cfg = kmem_strdup(val);
 	int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg);
 	kmem_free(cfg, strlen(val)+1);
 	return (-err);
 }
 static int
 spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp)
 {
 	return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE));
 }
 #else
 /*
  * On FreeBSD load-time parameters can be set up before malloc() is available,
  * so we have to do all the parsing work on the stack.
  */
 #define	SPA_TASKQ_PARAM_MAX	(128)
 
 static int
 spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS)
 {
 	char buf[SPA_TASKQ_PARAM_MAX];
 	int err;
 
 	(void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE);
 	err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
 	if (err || req->newptr == NULL)
 		return (err);
 	return (spa_taskq_param_set(ZIO_TYPE_READ, buf));
 }
 
 static int
 spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS)
 {
 	char buf[SPA_TASKQ_PARAM_MAX];
 	int err;
 
 	(void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE);
 	err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
 	if (err || req->newptr == NULL)
 		return (err);
 	return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf));
 }
 #endif
 #endif /* _KERNEL */
 
 /*
  * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
  * Note that a type may have multiple discrete taskqs to avoid lock contention
  * on the taskq itself.
  */
 static taskq_t *
 spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
     zio_t *zio)
 {
 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 	taskq_t *tq;
 
 	ASSERT3P(tqs->stqs_taskq, !=, NULL);
 	ASSERT3U(tqs->stqs_count, !=, 0);
 
 	if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
 	    (zio != NULL) && (zio->io_wr_iss_tq != NULL)) {
 		/* dispatch to assigned write issue taskq */
 		tq = zio->io_wr_iss_tq;
 		return (tq);
 	}
 
 	if (tqs->stqs_count == 1) {
 		tq = tqs->stqs_taskq[0];
 	} else {
 		tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
 	}
 	return (tq);
 }
 
 void
 spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
     task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent,
     zio_t *zio)
 {
 	taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, zio);
 	taskq_dispatch_ent(tq, func, arg, flags, ent);
 }
 
 /*
  * Same as spa_taskq_dispatch_ent() but block on the task until completion.
  */
 void
 spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
     task_func_t *func, void *arg, uint_t flags)
 {
 	taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, NULL);
 	taskqid_t id = taskq_dispatch(tq, func, arg, flags);
 	if (id)
 		taskq_wait_id(tq, id);
 }
 
 static void
 spa_create_zio_taskqs(spa_t *spa)
 {
 	for (int t = 0; t < ZIO_TYPES; t++) {
 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
 			spa_taskqs_init(spa, t, q);
 		}
 	}
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPA_THREAD)
 static void
 spa_thread(void *arg)
 {
 	psetid_t zio_taskq_psrset_bind = PS_NONE;
 	callb_cpr_t cprinfo;
 
 	spa_t *spa = arg;
 	user_t *pu = PTOU(curproc);
 
 	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
 	    spa->spa_name);
 
 	ASSERT(curproc != &p0);
 	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
 	    "zpool-%s", spa->spa_name);
 	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
 
 	/* bind this thread to the requested psrset */
 	if (zio_taskq_psrset_bind != PS_NONE) {
 		pool_lock();
 		mutex_enter(&cpu_lock);
 		mutex_enter(&pidlock);
 		mutex_enter(&curproc->p_lock);
 
 		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
 		    0, NULL, NULL) == 0)  {
 			curthread->t_bind_pset = zio_taskq_psrset_bind;
 		} else {
 			cmn_err(CE_WARN,
 			    "Couldn't bind process for zfs pool \"%s\" to "
 			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
 		}
 
 		mutex_exit(&curproc->p_lock);
 		mutex_exit(&pidlock);
 		mutex_exit(&cpu_lock);
 		pool_unlock();
 	}
 
 #ifdef HAVE_SYSDC
 	if (zio_taskq_sysdc) {
 		sysdc_thread_enter(curthread, 100, 0);
 	}
 #endif
 
 	spa->spa_proc = curproc;
 	spa->spa_did = curthread->t_did;
 
 	spa_create_zio_taskqs(spa);
 
 	mutex_enter(&spa->spa_proc_lock);
 	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
 
 	spa->spa_proc_state = SPA_PROC_ACTIVE;
 	cv_broadcast(&spa->spa_proc_cv);
 
 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
 	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
 		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
 	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
 
 	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
 	spa->spa_proc_state = SPA_PROC_GONE;
 	spa->spa_proc = &p0;
 	cv_broadcast(&spa->spa_proc_cv);
 	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */
 
 	mutex_enter(&curproc->p_lock);
 	lwp_exit();
 }
 #endif
 
 extern metaslab_ops_t *metaslab_allocator(spa_t *spa);
 
 /*
  * Activate an uninitialized pool.
  */
 static void
 spa_activate(spa_t *spa, spa_mode_t mode)
 {
 	metaslab_ops_t *msp = metaslab_allocator(spa);
 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
 
 	spa->spa_state = POOL_STATE_ACTIVE;
 	spa->spa_mode = mode;
 	spa->spa_read_spacemaps = spa_mode_readable_spacemaps;
 
 	spa->spa_normal_class = metaslab_class_create(spa, msp);
 	spa->spa_log_class = metaslab_class_create(spa, msp);
 	spa->spa_embedded_log_class = metaslab_class_create(spa, msp);
 	spa->spa_special_class = metaslab_class_create(spa, msp);
 	spa->spa_dedup_class = metaslab_class_create(spa, msp);
 
 	/* Try to create a covering process */
 	mutex_enter(&spa->spa_proc_lock);
 	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
 	ASSERT(spa->spa_proc == &p0);
 	spa->spa_did = 0;
 
 #ifdef HAVE_SPA_THREAD
 	/* Only create a process if we're going to be around a while. */
 	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
 		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
 		    NULL, 0) == 0) {
 			spa->spa_proc_state = SPA_PROC_CREATED;
 			while (spa->spa_proc_state == SPA_PROC_CREATED) {
 				cv_wait(&spa->spa_proc_cv,
 				    &spa->spa_proc_lock);
 			}
 			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
 			ASSERT(spa->spa_proc != &p0);
 			ASSERT(spa->spa_did != 0);
 		} else {
 #ifdef _KERNEL
 			cmn_err(CE_WARN,
 			    "Couldn't create process for zfs pool \"%s\"\n",
 			    spa->spa_name);
 #endif
 		}
 	}
 #endif /* HAVE_SPA_THREAD */
 	mutex_exit(&spa->spa_proc_lock);
 
 	/* If we didn't create a process, we need to create our taskqs. */
 	if (spa->spa_proc == &p0) {
 		spa_create_zio_taskqs(spa);
 	}
 
 	for (size_t i = 0; i < TXG_SIZE; i++) {
 		spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);
 	}
 
 	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_config_dirty_node));
 	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
 	    offsetof(objset_t, os_evicting_node));
 	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_state_dirty_node));
 
 	txg_list_create(&spa->spa_vdev_txg_list, spa,
 	    offsetof(struct vdev, vdev_txg_node));
 
 	avl_create(&spa->spa_errlist_scrub,
 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
 	    offsetof(spa_error_entry_t, se_avl));
 	avl_create(&spa->spa_errlist_last,
 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
 	    offsetof(spa_error_entry_t, se_avl));
 	avl_create(&spa->spa_errlist_healed,
 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
 	    offsetof(spa_error_entry_t, se_avl));
 
 	spa_activate_os(spa);
 
 	spa_keystore_init(&spa->spa_keystore);
 
 	/*
 	 * This taskq is used to perform zvol-minor-related tasks
 	 * asynchronously. This has several advantages, including easy
 	 * resolution of various deadlocks.
 	 *
 	 * The taskq must be single threaded to ensure tasks are always
 	 * processed in the order in which they were dispatched.
 	 *
 	 * A taskq per pool allows one to keep the pools independent.
 	 * This way if one pool is suspended, it will not impact another.
 	 *
 	 * The preferred location to dispatch a zvol minor task is a sync
 	 * task. In this context, there is easy access to the spa_t and minimal
 	 * error handling is required because the sync task must succeed.
 	 */
 	spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
 	    1, INT_MAX, 0);
 
 	/*
 	 * The taskq to preload metaslabs.
 	 */
 	spa->spa_metaslab_taskq = taskq_create("z_metaslab",
 	    metaslab_preload_pct, maxclsyspri, 1, INT_MAX,
 	    TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
 
 	/*
 	 * Taskq dedicated to prefetcher threads: this is used to prevent the
 	 * pool traverse code from monopolizing the global (and limited)
 	 * system_taskq by inappropriately scheduling long running tasks on it.
 	 */
 	spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100,
 	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
 
 	/*
 	 * The taskq to upgrade datasets in this pool. Currently used by
 	 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA.
 	 */
 	spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100,
 	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
 }
 
 /*
  * Opposite of spa_activate().
  */
 static void
 spa_deactivate(spa_t *spa)
 {
 	ASSERT(spa->spa_sync_on == B_FALSE);
 	ASSERT(spa->spa_dsl_pool == NULL);
 	ASSERT(spa->spa_root_vdev == NULL);
 	ASSERT(spa->spa_async_zio_root == NULL);
 	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
 
 	spa_evicting_os_wait(spa);
 
 	if (spa->spa_zvol_taskq) {
 		taskq_destroy(spa->spa_zvol_taskq);
 		spa->spa_zvol_taskq = NULL;
 	}
 
 	if (spa->spa_metaslab_taskq) {
 		taskq_destroy(spa->spa_metaslab_taskq);
 		spa->spa_metaslab_taskq = NULL;
 	}
 
 	if (spa->spa_prefetch_taskq) {
 		taskq_destroy(spa->spa_prefetch_taskq);
 		spa->spa_prefetch_taskq = NULL;
 	}
 
 	if (spa->spa_upgrade_taskq) {
 		taskq_destroy(spa->spa_upgrade_taskq);
 		spa->spa_upgrade_taskq = NULL;
 	}
 
 	txg_list_destroy(&spa->spa_vdev_txg_list);
 
 	list_destroy(&spa->spa_config_dirty_list);
 	list_destroy(&spa->spa_evicting_os_list);
 	list_destroy(&spa->spa_state_dirty_list);
 
 	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
 
 	for (int t = 0; t < ZIO_TYPES; t++) {
 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
 			spa_taskqs_fini(spa, t, q);
 		}
 	}
 
 	for (size_t i = 0; i < TXG_SIZE; i++) {
 		ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
 		VERIFY0(zio_wait(spa->spa_txg_zio[i]));
 		spa->spa_txg_zio[i] = NULL;
 	}
 
 	metaslab_class_destroy(spa->spa_normal_class);
 	spa->spa_normal_class = NULL;
 
 	metaslab_class_destroy(spa->spa_log_class);
 	spa->spa_log_class = NULL;
 
 	metaslab_class_destroy(spa->spa_embedded_log_class);
 	spa->spa_embedded_log_class = NULL;
 
 	metaslab_class_destroy(spa->spa_special_class);
 	spa->spa_special_class = NULL;
 
 	metaslab_class_destroy(spa->spa_dedup_class);
 	spa->spa_dedup_class = NULL;
 
 	/*
 	 * If this was part of an import or the open otherwise failed, we may
 	 * still have errors left in the queues.  Empty them just in case.
 	 */
 	spa_errlog_drain(spa);
 	avl_destroy(&spa->spa_errlist_scrub);
 	avl_destroy(&spa->spa_errlist_last);
 	avl_destroy(&spa->spa_errlist_healed);
 
 	spa_keystore_fini(&spa->spa_keystore);
 
 	spa->spa_state = POOL_STATE_UNINITIALIZED;
 
 	mutex_enter(&spa->spa_proc_lock);
 	if (spa->spa_proc_state != SPA_PROC_NONE) {
 		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
 		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
 		cv_broadcast(&spa->spa_proc_cv);
 		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
 			ASSERT(spa->spa_proc != &p0);
 			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
 		}
 		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
 		spa->spa_proc_state = SPA_PROC_NONE;
 	}
 	ASSERT(spa->spa_proc == &p0);
 	mutex_exit(&spa->spa_proc_lock);
 
 	/*
 	 * We want to make sure spa_thread() has actually exited the ZFS
 	 * module, so that the module can't be unloaded out from underneath
 	 * it.
 	 */
 	if (spa->spa_did != 0) {
 		thread_join(spa->spa_did);
 		spa->spa_did = 0;
 	}
 
 	spa_deactivate_os(spa);
 
 }
 
 /*
  * Verify a pool configuration, and construct the vdev tree appropriately.  This
  * will create all the necessary vdevs in the appropriate layout, with each vdev
  * in the CLOSED state.  This will prep the pool before open/creation/import.
  * All vdev validation is done by the vdev_alloc() routine.
  */
 int
 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
     uint_t id, int atype)
 {
 	nvlist_t **child;
 	uint_t children;
 	int error;
 
 	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
 		return (error);
 
 	if ((*vdp)->vdev_ops->vdev_op_leaf)
 		return (0);
 
 	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 	    &child, &children);
 
 	if (error == ENOENT)
 		return (0);
 
 	if (error) {
 		vdev_free(*vdp);
 		*vdp = NULL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	for (int c = 0; c < children; c++) {
 		vdev_t *vd;
 		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
 		    atype)) != 0) {
 			vdev_free(*vdp);
 			*vdp = NULL;
 			return (error);
 		}
 	}
 
 	ASSERT(*vdp != NULL);
 
 	return (0);
 }
 
 static boolean_t
 spa_should_flush_logs_on_unload(spa_t *spa)
 {
 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
 		return (B_FALSE);
 
 	if (!spa_writeable(spa))
 		return (B_FALSE);
 
 	if (!spa->spa_sync_on)
 		return (B_FALSE);
 
 	if (spa_state(spa) != POOL_STATE_EXPORTED)
 		return (B_FALSE);
 
 	if (zfs_keep_log_spacemaps_at_export)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Opens a transaction that will set the flag that will instruct
  * spa_sync to attempt to flush all the metaslabs for that txg.
  */
 static void
 spa_unload_log_sm_flush_all(spa_t *spa)
 {
 	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 
 	ASSERT3U(spa->spa_log_flushall_txg, ==, 0);
 	spa->spa_log_flushall_txg = dmu_tx_get_txg(tx);
 
 	dmu_tx_commit(tx);
 	txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg);
 }
 
 static void
 spa_unload_log_sm_metadata(spa_t *spa)
 {
 	void *cookie = NULL;
 	spa_log_sm_t *sls;
 	log_summary_entry_t *e;
 
 	while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg,
 	    &cookie)) != NULL) {
 		VERIFY0(sls->sls_mscount);
 		kmem_free(sls, sizeof (spa_log_sm_t));
 	}
 
 	while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) {
 		VERIFY0(e->lse_mscount);
 		kmem_free(e, sizeof (log_summary_entry_t));
 	}
 
 	spa->spa_unflushed_stats.sus_nblocks = 0;
 	spa->spa_unflushed_stats.sus_memused = 0;
 	spa->spa_unflushed_stats.sus_blocklimit = 0;
 }
 
 static void
 spa_destroy_aux_threads(spa_t *spa)
 {
 	if (spa->spa_condense_zthr != NULL) {
 		zthr_destroy(spa->spa_condense_zthr);
 		spa->spa_condense_zthr = NULL;
 	}
 	if (spa->spa_checkpoint_discard_zthr != NULL) {
 		zthr_destroy(spa->spa_checkpoint_discard_zthr);
 		spa->spa_checkpoint_discard_zthr = NULL;
 	}
 	if (spa->spa_livelist_delete_zthr != NULL) {
 		zthr_destroy(spa->spa_livelist_delete_zthr);
 		spa->spa_livelist_delete_zthr = NULL;
 	}
 	if (spa->spa_livelist_condense_zthr != NULL) {
 		zthr_destroy(spa->spa_livelist_condense_zthr);
 		spa->spa_livelist_condense_zthr = NULL;
 	}
 	if (spa->spa_raidz_expand_zthr != NULL) {
 		zthr_destroy(spa->spa_raidz_expand_zthr);
 		spa->spa_raidz_expand_zthr = NULL;
 	}
 }
 
 /*
  * Opposite of spa_load().
  */
 static void
 spa_unload(spa_t *spa)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);
 
 	spa_import_progress_remove(spa_guid(spa));
 	spa_load_note(spa, "UNLOADING");
 
 	spa_wake_waiters(spa);
 
 	/*
 	 * If we have set the spa_final_txg, we have already performed the
 	 * tasks below in spa_export_common(). We should not redo it here since
 	 * we delay the final TXGs beyond what spa_final_txg is set at.
 	 */
 	if (spa->spa_final_txg == UINT64_MAX) {
 		/*
 		 * If the log space map feature is enabled and the pool is
 		 * getting exported (but not destroyed), we want to spend some
 		 * time flushing as many metaslabs as we can in an attempt to
 		 * destroy log space maps and save import time.
 		 */
 		if (spa_should_flush_logs_on_unload(spa))
 			spa_unload_log_sm_flush_all(spa);
 
 		/*
 		 * Stop async tasks.
 		 */
 		spa_async_suspend(spa);
 
 		if (spa->spa_root_vdev) {
 			vdev_t *root_vdev = spa->spa_root_vdev;
 			vdev_initialize_stop_all(root_vdev,
 			    VDEV_INITIALIZE_ACTIVE);
 			vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
 			vdev_autotrim_stop_all(spa);
 			vdev_rebuild_stop_all(spa);
 		}
 	}
 
 	/*
 	 * Stop syncing.
 	 */
 	if (spa->spa_sync_on) {
 		txg_sync_stop(spa->spa_dsl_pool);
 		spa->spa_sync_on = B_FALSE;
 	}
 
 	/*
 	 * This ensures that there is no async metaslab prefetching
 	 * while we attempt to unload the spa.
 	 */
 	taskq_wait(spa->spa_metaslab_taskq);
 
 	if (spa->spa_mmp.mmp_thread)
 		mmp_thread_stop(spa);
 
 	/*
 	 * Wait for any outstanding async I/O to complete.
 	 */
 	if (spa->spa_async_zio_root != NULL) {
 		for (int i = 0; i < max_ncpus; i++)
 			(void) zio_wait(spa->spa_async_zio_root[i]);
 		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
 		spa->spa_async_zio_root = NULL;
 	}
 
 	if (spa->spa_vdev_removal != NULL) {
 		spa_vdev_removal_destroy(spa->spa_vdev_removal);
 		spa->spa_vdev_removal = NULL;
 	}
 
 	spa_destroy_aux_threads(spa);
 
 	spa_condense_fini(spa);
 
 	bpobj_close(&spa->spa_deferred_bpobj);
 
 	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
 
 	/*
 	 * Close all vdevs.
 	 */
 	if (spa->spa_root_vdev)
 		vdev_free(spa->spa_root_vdev);
 	ASSERT(spa->spa_root_vdev == NULL);
 
 	/*
 	 * Close the dsl pool.
 	 */
 	if (spa->spa_dsl_pool) {
 		dsl_pool_close(spa->spa_dsl_pool);
 		spa->spa_dsl_pool = NULL;
 		spa->spa_meta_objset = NULL;
 	}
 
 	ddt_unload(spa);
 	brt_unload(spa);
 	spa_unload_log_sm_metadata(spa);
 
 	/*
 	 * Drop and purge level 2 cache
 	 */
 	spa_l2cache_drop(spa);
 
 	if (spa->spa_spares.sav_vdevs) {
 		for (int i = 0; i < spa->spa_spares.sav_count; i++)
 			vdev_free(spa->spa_spares.sav_vdevs[i]);
 		kmem_free(spa->spa_spares.sav_vdevs,
 		    spa->spa_spares.sav_count * sizeof (void *));
 		spa->spa_spares.sav_vdevs = NULL;
 	}
 	if (spa->spa_spares.sav_config) {
 		nvlist_free(spa->spa_spares.sav_config);
 		spa->spa_spares.sav_config = NULL;
 	}
 	spa->spa_spares.sav_count = 0;
 
 	if (spa->spa_l2cache.sav_vdevs) {
 		for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
 			vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
 			vdev_free(spa->spa_l2cache.sav_vdevs[i]);
 		}
 		kmem_free(spa->spa_l2cache.sav_vdevs,
 		    spa->spa_l2cache.sav_count * sizeof (void *));
 		spa->spa_l2cache.sav_vdevs = NULL;
 	}
 	if (spa->spa_l2cache.sav_config) {
 		nvlist_free(spa->spa_l2cache.sav_config);
 		spa->spa_l2cache.sav_config = NULL;
 	}
 	spa->spa_l2cache.sav_count = 0;
 
 	spa->spa_async_suspended = 0;
 
 	spa->spa_indirect_vdevs_loaded = B_FALSE;
 
 	if (spa->spa_comment != NULL) {
 		spa_strfree(spa->spa_comment);
 		spa->spa_comment = NULL;
 	}
 	if (spa->spa_compatibility != NULL) {
 		spa_strfree(spa->spa_compatibility);
 		spa->spa_compatibility = NULL;
 	}
 
 	spa->spa_raidz_expand = NULL;
 
 	spa_config_exit(spa, SCL_ALL, spa);
 }
 
 /*
  * Load (or re-load) the current list of vdevs describing the active spares for
  * this pool.  When this is called, we have some form of basic information in
  * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
  * then re-generate a more complete list including status information.
  */
 void
 spa_load_spares(spa_t *spa)
 {
 	nvlist_t **spares;
 	uint_t nspares;
 	int i;
 	vdev_t *vd, *tvd;
 
 #ifndef _KERNEL
 	/*
 	 * zdb opens both the current state of the pool and the
 	 * checkpointed state (if present), with a different spa_t.
 	 *
 	 * As spare vdevs are shared among open pools, we skip loading
 	 * them when we load the checkpointed state of the pool.
 	 */
 	if (!spa_writeable(spa))
 		return;
 #endif
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/*
 	 * First, close and free any existing spare vdevs.
 	 */
 	if (spa->spa_spares.sav_vdevs) {
 		for (i = 0; i < spa->spa_spares.sav_count; i++) {
 			vd = spa->spa_spares.sav_vdevs[i];
 
 			/* Undo the call to spa_activate() below */
 			if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
 			    B_FALSE)) != NULL && tvd->vdev_isspare)
 				spa_spare_remove(tvd);
 			vdev_close(vd);
 			vdev_free(vd);
 		}
 
 		kmem_free(spa->spa_spares.sav_vdevs,
 		    spa->spa_spares.sav_count * sizeof (void *));
 	}
 
 	if (spa->spa_spares.sav_config == NULL)
 		nspares = 0;
 	else
 		VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 		    ZPOOL_CONFIG_SPARES, &spares, &nspares));
 
 	spa->spa_spares.sav_count = (int)nspares;
 	spa->spa_spares.sav_vdevs = NULL;
 
 	if (nspares == 0)
 		return;
 
 	/*
 	 * Construct the array of vdevs, opening them to get status in the
 	 * process.   For each spare, there is potentially two different vdev_t
 	 * structures associated with it: one in the list of spares (used only
 	 * for basic validation purposes) and one in the active vdev
 	 * configuration (if it's spared in).  During this phase we open and
 	 * validate each vdev on the spare list.  If the vdev also exists in the
 	 * active configuration, then we also mark this vdev as an active spare.
 	 */
 	spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *),
 	    KM_SLEEP);
 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
 		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
 		    VDEV_ALLOC_SPARE) == 0);
 		ASSERT(vd != NULL);
 
 		spa->spa_spares.sav_vdevs[i] = vd;
 
 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
 		    B_FALSE)) != NULL) {
 			if (!tvd->vdev_isspare)
 				spa_spare_add(tvd);
 
 			/*
 			 * We only mark the spare active if we were successfully
 			 * able to load the vdev.  Otherwise, importing a pool
 			 * with a bad active spare would result in strange
 			 * behavior, because multiple pool would think the spare
 			 * is actively in use.
 			 *
 			 * There is a vulnerability here to an equally bizarre
 			 * circumstance, where a dead active spare is later
 			 * brought back to life (onlined or otherwise).  Given
 			 * the rarity of this scenario, and the extra complexity
 			 * it adds, we ignore the possibility.
 			 */
 			if (!vdev_is_dead(tvd))
 				spa_spare_activate(tvd);
 		}
 
 		vd->vdev_top = vd;
 		vd->vdev_aux = &spa->spa_spares;
 
 		if (vdev_open(vd) != 0)
 			continue;
 
 		if (vdev_validate_aux(vd) == 0)
 			spa_spare_add(vd);
 	}
 
 	/*
 	 * Recompute the stashed list of spares, with status information
 	 * this time.
 	 */
 	fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES);
 
 	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
 	    KM_SLEEP);
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		spares[i] = vdev_config_generate(spa,
 		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
 	fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
 	    spa->spa_spares.sav_count);
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		nvlist_free(spares[i]);
 	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
 }
 
 /*
  * Load (or re-load) the current list of vdevs describing the active l2cache for
  * this pool.  When this is called, we have some form of basic information in
  * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
  * then re-generate a more complete list including status information.
  * Devices which are already active have their details maintained, and are
  * not re-opened.
  */
 void
 spa_load_l2cache(spa_t *spa)
 {
 	nvlist_t **l2cache = NULL;
 	uint_t nl2cache;
 	int i, j, oldnvdevs;
 	uint64_t guid;
 	vdev_t *vd, **oldvdevs, **newvdevs;
 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
 
 #ifndef _KERNEL
 	/*
 	 * zdb opens both the current state of the pool and the
 	 * checkpointed state (if present), with a different spa_t.
 	 *
 	 * As L2 caches are part of the ARC which is shared among open
 	 * pools, we skip loading them when we load the checkpointed
 	 * state of the pool.
 	 */
 	if (!spa_writeable(spa))
 		return;
 #endif
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	oldvdevs = sav->sav_vdevs;
 	oldnvdevs = sav->sav_count;
 	sav->sav_vdevs = NULL;
 	sav->sav_count = 0;
 
 	if (sav->sav_config == NULL) {
 		nl2cache = 0;
 		newvdevs = NULL;
 		goto out;
 	}
 
 	VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config,
 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache));
 	newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
 
 	/*
 	 * Process new nvlist of vdevs.
 	 */
 	for (i = 0; i < nl2cache; i++) {
 		guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID);
 
 		newvdevs[i] = NULL;
 		for (j = 0; j < oldnvdevs; j++) {
 			vd = oldvdevs[j];
 			if (vd != NULL && guid == vd->vdev_guid) {
 				/*
 				 * Retain previous vdev for add/remove ops.
 				 */
 				newvdevs[i] = vd;
 				oldvdevs[j] = NULL;
 				break;
 			}
 		}
 
 		if (newvdevs[i] == NULL) {
 			/*
 			 * Create new vdev
 			 */
 			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
 			    VDEV_ALLOC_L2CACHE) == 0);
 			ASSERT(vd != NULL);
 			newvdevs[i] = vd;
 
 			/*
 			 * Commit this vdev as an l2cache device,
 			 * even if it fails to open.
 			 */
 			spa_l2cache_add(vd);
 
 			vd->vdev_top = vd;
 			vd->vdev_aux = sav;
 
 			spa_l2cache_activate(vd);
 
 			if (vdev_open(vd) != 0)
 				continue;
 
 			(void) vdev_validate_aux(vd);
 
 			if (!vdev_is_dead(vd))
 				l2arc_add_vdev(spa, vd);
 
 			/*
 			 * Upon cache device addition to a pool or pool
 			 * creation with a cache device or if the header
 			 * of the device is invalid we issue an async
 			 * TRIM command for the whole device which will
 			 * execute if l2arc_trim_ahead > 0.
 			 */
 			spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
 		}
 	}
 
 	sav->sav_vdevs = newvdevs;
 	sav->sav_count = (int)nl2cache;
 
 	/*
 	 * Recompute the stashed list of l2cache devices, with status
 	 * information this time.
 	 */
 	fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE);
 
 	if (sav->sav_count > 0)
 		l2cache = kmem_alloc(sav->sav_count * sizeof (void *),
 		    KM_SLEEP);
 	for (i = 0; i < sav->sav_count; i++)
 		l2cache[i] = vdev_config_generate(spa,
 		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
 	fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
 	    (const nvlist_t * const *)l2cache, sav->sav_count);
 
 out:
 	/*
 	 * Purge vdevs that were dropped
 	 */
 	if (oldvdevs) {
 		for (i = 0; i < oldnvdevs; i++) {
 			uint64_t pool;
 
 			vd = oldvdevs[i];
 			if (vd != NULL) {
 				ASSERT(vd->vdev_isl2cache);
 
 				if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
 				    pool != 0ULL && l2arc_vdev_present(vd))
 					l2arc_remove_vdev(vd);
 				vdev_clear_stats(vd);
 				vdev_free(vd);
 			}
 		}
 
 		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
 	}
 
 	for (i = 0; i < sav->sav_count; i++)
 		nvlist_free(l2cache[i]);
 	if (sav->sav_count)
 		kmem_free(l2cache, sav->sav_count * sizeof (void *));
 }
 
 static int
 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
 {
 	dmu_buf_t *db;
 	char *packed = NULL;
 	size_t nvsize = 0;
 	int error;
 	*value = NULL;
 
 	error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
 	if (error)
 		return (error);
 
 	nvsize = *(uint64_t *)db->db_data;
 	dmu_buf_rele(db, FTAG);
 
 	packed = vmem_alloc(nvsize, KM_SLEEP);
 	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
 	    DMU_READ_PREFETCH);
 	if (error == 0)
 		error = nvlist_unpack(packed, nvsize, value, 0);
 	vmem_free(packed, nvsize);
 
 	return (error);
 }
 
 /*
  * Concrete top-level vdevs that are not missing and are not logs. At every
  * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
  */
 static uint64_t
 spa_healthy_core_tvds(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t tvds = 0;
 
 	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
 		vdev_t *vd = rvd->vdev_child[i];
 		if (vd->vdev_islog)
 			continue;
 		if (vdev_is_concrete(vd) && !vdev_is_dead(vd))
 			tvds++;
 	}
 
 	return (tvds);
 }
 
 /*
  * Checks to see if the given vdev could not be opened, in which case we post a
  * sysevent to notify the autoreplace code that the device has been removed.
  */
 static void
 spa_check_removed(vdev_t *vd)
 {
 	for (uint64_t c = 0; c < vd->vdev_children; c++)
 		spa_check_removed(vd->vdev_child[c]);
 
 	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
 	    vdev_is_concrete(vd)) {
 		zfs_post_autoreplace(vd->vdev_spa, vd);
 		spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
 	}
 }
 
 static int
 spa_check_for_missing_logs(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	/*
 	 * If we're doing a normal import, then build up any additional
 	 * diagnostic information about missing log devices.
 	 * We'll pass this up to the user for further processing.
 	 */
 	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
 		nvlist_t **child, *nv;
 		uint64_t idx = 0;
 
 		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *),
 		    KM_SLEEP);
 		nv = fnvlist_alloc();
 
 		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *tvd = rvd->vdev_child[c];
 
 			/*
 			 * We consider a device as missing only if it failed
 			 * to open (i.e. offline or faulted is not considered
 			 * as missing).
 			 */
 			if (tvd->vdev_islog &&
 			    tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
 				child[idx++] = vdev_config_generate(spa, tvd,
 				    B_FALSE, VDEV_CONFIG_MISSING);
 			}
 		}
 
 		if (idx > 0) {
 			fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 			    (const nvlist_t * const *)child, idx);
 			fnvlist_add_nvlist(spa->spa_load_info,
 			    ZPOOL_CONFIG_MISSING_DEVICES, nv);
 
 			for (uint64_t i = 0; i < idx; i++)
 				nvlist_free(child[i]);
 		}
 		nvlist_free(nv);
 		kmem_free(child, rvd->vdev_children * sizeof (char **));
 
 		if (idx > 0) {
 			spa_load_failed(spa, "some log devices are missing");
 			vdev_dbgmsg_print_tree(rvd, 2);
 			return (SET_ERROR(ENXIO));
 		}
 	} else {
 		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *tvd = rvd->vdev_child[c];
 
 			if (tvd->vdev_islog &&
 			    tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
 				spa_set_log_state(spa, SPA_LOG_CLEAR);
 				spa_load_note(spa, "some log devices are "
 				    "missing, ZIL is dropped.");
 				vdev_dbgmsg_print_tree(rvd, 2);
 				break;
 			}
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Check for missing log devices
  */
 static boolean_t
 spa_check_logs(spa_t *spa)
 {
 	boolean_t rv = B_FALSE;
 	dsl_pool_t *dp = spa_get_dsl(spa);
 
 	switch (spa->spa_log_state) {
 	default:
 		break;
 	case SPA_LOG_MISSING:
 		/* need to recheck in case slog has been restored */
 	case SPA_LOG_UNKNOWN:
 		rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 		    zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
 		if (rv)
 			spa_set_log_state(spa, SPA_LOG_MISSING);
 		break;
 	}
 	return (rv);
 }
 
 /*
  * Passivate any log vdevs (note, does not apply to embedded log metaslabs).
  */
 static boolean_t
 spa_passivate_log(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	boolean_t slog_found = B_FALSE;
 
 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 
 		if (tvd->vdev_islog) {
 			ASSERT3P(tvd->vdev_log_mg, ==, NULL);
 			metaslab_group_passivate(tvd->vdev_mg);
 			slog_found = B_TRUE;
 		}
 	}
 
 	return (slog_found);
 }
 
 /*
  * Activate any log vdevs (note, does not apply to embedded log metaslabs).
  */
 static void
 spa_activate_log(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 
 		if (tvd->vdev_islog) {
 			ASSERT3P(tvd->vdev_log_mg, ==, NULL);
 			metaslab_group_activate(tvd->vdev_mg);
 		}
 	}
 }
 
 int
 spa_reset_logs(spa_t *spa)
 {
 	int error;
 
 	error = dmu_objset_find(spa_name(spa), zil_reset,
 	    NULL, DS_FIND_CHILDREN);
 	if (error == 0) {
 		/*
 		 * We successfully offlined the log device, sync out the
 		 * current txg so that the "stubby" block can be removed
 		 * by zil_sync().
 		 */
 		txg_wait_synced(spa->spa_dsl_pool, 0);
 	}
 	return (error);
 }
 
 static void
 spa_aux_check_removed(spa_aux_vdev_t *sav)
 {
 	for (int i = 0; i < sav->sav_count; i++)
 		spa_check_removed(sav->sav_vdevs[i]);
 }
 
 void
 spa_claim_notify(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 
 	if (zio->io_error)
 		return;
 
 	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
-	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
-		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
+	if (spa->spa_claim_max_txg < BP_GET_LOGICAL_BIRTH(zio->io_bp))
+		spa->spa_claim_max_txg = BP_GET_LOGICAL_BIRTH(zio->io_bp);
 	mutex_exit(&spa->spa_props_lock);
 }
 
 typedef struct spa_load_error {
 	boolean_t	sle_verify_data;
 	uint64_t	sle_meta_count;
 	uint64_t	sle_data_count;
 } spa_load_error_t;
 
 static void
 spa_load_verify_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	spa_load_error_t *sle = zio->io_private;
 	dmu_object_type_t type = BP_GET_TYPE(bp);
 	int error = zio->io_error;
 	spa_t *spa = zio->io_spa;
 
 	abd_free(zio->io_abd);
 	if (error) {
 		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
 		    type != DMU_OT_INTENT_LOG)
 			atomic_inc_64(&sle->sle_meta_count);
 		else
 			atomic_inc_64(&sle->sle_data_count);
 	}
 
 	mutex_enter(&spa->spa_scrub_lock);
 	spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
 	cv_broadcast(&spa->spa_scrub_io_cv);
 	mutex_exit(&spa->spa_scrub_lock);
 }
 
 /*
  * Maximum number of inflight bytes is the log2 fraction of the arc size.
  * By default, we set it to 1/16th of the arc.
  */
 static uint_t spa_load_verify_shift = 4;
 static int spa_load_verify_metadata = B_TRUE;
 static int spa_load_verify_data = B_TRUE;
 
 static int
 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	zio_t *rio = arg;
 	spa_load_error_t *sle = rio->io_private;
 
 	(void) zilog, (void) dnp;
 
 	/*
 	 * Note: normally this routine will not be called if
 	 * spa_load_verify_metadata is not set.  However, it may be useful
 	 * to manually set the flag after the traversal has begun.
 	 */
 	if (!spa_load_verify_metadata)
 		return (0);
 
 	/*
 	 * Sanity check the block pointer in order to detect obvious damage
 	 * before using the contents in subsequent checks or in zio_read().
 	 * When damaged consider it to be a metadata error since we cannot
 	 * trust the BP_GET_TYPE and BP_GET_LEVEL values.
 	 */
 	if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
 		atomic_inc_64(&sle->sle_meta_count);
 		return (0);
 	}
 
 	if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
 	    BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
 		return (0);
 
 	if (!BP_IS_METADATA(bp) &&
 	    (!spa_load_verify_data || !sle->sle_verify_data))
 		return (0);
 
 	uint64_t maxinflight_bytes =
 	    arc_target_bytes() >> spa_load_verify_shift;
 	size_t size = BP_GET_PSIZE(bp);
 
 	mutex_enter(&spa->spa_scrub_lock);
 	while (spa->spa_load_verify_bytes >= maxinflight_bytes)
 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 	spa->spa_load_verify_bytes += size;
 	mutex_exit(&spa->spa_scrub_lock);
 
 	zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
 	    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
 	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
 	return (0);
 }
 
 static int
 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 {
 	(void) dp, (void) arg;
 
 	if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
 		return (SET_ERROR(ENAMETOOLONG));
 
 	return (0);
 }
 
 static int
 spa_load_verify(spa_t *spa)
 {
 	zio_t *rio;
 	spa_load_error_t sle = { 0 };
 	zpool_load_policy_t policy;
 	boolean_t verify_ok = B_FALSE;
 	int error = 0;
 
 	zpool_get_load_policy(spa->spa_config, &policy);
 
 	if (policy.zlp_rewind & ZPOOL_NEVER_REWIND ||
 	    policy.zlp_maxmeta == UINT64_MAX)
 		return (0);
 
 	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
 	error = dmu_objset_find_dp(spa->spa_dsl_pool,
 	    spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
 	    DS_FIND_CHILDREN);
 	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Verify data only if we are rewinding or error limit was set.
 	 * Otherwise nothing except dbgmsg care about it to waste time.
 	 */
 	sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) ||
 	    (policy.zlp_maxdata < UINT64_MAX);
 
 	rio = zio_root(spa, NULL, &sle,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
 
 	if (spa_load_verify_metadata) {
 		if (spa->spa_extreme_rewind) {
 			spa_load_note(spa, "performing a complete scan of the "
 			    "pool since extreme rewind is on. This may take "
 			    "a very long time.\n  (spa_load_verify_data=%u, "
 			    "spa_load_verify_metadata=%u)",
 			    spa_load_verify_data, spa_load_verify_metadata);
 		}
 
 		error = traverse_pool(spa, spa->spa_verify_min_txg,
 		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
 		    TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio);
 	}
 
 	(void) zio_wait(rio);
 	ASSERT0(spa->spa_load_verify_bytes);
 
 	spa->spa_load_meta_errors = sle.sle_meta_count;
 	spa->spa_load_data_errors = sle.sle_data_count;
 
 	if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) {
 		spa_load_note(spa, "spa_load_verify found %llu metadata errors "
 		    "and %llu data errors", (u_longlong_t)sle.sle_meta_count,
 		    (u_longlong_t)sle.sle_data_count);
 	}
 
 	if (spa_load_verify_dryrun ||
 	    (!error && sle.sle_meta_count <= policy.zlp_maxmeta &&
 	    sle.sle_data_count <= policy.zlp_maxdata)) {
 		int64_t loss = 0;
 
 		verify_ok = B_TRUE;
 		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
 		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
 
 		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
 		fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME,
 		    spa->spa_load_txg_ts);
 		fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME,
 		    loss);
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count);
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count);
 	} else {
 		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
 	}
 
 	if (spa_load_verify_dryrun)
 		return (0);
 
 	if (error) {
 		if (error != ENXIO && error != EIO)
 			error = SET_ERROR(EIO);
 		return (error);
 	}
 
 	return (verify_ok ? 0 : EIO);
 }
 
 /*
  * Find a value in the pool props object.
  */
 static void
 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
 {
 	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
 	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
 }
 
 /*
  * Find a value in the pool directory object.
  */
 static int
 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent)
 {
 	int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    name, sizeof (uint64_t), 1, val);
 
 	if (error != 0 && (error != ENOENT || log_enoent)) {
 		spa_load_failed(spa, "couldn't get '%s' value in MOS directory "
 		    "[error=%d]", name, error);
 	}
 
 	return (error);
 }
 
 static int
 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
 {
 	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
 	return (SET_ERROR(err));
 }
 
 boolean_t
 spa_livelist_delete_check(spa_t *spa)
 {
 	return (spa->spa_livelists_to_delete != 0);
 }
 
 static boolean_t
 spa_livelist_delete_cb_check(void *arg, zthr_t *z)
 {
 	(void) z;
 	spa_t *spa = arg;
 	return (spa_livelist_delete_check(spa));
 }
 
 static int
 delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	spa_t *spa = arg;
 	zio_free(spa, tx->tx_txg, bp);
 	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
 	    -bp_get_dsize_sync(spa, bp),
 	    -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
 	return (0);
 }
 
 static int
 dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp)
 {
 	int err;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	zap_cursor_init(&zc, os, zap_obj);
 	err = zap_cursor_retrieve(&zc, &za);
 	zap_cursor_fini(&zc);
 	if (err == 0)
 		*llp = za.za_first_integer;
 	return (err);
 }
 
 /*
  * Components of livelist deletion that must be performed in syncing
  * context: freeing block pointers and updating the pool-wide data
  * structures to indicate how much work is left to do
  */
 typedef struct sublist_delete_arg {
 	spa_t *spa;
 	dsl_deadlist_t *ll;
 	uint64_t key;
 	bplist_t *to_free;
 } sublist_delete_arg_t;
 
 static void
 sublist_delete_sync(void *arg, dmu_tx_t *tx)
 {
 	sublist_delete_arg_t *sda = arg;
 	spa_t *spa = sda->spa;
 	dsl_deadlist_t *ll = sda->ll;
 	uint64_t key = sda->key;
 	bplist_t *to_free = sda->to_free;
 
 	bplist_iterate(to_free, delete_blkptr_cb, spa, tx);
 	dsl_deadlist_remove_entry(ll, key, tx);
 }
 
 typedef struct livelist_delete_arg {
 	spa_t *spa;
 	uint64_t ll_obj;
 	uint64_t zap_obj;
 } livelist_delete_arg_t;
 
 static void
 livelist_delete_sync(void *arg, dmu_tx_t *tx)
 {
 	livelist_delete_arg_t *lda = arg;
 	spa_t *spa = lda->spa;
 	uint64_t ll_obj = lda->ll_obj;
 	uint64_t zap_obj = lda->zap_obj;
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t count;
 
 	/* free the livelist and decrement the feature count */
 	VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx));
 	dsl_deadlist_free(mos, ll_obj, tx);
 	spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
 	VERIFY0(zap_count(mos, zap_obj, &count));
 	if (count == 0) {
 		/* no more livelists to delete */
 		VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_DELETED_CLONES, tx));
 		VERIFY0(zap_destroy(mos, zap_obj, tx));
 		spa->spa_livelists_to_delete = 0;
 		spa_notify_waiters(spa);
 	}
 }
 
 /*
  * Load in the value for the livelist to be removed and open it. Then,
  * load its first sublist and determine which block pointers should actually
  * be freed. Then, call a synctask which performs the actual frees and updates
  * the pool-wide livelist data.
  */
 static void
 spa_livelist_delete_cb(void *arg, zthr_t *z)
 {
 	spa_t *spa = arg;
 	uint64_t ll_obj = 0, count;
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t zap_obj = spa->spa_livelists_to_delete;
 	/*
 	 * Determine the next livelist to delete. This function should only
 	 * be called if there is at least one deleted clone.
 	 */
 	VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj));
 	VERIFY0(zap_count(mos, ll_obj, &count));
 	if (count > 0) {
 		dsl_deadlist_t *ll;
 		dsl_deadlist_entry_t *dle;
 		bplist_t to_free;
 		ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP);
 		dsl_deadlist_open(ll, mos, ll_obj);
 		dle = dsl_deadlist_first(ll);
 		ASSERT3P(dle, !=, NULL);
 		bplist_create(&to_free);
 		int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free,
 		    z, NULL);
 		if (err == 0) {
 			sublist_delete_arg_t sync_arg = {
 			    .spa = spa,
 			    .ll = ll,
 			    .key = dle->dle_mintxg,
 			    .to_free = &to_free
 			};
 			zfs_dbgmsg("deleting sublist (id %llu) from"
 			    " livelist %llu, %lld remaining",
 			    (u_longlong_t)dle->dle_bpobj.bpo_object,
 			    (u_longlong_t)ll_obj, (longlong_t)count - 1);
 			VERIFY0(dsl_sync_task(spa_name(spa), NULL,
 			    sublist_delete_sync, &sync_arg, 0,
 			    ZFS_SPACE_CHECK_DESTROY));
 		} else {
 			VERIFY3U(err, ==, EINTR);
 		}
 		bplist_clear(&to_free);
 		bplist_destroy(&to_free);
 		dsl_deadlist_close(ll);
 		kmem_free(ll, sizeof (dsl_deadlist_t));
 	} else {
 		livelist_delete_arg_t sync_arg = {
 		    .spa = spa,
 		    .ll_obj = ll_obj,
 		    .zap_obj = zap_obj
 		};
 		zfs_dbgmsg("deletion of livelist %llu completed",
 		    (u_longlong_t)ll_obj);
 		VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync,
 		    &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY));
 	}
 }
 
 static void
 spa_start_livelist_destroy_thread(spa_t *spa)
 {
 	ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL);
 	spa->spa_livelist_delete_zthr =
 	    zthr_create("z_livelist_destroy",
 	    spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa,
 	    minclsyspri);
 }
 
 typedef struct livelist_new_arg {
 	bplist_t *allocs;
 	bplist_t *frees;
 } livelist_new_arg_t;
 
 static int
 livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	ASSERT(tx == NULL);
 	livelist_new_arg_t *lna = arg;
 	if (bp_freed) {
 		bplist_append(lna->frees, bp);
 	} else {
 		bplist_append(lna->allocs, bp);
 		zfs_livelist_condense_new_alloc++;
 	}
 	return (0);
 }
 
 typedef struct livelist_condense_arg {
 	spa_t *spa;
 	bplist_t to_keep;
 	uint64_t first_size;
 	uint64_t next_size;
 } livelist_condense_arg_t;
 
 static void
 spa_livelist_condense_sync(void *arg, dmu_tx_t *tx)
 {
 	livelist_condense_arg_t *lca = arg;
 	spa_t *spa = lca->spa;
 	bplist_t new_frees;
 	dsl_dataset_t *ds = spa->spa_to_condense.ds;
 
 	/* Have we been cancelled? */
 	if (spa->spa_to_condense.cancelled) {
 		zfs_livelist_condense_sync_cancel++;
 		goto out;
 	}
 
 	dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
 	dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
 	dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;
 
 	/*
 	 * It's possible that the livelist was changed while the zthr was
 	 * running. Therefore, we need to check for new blkptrs in the two
 	 * entries being condensed and continue to track them in the livelist.
 	 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl),
 	 * it's possible that the newly added blkptrs are FREEs or ALLOCs so
 	 * we need to sort them into two different bplists.
 	 */
 	uint64_t first_obj = first->dle_bpobj.bpo_object;
 	uint64_t next_obj = next->dle_bpobj.bpo_object;
 	uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs;
 	uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs;
 
 	bplist_create(&new_frees);
 	livelist_new_arg_t new_bps = {
 	    .allocs = &lca->to_keep,
 	    .frees = &new_frees,
 	};
 
 	if (cur_first_size > lca->first_size) {
 		VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj,
 		    livelist_track_new_cb, &new_bps, lca->first_size));
 	}
 	if (cur_next_size > lca->next_size) {
 		VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj,
 		    livelist_track_new_cb, &new_bps, lca->next_size));
 	}
 
 	dsl_deadlist_clear_entry(first, ll, tx);
 	ASSERT(bpobj_is_empty(&first->dle_bpobj));
 	dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx);
 
 	bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx);
 	bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx);
 	bplist_destroy(&new_frees);
 
 	char dsname[ZFS_MAX_DATASET_NAME_LEN];
 	dsl_dataset_name(ds, dsname);
 	zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu "
 	    "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu "
 	    "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname,
 	    (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj,
 	    (u_longlong_t)cur_first_size, (u_longlong_t)next_obj,
 	    (u_longlong_t)cur_next_size,
 	    (u_longlong_t)first->dle_bpobj.bpo_object,
 	    (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs);
 out:
 	dmu_buf_rele(ds->ds_dbuf, spa);
 	spa->spa_to_condense.ds = NULL;
 	bplist_clear(&lca->to_keep);
 	bplist_destroy(&lca->to_keep);
 	kmem_free(lca, sizeof (livelist_condense_arg_t));
 	spa->spa_to_condense.syncing = B_FALSE;
 }
 
 static void
 spa_livelist_condense_cb(void *arg, zthr_t *t)
 {
 	while (zfs_livelist_condense_zthr_pause &&
 	    !(zthr_has_waiters(t) || zthr_iscancelled(t)))
 		delay(1);
 
 	spa_t *spa = arg;
 	dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
 	dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
 	uint64_t first_size, next_size;
 
 	livelist_condense_arg_t *lca =
 	    kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP);
 	bplist_create(&lca->to_keep);
 
 	/*
 	 * Process the livelists (matching FREEs and ALLOCs) in open context
 	 * so we have minimal work in syncing context to condense.
 	 *
 	 * We save bpobj sizes (first_size and next_size) to use later in
 	 * syncing context to determine if entries were added to these sublists
 	 * while in open context. This is possible because the clone is still
 	 * active and open for normal writes and we want to make sure the new,
 	 * unprocessed blockpointers are inserted into the livelist normally.
 	 *
 	 * Note that dsl_process_sub_livelist() both stores the size number of
 	 * blockpointers and iterates over them while the bpobj's lock held, so
 	 * the sizes returned to us are consistent which what was actually
 	 * processed.
 	 */
 	int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t,
 	    &first_size);
 	if (err == 0)
 		err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep,
 		    t, &next_size);
 
 	if (err == 0) {
 		while (zfs_livelist_condense_sync_pause &&
 		    !(zthr_has_waiters(t) || zthr_iscancelled(t)))
 			delay(1);
 
 		dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 		dmu_tx_mark_netfree(tx);
 		dmu_tx_hold_space(tx, 1);
 		err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE);
 		if (err == 0) {
 			/*
 			 * Prevent the condense zthr restarting before
 			 * the synctask completes.
 			 */
 			spa->spa_to_condense.syncing = B_TRUE;
 			lca->spa = spa;
 			lca->first_size = first_size;
 			lca->next_size = next_size;
 			dsl_sync_task_nowait(spa_get_dsl(spa),
 			    spa_livelist_condense_sync, lca, tx);
 			dmu_tx_commit(tx);
 			return;
 		}
 	}
 	/*
 	 * Condensing can not continue: either it was externally stopped or
 	 * we were unable to assign to a tx because the pool has run out of
 	 * space. In the second case, we'll just end up trying to condense
 	 * again in a later txg.
 	 */
 	ASSERT(err != 0);
 	bplist_clear(&lca->to_keep);
 	bplist_destroy(&lca->to_keep);
 	kmem_free(lca, sizeof (livelist_condense_arg_t));
 	dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa);
 	spa->spa_to_condense.ds = NULL;
 	if (err == EINTR)
 		zfs_livelist_condense_zthr_cancel++;
 }
 
 /*
  * Check that there is something to condense but that a condense is not
  * already in progress and that condensing has not been cancelled.
  */
 static boolean_t
 spa_livelist_condense_cb_check(void *arg, zthr_t *z)
 {
 	(void) z;
 	spa_t *spa = arg;
 	if ((spa->spa_to_condense.ds != NULL) &&
 	    (spa->spa_to_condense.syncing == B_FALSE) &&
 	    (spa->spa_to_condense.cancelled == B_FALSE)) {
 		return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 static void
 spa_start_livelist_condensing_thread(spa_t *spa)
 {
 	spa->spa_to_condense.ds = NULL;
 	spa->spa_to_condense.first = NULL;
 	spa->spa_to_condense.next = NULL;
 	spa->spa_to_condense.syncing = B_FALSE;
 	spa->spa_to_condense.cancelled = B_FALSE;
 
 	ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL);
 	spa->spa_livelist_condense_zthr =
 	    zthr_create("z_livelist_condense",
 	    spa_livelist_condense_cb_check,
 	    spa_livelist_condense_cb, spa, minclsyspri);
 }
 
 static void
 spa_spawn_aux_threads(spa_t *spa)
 {
 	ASSERT(spa_writeable(spa));
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	spa_start_raidz_expansion_thread(spa);
 	spa_start_indirect_condensing_thread(spa);
 	spa_start_livelist_destroy_thread(spa);
 	spa_start_livelist_condensing_thread(spa);
 
 	ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
 	spa->spa_checkpoint_discard_zthr =
 	    zthr_create("z_checkpoint_discard",
 	    spa_checkpoint_discard_thread_check,
 	    spa_checkpoint_discard_thread, spa, minclsyspri);
 }
 
 /*
  * Fix up config after a partly-completed split.  This is done with the
  * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
  * pool have that entry in their config, but only the splitting one contains
  * a list of all the guids of the vdevs that are being split off.
  *
  * This function determines what to do with that list: either rejoin
  * all the disks to the pool, or complete the splitting process.  To attempt
  * the rejoin, each disk that is offlined is marked online again, and
  * we do a reopen() call.  If the vdev label for every disk that was
  * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
  * then we call vdev_split() on each disk, and complete the split.
  *
  * Otherwise we leave the config alone, with all the vdevs in place in
  * the original pool.
  */
 static void
 spa_try_repair(spa_t *spa, nvlist_t *config)
 {
 	uint_t extracted;
 	uint64_t *glist;
 	uint_t i, gcount;
 	nvlist_t *nvl;
 	vdev_t **vd;
 	boolean_t attempt_reopen;
 
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
 		return;
 
 	/* check that the config is complete */
 	if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
 	    &glist, &gcount) != 0)
 		return;
 
 	vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
 
 	/* attempt to online all the vdevs & validate */
 	attempt_reopen = B_TRUE;
 	for (i = 0; i < gcount; i++) {
 		if (glist[i] == 0)	/* vdev is hole */
 			continue;
 
 		vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
 		if (vd[i] == NULL) {
 			/*
 			 * Don't bother attempting to reopen the disks;
 			 * just do the split.
 			 */
 			attempt_reopen = B_FALSE;
 		} else {
 			/* attempt to re-online it */
 			vd[i]->vdev_offline = B_FALSE;
 		}
 	}
 
 	if (attempt_reopen) {
 		vdev_reopen(spa->spa_root_vdev);
 
 		/* check each device to see what state it's in */
 		for (extracted = 0, i = 0; i < gcount; i++) {
 			if (vd[i] != NULL &&
 			    vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
 				break;
 			++extracted;
 		}
 	}
 
 	/*
 	 * If every disk has been moved to the new pool, or if we never
 	 * even attempted to look at them, then we split them off for
 	 * good.
 	 */
 	if (!attempt_reopen || gcount == extracted) {
 		for (i = 0; i < gcount; i++)
 			if (vd[i] != NULL)
 				vdev_split(vd[i]);
 		vdev_reopen(spa->spa_root_vdev);
 	}
 
 	kmem_free(vd, gcount * sizeof (vdev_t *));
 }
 
 static int
 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
 {
 	const char *ereport = FM_EREPORT_ZFS_POOL;
 	int error;
 
 	spa->spa_load_state = state;
 	(void) spa_import_progress_set_state(spa_guid(spa),
 	    spa_load_state(spa));
 	spa_import_progress_set_notes(spa, "spa_load()");
 
 	gethrestime(&spa->spa_loaded_ts);
 	error = spa_load_impl(spa, type, &ereport);
 
 	/*
 	 * Don't count references from objsets that are already closed
 	 * and are making their way through the eviction process.
 	 */
 	spa_evicting_os_wait(spa);
 	spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
 	if (error) {
 		if (error != EEXIST) {
 			spa->spa_loaded_ts.tv_sec = 0;
 			spa->spa_loaded_ts.tv_nsec = 0;
 		}
 		if (error != EBADF) {
 			(void) zfs_ereport_post(ereport, spa,
 			    NULL, NULL, NULL, 0);
 		}
 	}
 	spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
 	spa->spa_ena = 0;
 
 	(void) spa_import_progress_set_state(spa_guid(spa),
 	    spa_load_state(spa));
 
 	return (error);
 }
 
 #ifdef ZFS_DEBUG
 /*
  * Count the number of per-vdev ZAPs associated with all of the vdevs in the
  * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
  * spa's per-vdev ZAP list.
  */
 static uint64_t
 vdev_count_verify_zaps(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	uint64_t total = 0;
 
 	if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2) &&
 	    vd->vdev_root_zap != 0) {
 		total++;
 		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps, vd->vdev_root_zap));
 	}
 	if (vd->vdev_top_zap != 0) {
 		total++;
 		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps, vd->vdev_top_zap));
 	}
 	if (vd->vdev_leaf_zap != 0) {
 		total++;
 		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
 	}
 
 	for (uint64_t i = 0; i < vd->vdev_children; i++) {
 		total += vdev_count_verify_zaps(vd->vdev_child[i]);
 	}
 
 	return (total);
 }
 #else
 #define	vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0)
 #endif
 
 /*
  * Determine whether the activity check is required.
  */
 static boolean_t
 spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
     nvlist_t *config)
 {
 	uint64_t state = 0;
 	uint64_t hostid = 0;
 	uint64_t tryconfig_txg = 0;
 	uint64_t tryconfig_timestamp = 0;
 	uint16_t tryconfig_mmp_seq = 0;
 	nvlist_t *nvinfo;
 
 	if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
 		nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
 		(void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG,
 		    &tryconfig_txg);
 		(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
 		    &tryconfig_timestamp);
 		(void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ,
 		    &tryconfig_mmp_seq);
 	}
 
 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state);
 
 	/*
 	 * Disable the MMP activity check - This is used by zdb which
 	 * is intended to be used on potentially active pools.
 	 */
 	if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP)
 		return (B_FALSE);
 
 	/*
 	 * Skip the activity check when the MMP feature is disabled.
 	 */
 	if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0)
 		return (B_FALSE);
 
 	/*
 	 * If the tryconfig_ values are nonzero, they are the results of an
 	 * earlier tryimport.  If they all match the uberblock we just found,
 	 * then the pool has not changed and we return false so we do not test
 	 * a second time.
 	 */
 	if (tryconfig_txg && tryconfig_txg == ub->ub_txg &&
 	    tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp &&
 	    tryconfig_mmp_seq && tryconfig_mmp_seq ==
 	    (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0))
 		return (B_FALSE);
 
 	/*
 	 * Allow the activity check to be skipped when importing the pool
 	 * on the same host which last imported it.  Since the hostid from
 	 * configuration may be stale use the one read from the label.
 	 */
 	if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID))
 		hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID);
 
 	if (hostid == spa_get_hostid(spa))
 		return (B_FALSE);
 
 	/*
 	 * Skip the activity test when the pool was cleanly exported.
 	 */
 	if (state != POOL_STATE_ACTIVE)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Nanoseconds the activity check must watch for changes on-disk.
  */
 static uint64_t
 spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
 {
 	uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1);
 	uint64_t multihost_interval = MSEC2NSEC(
 	    MMP_INTERVAL_OK(zfs_multihost_interval));
 	uint64_t import_delay = MAX(NANOSEC, import_intervals *
 	    multihost_interval);
 
 	/*
 	 * Local tunables determine a minimum duration except for the case
 	 * where we know when the remote host will suspend the pool if MMP
 	 * writes do not land.
 	 *
 	 * See Big Theory comment at the top of mmp.c for the reasoning behind
 	 * these cases and times.
 	 */
 
 	ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100);
 
 	if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
 	    MMP_FAIL_INT(ub) > 0) {
 
 		/* MMP on remote host will suspend pool after failed writes */
 		import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) *
 		    MMP_IMPORT_SAFETY_FACTOR / 100;
 
 		zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp "
 		    "mmp_fails=%llu ub_mmp mmp_interval=%llu "
 		    "import_intervals=%llu", (u_longlong_t)import_delay,
 		    (u_longlong_t)MMP_FAIL_INT(ub),
 		    (u_longlong_t)MMP_INTERVAL(ub),
 		    (u_longlong_t)import_intervals);
 
 	} else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
 	    MMP_FAIL_INT(ub) == 0) {
 
 		/* MMP on remote host will never suspend pool */
 		import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) +
 		    ub->ub_mmp_delay) * import_intervals);
 
 		zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp "
 		    "mmp_interval=%llu ub_mmp_delay=%llu "
 		    "import_intervals=%llu", (u_longlong_t)import_delay,
 		    (u_longlong_t)MMP_INTERVAL(ub),
 		    (u_longlong_t)ub->ub_mmp_delay,
 		    (u_longlong_t)import_intervals);
 
 	} else if (MMP_VALID(ub)) {
 		/*
 		 * zfs-0.7 compatibility case
 		 */
 
 		import_delay = MAX(import_delay, (multihost_interval +
 		    ub->ub_mmp_delay) * import_intervals);
 
 		zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu "
 		    "import_intervals=%llu leaves=%u",
 		    (u_longlong_t)import_delay,
 		    (u_longlong_t)ub->ub_mmp_delay,
 		    (u_longlong_t)import_intervals,
 		    vdev_count_leaves(spa));
 	} else {
 		/* Using local tunings is the only reasonable option */
 		zfs_dbgmsg("pool last imported on non-MMP aware "
 		    "host using import_delay=%llu multihost_interval=%llu "
 		    "import_intervals=%llu", (u_longlong_t)import_delay,
 		    (u_longlong_t)multihost_interval,
 		    (u_longlong_t)import_intervals);
 	}
 
 	return (import_delay);
 }
 
 /*
  * Perform the import activity check.  If the user canceled the import or
  * we detected activity then fail.
  */
 static int
 spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
 {
 	uint64_t txg = ub->ub_txg;
 	uint64_t timestamp = ub->ub_timestamp;
 	uint64_t mmp_config = ub->ub_mmp_config;
 	uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
 	uint64_t import_delay;
 	hrtime_t import_expire, now;
 	nvlist_t *mmp_label = NULL;
 	vdev_t *rvd = spa->spa_root_vdev;
 	kcondvar_t cv;
 	kmutex_t mtx;
 	int error = 0;
 
 	cv_init(&cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL);
 	mutex_enter(&mtx);
 
 	/*
 	 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed
 	 * during the earlier tryimport.  If the txg recorded there is 0 then
 	 * the pool is known to be active on another host.
 	 *
 	 * Otherwise, the pool might be in use on another host.  Check for
 	 * changes in the uberblocks on disk if necessary.
 	 */
 	if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
 		nvlist_t *nvinfo = fnvlist_lookup_nvlist(config,
 		    ZPOOL_CONFIG_LOAD_INFO);
 
 		if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) &&
 		    fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) {
 			vdev_uberblock_load(rvd, ub, &mmp_label);
 			error = SET_ERROR(EREMOTEIO);
 			goto out;
 		}
 	}
 
 	import_delay = spa_activity_check_duration(spa, ub);
 
 	/* Add a small random factor in case of simultaneous imports (0-25%) */
 	import_delay += import_delay * random_in_range(250) / 1000;
 
 	import_expire = gethrtime() + import_delay;
 
 	spa_import_progress_set_notes(spa, "Checking MMP activity, waiting "
 	    "%llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
 
 	int interations = 0;
 	while ((now = gethrtime()) < import_expire) {
 		if (interations++ % 30 == 0) {
 			spa_import_progress_set_notes(spa, "Checking MMP "
 			    "activity, %llu ms remaining",
 			    (u_longlong_t)NSEC2MSEC(import_expire - now));
 		}
 
 		(void) spa_import_progress_set_mmp_check(spa_guid(spa),
 		    NSEC2SEC(import_expire - gethrtime()));
 
 		vdev_uberblock_load(rvd, ub, &mmp_label);
 
 		if (txg != ub->ub_txg || timestamp != ub->ub_timestamp ||
 		    mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) {
 			zfs_dbgmsg("multihost activity detected "
 			    "txg %llu ub_txg  %llu "
 			    "timestamp %llu ub_timestamp  %llu "
 			    "mmp_config %#llx ub_mmp_config %#llx",
 			    (u_longlong_t)txg, (u_longlong_t)ub->ub_txg,
 			    (u_longlong_t)timestamp,
 			    (u_longlong_t)ub->ub_timestamp,
 			    (u_longlong_t)mmp_config,
 			    (u_longlong_t)ub->ub_mmp_config);
 
 			error = SET_ERROR(EREMOTEIO);
 			break;
 		}
 
 		if (mmp_label) {
 			nvlist_free(mmp_label);
 			mmp_label = NULL;
 		}
 
 		error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz);
 		if (error != -1) {
 			error = SET_ERROR(EINTR);
 			break;
 		}
 		error = 0;
 	}
 
 out:
 	mutex_exit(&mtx);
 	mutex_destroy(&mtx);
 	cv_destroy(&cv);
 
 	/*
 	 * If the pool is determined to be active store the status in the
 	 * spa->spa_load_info nvlist.  If the remote hostname or hostid are
 	 * available from configuration read from disk store them as well.
 	 * This allows 'zpool import' to generate a more useful message.
 	 *
 	 * ZPOOL_CONFIG_MMP_STATE    - observed pool status (mandatory)
 	 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool
 	 * ZPOOL_CONFIG_MMP_HOSTID   - hostid from the active pool
 	 */
 	if (error == EREMOTEIO) {
 		const char *hostname = "<unknown>";
 		uint64_t hostid = 0;
 
 		if (mmp_label) {
 			if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) {
 				hostname = fnvlist_lookup_string(mmp_label,
 				    ZPOOL_CONFIG_HOSTNAME);
 				fnvlist_add_string(spa->spa_load_info,
 				    ZPOOL_CONFIG_MMP_HOSTNAME, hostname);
 			}
 
 			if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) {
 				hostid = fnvlist_lookup_uint64(mmp_label,
 				    ZPOOL_CONFIG_HOSTID);
 				fnvlist_add_uint64(spa->spa_load_info,
 				    ZPOOL_CONFIG_MMP_HOSTID, hostid);
 			}
 		}
 
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE);
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_MMP_TXG, 0);
 
 		error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO);
 	}
 
 	if (mmp_label)
 		nvlist_free(mmp_label);
 
 	return (error);
 }
 
 static int
 spa_verify_host(spa_t *spa, nvlist_t *mos_config)
 {
 	uint64_t hostid;
 	const char *hostname;
 	uint64_t myhostid = 0;
 
 	if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
 	    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
 		hostname = fnvlist_lookup_string(mos_config,
 		    ZPOOL_CONFIG_HOSTNAME);
 
 		myhostid = zone_get_hostid(NULL);
 
 		if (hostid != 0 && myhostid != 0 && hostid != myhostid) {
 			cmn_err(CE_WARN, "pool '%s' could not be "
 			    "loaded as it was last accessed by "
 			    "another system (host: %s hostid: 0x%llx). "
 			    "See: https://openzfs.github.io/openzfs-docs/msg/"
 			    "ZFS-8000-EY",
 			    spa_name(spa), hostname, (u_longlong_t)hostid);
 			spa_load_failed(spa, "hostid verification failed: pool "
 			    "last accessed by host: %s (hostid: 0x%llx)",
 			    hostname, (u_longlong_t)hostid);
 			return (SET_ERROR(EBADF));
 		}
 	}
 
 	return (0);
 }
 
 static int
 spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
 {
 	int error = 0;
 	nvlist_t *nvtree, *nvl, *config = spa->spa_config;
 	int parse;
 	vdev_t *rvd;
 	uint64_t pool_guid;
 	const char *comment;
 	const char *compatibility;
 
 	/*
 	 * Versioning wasn't explicitly added to the label until later, so if
 	 * it's not present treat it as the initial version.
 	 */
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
 	    &spa->spa_ubsync.ub_version) != 0)
 		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
 
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
 		spa_load_failed(spa, "invalid config provided: '%s' missing",
 		    ZPOOL_CONFIG_POOL_GUID);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * If we are doing an import, ensure that the pool is not already
 	 * imported by checking if its pool guid already exists in the
 	 * spa namespace.
 	 *
 	 * The only case that we allow an already imported pool to be
 	 * imported again, is when the pool is checkpointed and we want to
 	 * look at its checkpointed state from userland tools like zdb.
 	 */
 #ifdef _KERNEL
 	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
 	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
 	    spa_guid_exists(pool_guid, 0)) {
 #else
 	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
 	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
 	    spa_guid_exists(pool_guid, 0) &&
 	    !spa_importing_readonly_checkpoint(spa)) {
 #endif
 		spa_load_failed(spa, "a pool with guid %llu is already open",
 		    (u_longlong_t)pool_guid);
 		return (SET_ERROR(EEXIST));
 	}
 
 	spa->spa_config_guid = pool_guid;
 
 	nvlist_free(spa->spa_load_info);
 	spa->spa_load_info = fnvlist_alloc();
 
 	ASSERT(spa->spa_comment == NULL);
 	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
 		spa->spa_comment = spa_strdup(comment);
 
 	ASSERT(spa->spa_compatibility == NULL);
 	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY,
 	    &compatibility) == 0)
 		spa->spa_compatibility = spa_strdup(compatibility);
 
 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
 	    &spa->spa_config_txg);
 
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
 		spa->spa_config_splitting = fnvlist_dup(nvl);
 
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
 		spa_load_failed(spa, "invalid config provided: '%s' missing",
 		    ZPOOL_CONFIG_VDEV_TREE);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Create "The Godfather" zio to hold all async IOs
 	 */
 	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
 	    KM_SLEEP);
 	for (int i = 0; i < max_ncpus; i++) {
 		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 		    ZIO_FLAG_GODFATHER);
 	}
 
 	/*
 	 * Parse the configuration into a vdev tree.  We explicitly set the
 	 * value that will be returned by spa_version() since parsing the
 	 * configuration requires knowing the version number.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	parse = (type == SPA_IMPORT_EXISTING ?
 	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
 	error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (error != 0) {
 		spa_load_failed(spa, "unable to parse config [error=%d]",
 		    error);
 		return (error);
 	}
 
 	ASSERT(spa->spa_root_vdev == rvd);
 	ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
 	ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
 
 	if (type != SPA_IMPORT_ASSEMBLE) {
 		ASSERT(spa_guid(spa) == pool_guid);
 	}
 
 	return (0);
 }
 
 /*
  * Recursively open all vdevs in the vdev tree. This function is called twice:
  * first with the untrusted config, then with the trusted config.
  */
 static int
 spa_ld_open_vdevs(spa_t *spa)
 {
 	int error = 0;
 
 	/*
 	 * spa_missing_tvds_allowed defines how many top-level vdevs can be
 	 * missing/unopenable for the root vdev to be still considered openable.
 	 */
 	if (spa->spa_trust_config) {
 		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
 	} else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) {
 		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile;
 	} else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) {
 		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan;
 	} else {
 		spa->spa_missing_tvds_allowed = 0;
 	}
 
 	spa->spa_missing_tvds_allowed =
 	    MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = vdev_open(spa->spa_root_vdev);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (spa->spa_missing_tvds != 0) {
 		spa_load_note(spa, "vdev tree has %lld missing top-level "
 		    "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
 		if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) {
 			/*
 			 * Although theoretically we could allow users to open
 			 * incomplete pools in RW mode, we'd need to add a lot
 			 * of extra logic (e.g. adjust pool space to account
 			 * for missing vdevs).
 			 * This limitation also prevents users from accidentally
 			 * opening the pool in RW mode during data recovery and
 			 * damaging it further.
 			 */
 			spa_load_note(spa, "pools with missing top-level "
 			    "vdevs can only be opened in read-only mode.");
 			error = SET_ERROR(ENXIO);
 		} else {
 			spa_load_note(spa, "current settings allow for maximum "
 			    "%lld missing top-level vdevs at this stage.",
 			    (u_longlong_t)spa->spa_missing_tvds_allowed);
 		}
 	}
 	if (error != 0) {
 		spa_load_failed(spa, "unable to open vdev tree [error=%d]",
 		    error);
 	}
 	if (spa->spa_missing_tvds != 0 || error != 0)
 		vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2);
 
 	return (error);
 }
 
 /*
  * We need to validate the vdev labels against the configuration that
  * we have in hand. This function is called twice: first with an untrusted
  * config, then with a trusted config. The validation is more strict when the
  * config is trusted.
  */
 static int
 spa_ld_validate_vdevs(spa_t *spa)
 {
 	int error = 0;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = vdev_validate(rvd);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (error != 0) {
 		spa_load_failed(spa, "vdev_validate failed [error=%d]", error);
 		return (error);
 	}
 
 	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
 		spa_load_failed(spa, "cannot open vdev tree after invalidating "
 		    "some vdevs");
 		vdev_dbgmsg_print_tree(rvd, 2);
 		return (SET_ERROR(ENXIO));
 	}
 
 	return (0);
 }
 
 static void
 spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub)
 {
 	spa->spa_state = POOL_STATE_ACTIVE;
 	spa->spa_ubsync = spa->spa_uberblock;
 	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
 	    TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
 	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
 	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
 	spa->spa_claim_max_txg = spa->spa_first_txg;
 	spa->spa_prev_software_version = ub->ub_software_version;
 }
 
 static int
 spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	nvlist_t *label;
 	uberblock_t *ub = &spa->spa_uberblock;
 	boolean_t activity_check = B_FALSE;
 
 	/*
 	 * If we are opening the checkpointed state of the pool by
 	 * rewinding to it, at this point we will have written the
 	 * checkpointed uberblock to the vdev labels, so searching
 	 * the labels will find the right uberblock.  However, if
 	 * we are opening the checkpointed state read-only, we have
 	 * not modified the labels. Therefore, we must ignore the
 	 * labels and continue using the spa_uberblock that was set
 	 * by spa_ld_checkpoint_rewind.
 	 *
 	 * Note that it would be fine to ignore the labels when
 	 * rewinding (opening writeable) as well. However, if we
 	 * crash just after writing the labels, we will end up
 	 * searching the labels. Doing so in the common case means
 	 * that this code path gets exercised normally, rather than
 	 * just in the edge case.
 	 */
 	if (ub->ub_checkpoint_txg != 0 &&
 	    spa_importing_readonly_checkpoint(spa)) {
 		spa_ld_select_uberblock_done(spa, ub);
 		return (0);
 	}
 
 	/*
 	 * Find the best uberblock.
 	 */
 	vdev_uberblock_load(rvd, ub, &label);
 
 	/*
 	 * If we weren't able to find a single valid uberblock, return failure.
 	 */
 	if (ub->ub_txg == 0) {
 		nvlist_free(label);
 		spa_load_failed(spa, "no valid uberblock found");
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
 	}
 
 	if (spa->spa_load_max_txg != UINT64_MAX) {
 		(void) spa_import_progress_set_max_txg(spa_guid(spa),
 		    (u_longlong_t)spa->spa_load_max_txg);
 	}
 	spa_load_note(spa, "using uberblock with txg=%llu",
 	    (u_longlong_t)ub->ub_txg);
 	if (ub->ub_raidz_reflow_info != 0) {
 		spa_load_note(spa, "uberblock raidz_reflow_info: "
 		    "state=%u offset=%llu",
 		    (int)RRSS_GET_STATE(ub),
 		    (u_longlong_t)RRSS_GET_OFFSET(ub));
 	}
 
 
 	/*
 	 * For pools which have the multihost property on determine if the
 	 * pool is truly inactive and can be safely imported.  Prevent
 	 * hosts which don't have a hostid set from importing the pool.
 	 */
 	activity_check = spa_activity_check_required(spa, ub, label,
 	    spa->spa_config);
 	if (activity_check) {
 		if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
 		    spa_get_hostid(spa) == 0) {
 			nvlist_free(label);
 			fnvlist_add_uint64(spa->spa_load_info,
 			    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
 			return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
 		}
 
 		int error = spa_activity_check(spa, ub, spa->spa_config);
 		if (error) {
 			nvlist_free(label);
 			return (error);
 		}
 
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE);
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_MMP_TXG, ub->ub_txg);
 		fnvlist_add_uint16(spa->spa_load_info,
 		    ZPOOL_CONFIG_MMP_SEQ,
 		    (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0));
 	}
 
 	/*
 	 * If the pool has an unsupported version we can't open it.
 	 */
 	if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
 		nvlist_free(label);
 		spa_load_failed(spa, "version %llu is not supported",
 		    (u_longlong_t)ub->ub_version);
 		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
 	}
 
 	if (ub->ub_version >= SPA_VERSION_FEATURES) {
 		nvlist_t *features;
 
 		/*
 		 * If we weren't able to find what's necessary for reading the
 		 * MOS in the label, return failure.
 		 */
 		if (label == NULL) {
 			spa_load_failed(spa, "label config unavailable");
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
 			    ENXIO));
 		}
 
 		if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ,
 		    &features) != 0) {
 			nvlist_free(label);
 			spa_load_failed(spa, "invalid label: '%s' missing",
 			    ZPOOL_CONFIG_FEATURES_FOR_READ);
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
 			    ENXIO));
 		}
 
 		/*
 		 * Update our in-core representation with the definitive values
 		 * from the label.
 		 */
 		nvlist_free(spa->spa_label_features);
 		spa->spa_label_features = fnvlist_dup(features);
 	}
 
 	nvlist_free(label);
 
 	/*
 	 * Look through entries in the label nvlist's features_for_read. If
 	 * there is a feature listed there which we don't understand then we
 	 * cannot open a pool.
 	 */
 	if (ub->ub_version >= SPA_VERSION_FEATURES) {
 		nvlist_t *unsup_feat;
 
 		unsup_feat = fnvlist_alloc();
 
 		for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
 		    NULL); nvp != NULL;
 		    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
 			if (!zfeature_is_supported(nvpair_name(nvp))) {
 				fnvlist_add_string(unsup_feat,
 				    nvpair_name(nvp), "");
 			}
 		}
 
 		if (!nvlist_empty(unsup_feat)) {
 			fnvlist_add_nvlist(spa->spa_load_info,
 			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
 			nvlist_free(unsup_feat);
 			spa_load_failed(spa, "some features are unsupported");
 			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
 			    ENOTSUP));
 		}
 
 		nvlist_free(unsup_feat);
 	}
 
 	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_try_repair(spa, spa->spa_config);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		nvlist_free(spa->spa_config_splitting);
 		spa->spa_config_splitting = NULL;
 	}
 
 	/*
 	 * Initialize internal SPA structures.
 	 */
 	spa_ld_select_uberblock_done(spa, ub);
 
 	return (0);
 }
 
 static int
 spa_ld_open_rootbp(spa_t *spa)
 {
 	int error = 0;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
 	if (error != 0) {
 		spa_load_failed(spa, "unable to open rootbp in dsl_pool_init "
 		    "[error=%d]", error);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
 
 	return (0);
 }
 
 static int
 spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
     boolean_t reloading)
 {
 	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
 	nvlist_t *nv, *mos_config, *policy;
 	int error = 0, copy_error;
 	uint64_t healthy_tvds, healthy_tvds_mos;
 	uint64_t mos_config_txg;
 
 	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
 	    != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * If we're assembling a pool from a split, the config provided is
 	 * already trusted so there is nothing to do.
 	 */
 	if (type == SPA_IMPORT_ASSEMBLE)
 		return (0);
 
 	healthy_tvds = spa_healthy_core_tvds(spa);
 
 	if (load_nvlist(spa, spa->spa_config_object, &mos_config)
 	    != 0) {
 		spa_load_failed(spa, "unable to retrieve MOS config");
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	/*
 	 * If we are doing an open, pool owner wasn't verified yet, thus do
 	 * the verification here.
 	 */
 	if (spa->spa_load_state == SPA_LOAD_OPEN) {
 		error = spa_verify_host(spa, mos_config);
 		if (error != 0) {
 			nvlist_free(mos_config);
 			return (error);
 		}
 	}
 
 	nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	/*
 	 * Build a new vdev tree from the trusted config
 	 */
 	error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD);
 	if (error != 0) {
 		nvlist_free(mos_config);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa_load_failed(spa, "spa_config_parse failed [error=%d]",
 		    error);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
 	}
 
 	/*
 	 * Vdev paths in the MOS may be obsolete. If the untrusted config was
 	 * obtained by scanning /dev/dsk, then it will have the right vdev
 	 * paths. We update the trusted MOS config with this information.
 	 * We first try to copy the paths with vdev_copy_path_strict, which
 	 * succeeds only when both configs have exactly the same vdev tree.
 	 * If that fails, we fall back to a more flexible method that has a
 	 * best effort policy.
 	 */
 	copy_error = vdev_copy_path_strict(rvd, mrvd);
 	if (copy_error != 0 || spa_load_print_vdev_tree) {
 		spa_load_note(spa, "provided vdev tree:");
 		vdev_dbgmsg_print_tree(rvd, 2);
 		spa_load_note(spa, "MOS vdev tree:");
 		vdev_dbgmsg_print_tree(mrvd, 2);
 	}
 	if (copy_error != 0) {
 		spa_load_note(spa, "vdev_copy_path_strict failed, falling "
 		    "back to vdev_copy_path_relaxed");
 		vdev_copy_path_relaxed(rvd, mrvd);
 	}
 
 	vdev_close(rvd);
 	vdev_free(rvd);
 	spa->spa_root_vdev = mrvd;
 	rvd = mrvd;
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/*
 	 * If 'zpool import' used a cached config, then the on-disk hostid and
 	 * hostname may be different to the cached config in ways that should
 	 * prevent import.  Userspace can't discover this without a scan, but
 	 * we know, so we add these values to LOAD_INFO so the caller can know
 	 * the difference.
 	 *
 	 * Note that we have to do this before the config is regenerated,
 	 * because the new config will have the hostid and hostname for this
 	 * host, in readiness for import.
 	 */
 	if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID))
 		fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID,
 		    fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID));
 	if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME))
 		fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME,
 		    fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME));
 
 	/*
 	 * We will use spa_config if we decide to reload the spa or if spa_load
 	 * fails and we rewind. We must thus regenerate the config using the
 	 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to
 	 * pass settings on how to load the pool and is not stored in the MOS.
 	 * We copy it over to our new, trusted config.
 	 */
 	mos_config_txg = fnvlist_lookup_uint64(mos_config,
 	    ZPOOL_CONFIG_POOL_TXG);
 	nvlist_free(mos_config);
 	mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
 	if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY,
 	    &policy) == 0)
 		fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy);
 	spa_config_set(spa, mos_config);
 	spa->spa_config_source = SPA_CONFIG_SRC_MOS;
 
 	/*
 	 * Now that we got the config from the MOS, we should be more strict
 	 * in checking blkptrs and can make assumptions about the consistency
 	 * of the vdev tree. spa_trust_config must be set to true before opening
 	 * vdevs in order for them to be writeable.
 	 */
 	spa->spa_trust_config = B_TRUE;
 
 	/*
 	 * Open and validate the new vdev tree
 	 */
 	error = spa_ld_open_vdevs(spa);
 	if (error != 0)
 		return (error);
 
 	error = spa_ld_validate_vdevs(spa);
 	if (error != 0)
 		return (error);
 
 	if (copy_error != 0 || spa_load_print_vdev_tree) {
 		spa_load_note(spa, "final vdev tree:");
 		vdev_dbgmsg_print_tree(rvd, 2);
 	}
 
 	if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
 	    !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
 		/*
 		 * Sanity check to make sure that we are indeed loading the
 		 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
 		 * in the config provided and they happened to be the only ones
 		 * to have the latest uberblock, we could involuntarily perform
 		 * an extreme rewind.
 		 */
 		healthy_tvds_mos = spa_healthy_core_tvds(spa);
 		if (healthy_tvds_mos - healthy_tvds >=
 		    SPA_SYNC_MIN_VDEVS) {
 			spa_load_note(spa, "config provided misses too many "
 			    "top-level vdevs compared to MOS (%lld vs %lld). ",
 			    (u_longlong_t)healthy_tvds,
 			    (u_longlong_t)healthy_tvds_mos);
 			spa_load_note(spa, "vdev tree:");
 			vdev_dbgmsg_print_tree(rvd, 2);
 			if (reloading) {
 				spa_load_failed(spa, "config was already "
 				    "provided from MOS. Aborting.");
 				return (spa_vdev_err(rvd,
 				    VDEV_AUX_CORRUPT_DATA, EIO));
 			}
 			spa_load_note(spa, "spa must be reloaded using MOS "
 			    "config");
 			return (SET_ERROR(EAGAIN));
 		}
 	}
 
 	error = spa_check_for_missing_logs(spa);
 	if (error != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
 
 	if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
 		spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
 		    "guid sum (%llu != %llu)",
 		    (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
 		    (u_longlong_t)rvd->vdev_guid_sum);
 		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
 		    ENXIO));
 	}
 
 	return (0);
 }
 
 static int
 spa_ld_open_indirect_vdev_metadata(spa_t *spa)
 {
 	int error = 0;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	/*
 	 * Everything that we read before spa_remove_init() must be stored
 	 * on concreted vdevs.  Therefore we do this as early as possible.
 	 */
 	error = spa_remove_init(spa);
 	if (error != 0) {
 		spa_load_failed(spa, "spa_remove_init failed [error=%d]",
 		    error);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	/*
 	 * Retrieve information needed to condense indirect vdev mappings.
 	 */
 	error = spa_condense_init(spa);
 	if (error != 0) {
 		spa_load_failed(spa, "spa_condense_init failed [error=%d]",
 		    error);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
 	}
 
 	return (0);
 }
 
 static int
 spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
 {
 	int error = 0;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	if (spa_version(spa) >= SPA_VERSION_FEATURES) {
 		boolean_t missing_feat_read = B_FALSE;
 		nvlist_t *unsup_feat, *enabled_feat;
 
 		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
 		    &spa->spa_feat_for_read_obj, B_TRUE) != 0) {
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}
 
 		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
 		    &spa->spa_feat_for_write_obj, B_TRUE) != 0) {
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}
 
 		if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
 		    &spa->spa_feat_desc_obj, B_TRUE) != 0) {
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}
 
 		enabled_feat = fnvlist_alloc();
 		unsup_feat = fnvlist_alloc();
 
 		if (!spa_features_check(spa, B_FALSE,
 		    unsup_feat, enabled_feat))
 			missing_feat_read = B_TRUE;
 
 		if (spa_writeable(spa) ||
 		    spa->spa_load_state == SPA_LOAD_TRYIMPORT) {
 			if (!spa_features_check(spa, B_TRUE,
 			    unsup_feat, enabled_feat)) {
 				*missing_feat_writep = B_TRUE;
 			}
 		}
 
 		fnvlist_add_nvlist(spa->spa_load_info,
 		    ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
 
 		if (!nvlist_empty(unsup_feat)) {
 			fnvlist_add_nvlist(spa->spa_load_info,
 			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
 		}
 
 		fnvlist_free(enabled_feat);
 		fnvlist_free(unsup_feat);
 
 		if (!missing_feat_read) {
 			fnvlist_add_boolean(spa->spa_load_info,
 			    ZPOOL_CONFIG_CAN_RDONLY);
 		}
 
 		/*
 		 * If the state is SPA_LOAD_TRYIMPORT, our objective is
 		 * twofold: to determine whether the pool is available for
 		 * import in read-write mode and (if it is not) whether the
 		 * pool is available for import in read-only mode. If the pool
 		 * is available for import in read-write mode, it is displayed
 		 * as available in userland; if it is not available for import
 		 * in read-only mode, it is displayed as unavailable in
 		 * userland. If the pool is available for import in read-only
 		 * mode but not read-write mode, it is displayed as unavailable
 		 * in userland with a special note that the pool is actually
 		 * available for open in read-only mode.
 		 *
 		 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
 		 * missing a feature for write, we must first determine whether
 		 * the pool can be opened read-only before returning to
 		 * userland in order to know whether to display the
 		 * abovementioned note.
 		 */
 		if (missing_feat_read || (*missing_feat_writep &&
 		    spa_writeable(spa))) {
 			spa_load_failed(spa, "pool uses unsupported features");
 			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
 			    ENOTSUP));
 		}
 
 		/*
 		 * Load refcounts for ZFS features from disk into an in-memory
 		 * cache during SPA initialization.
 		 */
 		for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
 			uint64_t refcount;
 
 			error = feature_get_refcount_from_disk(spa,
 			    &spa_feature_table[i], &refcount);
 			if (error == 0) {
 				spa->spa_feat_refcount_cache[i] = refcount;
 			} else if (error == ENOTSUP) {
 				spa->spa_feat_refcount_cache[i] =
 				    SPA_FEATURE_DISABLED;
 			} else {
 				spa_load_failed(spa, "error getting refcount "
 				    "for feature %s [error=%d]",
 				    spa_feature_table[i].fi_guid, error);
 				return (spa_vdev_err(rvd,
 				    VDEV_AUX_CORRUPT_DATA, EIO));
 			}
 		}
 	}
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
 		if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
 		    &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	/*
 	 * Encryption was added before bookmark_v2, even though bookmark_v2
 	 * is now a dependency. If this pool has encryption enabled without
 	 * bookmark_v2, trigger an errata message.
 	 */
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) &&
 	    !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) {
 		spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION;
 	}
 
 	return (0);
 }
 
 static int
 spa_ld_load_special_directories(spa_t *spa)
 {
 	int error = 0;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	spa->spa_is_initializing = B_TRUE;
 	error = dsl_pool_open(spa->spa_dsl_pool);
 	spa->spa_is_initializing = B_FALSE;
 	if (error != 0) {
 		spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	return (0);
 }
 
 static int
 spa_ld_get_props(spa_t *spa)
 {
 	int error = 0;
 	uint64_t obj;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	/* Grab the checksum salt from the MOS. */
 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_CHECKSUM_SALT, 1,
 	    sizeof (spa->spa_cksum_salt.zcs_bytes),
 	    spa->spa_cksum_salt.zcs_bytes);
 	if (error == ENOENT) {
 		/* Generate a new salt for subsequent use */
 		(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
 		    sizeof (spa->spa_cksum_salt.zcs_bytes));
 	} else if (error != 0) {
 		spa_load_failed(spa, "unable to retrieve checksum salt from "
 		    "MOS [error=%d]", error);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
 	if (error != 0) {
 		spa_load_failed(spa, "error opening deferred-frees bpobj "
 		    "[error=%d]", error);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	/*
 	 * Load the bit that tells us to use the new accounting function
 	 * (raid-z deflation).  If we have an older pool, this will not
 	 * be present.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
 	    &spa->spa_creation_version, B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Load the persistent error log.  If we have an older pool, this will
 	 * not be present.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last,
 	    B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
 	    &spa->spa_errlog_scrub, B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Load the livelist deletion field. If a livelist is queued for
 	 * deletion, indicate that in the spa
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES,
 	    &spa->spa_livelists_to_delete, B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Load the history object.  If we have an older pool, this
 	 * will not be present.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Load the per-vdev ZAP map. If we have an older pool, this will not
 	 * be present; in this case, defer its creation to a later time to
 	 * avoid dirtying the MOS this early / out of sync context. See
 	 * spa_sync_config_object.
 	 */
 
 	/* The sentinel is only available in the MOS config. */
 	nvlist_t *mos_config;
 	if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
 		spa_load_failed(spa, "unable to retrieve MOS config");
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
 	    &spa->spa_all_vdev_zaps, B_FALSE);
 
 	if (error == ENOENT) {
 		VERIFY(!nvlist_exists(mos_config,
 		    ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
 		spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
 		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
 	} else if (error != 0) {
 		nvlist_free(mos_config);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	} else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
 		/*
 		 * An older version of ZFS overwrote the sentinel value, so
 		 * we have orphaned per-vdev ZAPs in the MOS. Defer their
 		 * destruction to later; see spa_sync_config_object.
 		 */
 		spa->spa_avz_action = AVZ_ACTION_DESTROY;
 		/*
 		 * We're assuming that no vdevs have had their ZAPs created
 		 * before this. Better be sure of it.
 		 */
 		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
 	}
 	nvlist_free(mos_config);
 
 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
 
 	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object,
 	    B_FALSE);
 	if (error && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	if (error == 0) {
 		uint64_t autoreplace = 0;
 
 		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
 		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
 		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
 		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
 		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
 		spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
 		spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim);
 		spa->spa_autoreplace = (autoreplace != 0);
 	}
 
 	/*
 	 * If we are importing a pool with missing top-level vdevs,
 	 * we enforce that the pool doesn't panic or get suspended on
 	 * error since the likelihood of missing data is extremely high.
 	 */
 	if (spa->spa_missing_tvds > 0 &&
 	    spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
 	    spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
 		spa_load_note(spa, "forcing failmode to 'continue' "
 		    "as some top level vdevs are missing");
 		spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
 	}
 
 	return (0);
 }
 
 static int
 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type)
 {
 	int error = 0;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	/*
 	 * If we're assembling the pool from the split-off vdevs of
 	 * an existing pool, we don't want to attach the spares & cache
 	 * devices.
 	 */
 
 	/*
 	 * Load any hot spares for this pool.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object,
 	    B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
 		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
 		if (load_nvlist(spa, spa->spa_spares.sav_object,
 		    &spa->spa_spares.sav_config) != 0) {
 			spa_load_failed(spa, "error loading spares nvlist");
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}
 
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_spares(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 	} else if (error == 0) {
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * Load any level 2 ARC devices for this pool.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
 	    &spa->spa_l2cache.sav_object, B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
 		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
 		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
 		    &spa->spa_l2cache.sav_config) != 0) {
 			spa_load_failed(spa, "error loading l2cache nvlist");
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}
 
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_l2cache(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 	} else if (error == 0) {
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	return (0);
 }
 
 static int
 spa_ld_load_vdev_metadata(spa_t *spa)
 {
 	int error = 0;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	/*
 	 * If the 'multihost' property is set, then never allow a pool to
 	 * be imported when the system hostid is zero.  The exception to
 	 * this rule is zdb which is always allowed to access pools.
 	 */
 	if (spa_multihost(spa) && spa_get_hostid(spa) == 0 &&
 	    (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) {
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
 		return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
 	}
 
 	/*
 	 * If the 'autoreplace' property is set, then post a resource notifying
 	 * the ZFS DE that it should not issue any faults for unopenable
 	 * devices.  We also iterate over the vdevs, and post a sysevent for any
 	 * unopenable vdevs so that the normal autoreplace handler can take
 	 * over.
 	 */
 	if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
 		spa_check_removed(spa->spa_root_vdev);
 		/*
 		 * For the import case, this is done in spa_import(), because
 		 * at this point we're using the spare definitions from
 		 * the MOS config, not necessarily from the userland config.
 		 */
 		if (spa->spa_load_state != SPA_LOAD_IMPORT) {
 			spa_aux_check_removed(&spa->spa_spares);
 			spa_aux_check_removed(&spa->spa_l2cache);
 		}
 	}
 
 	/*
 	 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc.
 	 */
 	error = vdev_load(rvd);
 	if (error != 0) {
 		spa_load_failed(spa, "vdev_load failed [error=%d]", error);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
 	}
 
 	error = spa_ld_log_spacemaps(spa);
 	if (error != 0) {
 		spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]",
 		    error);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
 	}
 
 	/*
 	 * Propagate the leaf DTLs we just loaded all the way up the vdev tree.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	return (0);
 }
 
 static int
 spa_ld_load_dedup_tables(spa_t *spa)
 {
 	int error = 0;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	error = ddt_load(spa);
 	if (error != 0) {
 		spa_load_failed(spa, "ddt_load failed [error=%d]", error);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	return (0);
 }
 
 static int
 spa_ld_load_brt(spa_t *spa)
 {
 	int error = 0;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	error = brt_load(spa);
 	if (error != 0) {
 		spa_load_failed(spa, "brt_load failed [error=%d]", error);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	return (0);
 }
 
 static int
 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) {
 		boolean_t missing = spa_check_logs(spa);
 		if (missing) {
 			if (spa->spa_missing_tvds != 0) {
 				spa_load_note(spa, "spa_check_logs failed "
 				    "so dropping the logs");
 			} else {
 				*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
 				spa_load_failed(spa, "spa_check_logs failed");
 				return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG,
 				    ENXIO));
 			}
 		}
 	}
 
 	return (0);
 }
 
 static int
 spa_ld_verify_pool_data(spa_t *spa)
 {
 	int error = 0;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	/*
 	 * We've successfully opened the pool, verify that we're ready
 	 * to start pushing transactions.
 	 */
 	if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
 		error = spa_load_verify(spa);
 		if (error != 0) {
 			spa_load_failed(spa, "spa_load_verify failed "
 			    "[error=%d]", error);
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
 			    error));
 		}
 	}
 
 	return (0);
 }
 
 static void
 spa_ld_claim_log_blocks(spa_t *spa)
 {
 	dmu_tx_t *tx;
 	dsl_pool_t *dp = spa_get_dsl(spa);
 
 	/*
 	 * Claim log blocks that haven't been committed yet.
 	 * This must all happen in a single txg.
 	 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
 	 * invoked from zil_claim_log_block()'s i/o done callback.
 	 * Price of rollback is that we abandon the log.
 	 */
 	spa->spa_claiming = B_TRUE;
 
 	tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
 	(void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 	    zil_claim, tx, DS_FIND_CHILDREN);
 	dmu_tx_commit(tx);
 
 	spa->spa_claiming = B_FALSE;
 
 	spa_set_log_state(spa, SPA_LOG_GOOD);
 }
 
 static void
 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
     boolean_t update_config_cache)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	int need_update = B_FALSE;
 
 	/*
 	 * If the config cache is stale, or we have uninitialized
 	 * metaslabs (see spa_vdev_add()), then update the config.
 	 *
 	 * If this is a verbatim import, trust the current
 	 * in-core spa_config and update the disk labels.
 	 */
 	if (update_config_cache || config_cache_txg != spa->spa_config_txg ||
 	    spa->spa_load_state == SPA_LOAD_IMPORT ||
 	    spa->spa_load_state == SPA_LOAD_RECOVER ||
 	    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
 		need_update = B_TRUE;
 
 	for (int c = 0; c < rvd->vdev_children; c++)
 		if (rvd->vdev_child[c]->vdev_ms_array == 0)
 			need_update = B_TRUE;
 
 	/*
 	 * Update the config cache asynchronously in case we're the
 	 * root pool, in which case the config cache isn't writable yet.
 	 */
 	if (need_update)
 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 }
 
 static void
 spa_ld_prepare_for_reload(spa_t *spa)
 {
 	spa_mode_t mode = spa->spa_mode;
 	int async_suspended = spa->spa_async_suspended;
 
 	spa_unload(spa);
 	spa_deactivate(spa);
 	spa_activate(spa, mode);
 
 	/*
 	 * We save the value of spa_async_suspended as it gets reset to 0 by
 	 * spa_unload(). We want to restore it back to the original value before
 	 * returning as we might be calling spa_async_resume() later.
 	 */
 	spa->spa_async_suspended = async_suspended;
 }
 
 static int
 spa_ld_read_checkpoint_txg(spa_t *spa)
 {
 	uberblock_t checkpoint;
 	int error = 0;
 
 	ASSERT0(spa->spa_checkpoint_txg);
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
 	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
 
 	if (error == ENOENT)
 		return (0);
 
 	if (error != 0)
 		return (error);
 
 	ASSERT3U(checkpoint.ub_txg, !=, 0);
 	ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0);
 	ASSERT3U(checkpoint.ub_timestamp, !=, 0);
 	spa->spa_checkpoint_txg = checkpoint.ub_txg;
 	spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
 
 	return (0);
 }
 
 static int
 spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
 {
 	int error = 0;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
 
 	/*
 	 * Never trust the config that is provided unless we are assembling
 	 * a pool following a split.
 	 * This means don't trust blkptrs and the vdev tree in general. This
 	 * also effectively puts the spa in read-only mode since
 	 * spa_writeable() checks for spa_trust_config to be true.
 	 * We will later load a trusted config from the MOS.
 	 */
 	if (type != SPA_IMPORT_ASSEMBLE)
 		spa->spa_trust_config = B_FALSE;
 
 	/*
 	 * Parse the config provided to create a vdev tree.
 	 */
 	error = spa_ld_parse_config(spa, type);
 	if (error != 0)
 		return (error);
 
 	spa_import_progress_add(spa);
 
 	/*
 	 * Now that we have the vdev tree, try to open each vdev. This involves
 	 * opening the underlying physical device, retrieving its geometry and
 	 * probing the vdev with a dummy I/O. The state of each vdev will be set
 	 * based on the success of those operations. After this we'll be ready
 	 * to read from the vdevs.
 	 */
 	error = spa_ld_open_vdevs(spa);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Read the label of each vdev and make sure that the GUIDs stored
 	 * there match the GUIDs in the config provided.
 	 * If we're assembling a new pool that's been split off from an
 	 * existing pool, the labels haven't yet been updated so we skip
 	 * validation for now.
 	 */
 	if (type != SPA_IMPORT_ASSEMBLE) {
 		error = spa_ld_validate_vdevs(spa);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * Read all vdev labels to find the best uberblock (i.e. latest,
 	 * unless spa_load_max_txg is set) and store it in spa_uberblock. We
 	 * get the list of features required to read blkptrs in the MOS from
 	 * the vdev label with the best uberblock and verify that our version
 	 * of zfs supports them all.
 	 */
 	error = spa_ld_select_uberblock(spa, type);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Pass that uberblock to the dsl_pool layer which will open the root
 	 * blkptr. This blkptr points to the latest version of the MOS and will
 	 * allow us to read its contents.
 	 */
 	error = spa_ld_open_rootbp(spa);
 	if (error != 0)
 		return (error);
 
 	return (0);
 }
 
 static int
 spa_ld_checkpoint_rewind(spa_t *spa)
 {
 	uberblock_t checkpoint;
 	int error = 0;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
 
 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
 	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
 
 	if (error != 0) {
 		spa_load_failed(spa, "unable to retrieve checkpointed "
 		    "uberblock from the MOS config [error=%d]", error);
 
 		if (error == ENOENT)
 			error = ZFS_ERR_NO_CHECKPOINT;
 
 		return (error);
 	}
 
 	ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
 	ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
 
 	/*
 	 * We need to update the txg and timestamp of the checkpointed
 	 * uberblock to be higher than the latest one. This ensures that
 	 * the checkpointed uberblock is selected if we were to close and
 	 * reopen the pool right after we've written it in the vdev labels.
 	 * (also see block comment in vdev_uberblock_compare)
 	 */
 	checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
 	checkpoint.ub_timestamp = gethrestime_sec();
 
 	/*
 	 * Set current uberblock to be the checkpointed uberblock.
 	 */
 	spa->spa_uberblock = checkpoint;
 
 	/*
 	 * If we are doing a normal rewind, then the pool is open for
 	 * writing and we sync the "updated" checkpointed uberblock to
 	 * disk. Once this is done, we've basically rewound the whole
 	 * pool and there is no way back.
 	 *
 	 * There are cases when we don't want to attempt and sync the
 	 * checkpointed uberblock to disk because we are opening a
 	 * pool as read-only. Specifically, verifying the checkpointed
 	 * state with zdb, and importing the checkpointed state to get
 	 * a "preview" of its content.
 	 */
 	if (spa_writeable(spa)) {
 		vdev_t *rvd = spa->spa_root_vdev;
 
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
 		int svdcount = 0;
 		int children = rvd->vdev_children;
 		int c0 = random_in_range(children);
 
 		for (int c = 0; c < children; c++) {
 			vdev_t *vd = rvd->vdev_child[(c0 + c) % children];
 
 			/* Stop when revisiting the first vdev */
 			if (c > 0 && svd[0] == vd)
 				break;
 
 			if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
 			    !vdev_is_concrete(vd))
 				continue;
 
 			svd[svdcount++] = vd;
 			if (svdcount == SPA_SYNC_MIN_VDEVS)
 				break;
 		}
 		error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
 		if (error == 0)
 			spa->spa_last_synced_guid = rvd->vdev_guid;
 		spa_config_exit(spa, SCL_ALL, FTAG);
 
 		if (error != 0) {
 			spa_load_failed(spa, "failed to write checkpointed "
 			    "uberblock to the vdev labels [error=%d]", error);
 			return (error);
 		}
 	}
 
 	return (0);
 }
 
 static int
 spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
     boolean_t *update_config_cache)
 {
 	int error;
 
 	/*
 	 * Parse the config for pool, open and validate vdevs,
 	 * select an uberblock, and use that uberblock to open
 	 * the MOS.
 	 */
 	error = spa_ld_mos_init(spa, type);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Retrieve the trusted config stored in the MOS and use it to create
 	 * a new, exact version of the vdev tree, then reopen all vdevs.
 	 */
 	error = spa_ld_trusted_config(spa, type, B_FALSE);
 	if (error == EAGAIN) {
 		if (update_config_cache != NULL)
 			*update_config_cache = B_TRUE;
 
 		/*
 		 * Redo the loading process with the trusted config if it is
 		 * too different from the untrusted config.
 		 */
 		spa_ld_prepare_for_reload(spa);
 		spa_load_note(spa, "RELOADING");
 		error = spa_ld_mos_init(spa, type);
 		if (error != 0)
 			return (error);
 
 		error = spa_ld_trusted_config(spa, type, B_TRUE);
 		if (error != 0)
 			return (error);
 
 	} else if (error != 0) {
 		return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Load an existing storage pool, using the config provided. This config
  * describes which vdevs are part of the pool and is later validated against
  * partial configs present in each vdev's label and an entire copy of the
  * config stored in the MOS.
  */
 static int
 spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
 {
 	int error = 0;
 	boolean_t missing_feat_write = B_FALSE;
 	boolean_t checkpoint_rewind =
 	    (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
 	boolean_t update_config_cache = B_FALSE;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
 
 	spa_load_note(spa, "LOADING");
 
 	error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * If we are rewinding to the checkpoint then we need to repeat
 	 * everything we've done so far in this function but this time
 	 * selecting the checkpointed uberblock and using that to open
 	 * the MOS.
 	 */
 	if (checkpoint_rewind) {
 		/*
 		 * If we are rewinding to the checkpoint update config cache
 		 * anyway.
 		 */
 		update_config_cache = B_TRUE;
 
 		/*
 		 * Extract the checkpointed uberblock from the current MOS
 		 * and use this as the pool's uberblock from now on. If the
 		 * pool is imported as writeable we also write the checkpoint
 		 * uberblock to the labels, making the rewind permanent.
 		 */
 		error = spa_ld_checkpoint_rewind(spa);
 		if (error != 0)
 			return (error);
 
 		/*
 		 * Redo the loading process again with the
 		 * checkpointed uberblock.
 		 */
 		spa_ld_prepare_for_reload(spa);
 		spa_load_note(spa, "LOADING checkpointed uberblock");
 		error = spa_ld_mos_with_trusted_config(spa, type, NULL);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * Retrieve the checkpoint txg if the pool has a checkpoint.
 	 */
 	spa_import_progress_set_notes(spa, "Loading checkpoint txg");
 	error = spa_ld_read_checkpoint_txg(spa);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Retrieve the mapping of indirect vdevs. Those vdevs were removed
 	 * from the pool and their contents were re-mapped to other vdevs. Note
 	 * that everything that we read before this step must have been
 	 * rewritten on concrete vdevs after the last device removal was
 	 * initiated. Otherwise we could be reading from indirect vdevs before
 	 * we have loaded their mappings.
 	 */
 	spa_import_progress_set_notes(spa, "Loading indirect vdev metadata");
 	error = spa_ld_open_indirect_vdev_metadata(spa);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Retrieve the full list of active features from the MOS and check if
 	 * they are all supported.
 	 */
 	spa_import_progress_set_notes(spa, "Checking feature flags");
 	error = spa_ld_check_features(spa, &missing_feat_write);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Load several special directories from the MOS needed by the dsl_pool
 	 * layer.
 	 */
 	spa_import_progress_set_notes(spa, "Loading special MOS directories");
 	error = spa_ld_load_special_directories(spa);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Retrieve pool properties from the MOS.
 	 */
 	spa_import_progress_set_notes(spa, "Loading properties");
 	error = spa_ld_get_props(spa);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Retrieve the list of auxiliary devices - cache devices and spares -
 	 * and open them.
 	 */
 	spa_import_progress_set_notes(spa, "Loading AUX vdevs");
 	error = spa_ld_open_aux_vdevs(spa, type);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Load the metadata for all vdevs. Also check if unopenable devices
 	 * should be autoreplaced.
 	 */
 	spa_import_progress_set_notes(spa, "Loading vdev metadata");
 	error = spa_ld_load_vdev_metadata(spa);
 	if (error != 0)
 		return (error);
 
 	spa_import_progress_set_notes(spa, "Loading dedup tables");
 	error = spa_ld_load_dedup_tables(spa);
 	if (error != 0)
 		return (error);
 
 	spa_import_progress_set_notes(spa, "Loading BRT");
 	error = spa_ld_load_brt(spa);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Verify the logs now to make sure we don't have any unexpected errors
 	 * when we claim log blocks later.
 	 */
 	spa_import_progress_set_notes(spa, "Verifying Log Devices");
 	error = spa_ld_verify_logs(spa, type, ereport);
 	if (error != 0)
 		return (error);
 
 	if (missing_feat_write) {
 		ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
 
 		/*
 		 * At this point, we know that we can open the pool in
 		 * read-only mode but not read-write mode. We now have enough
 		 * information and can return to userland.
 		 */
 		return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
 		    ENOTSUP));
 	}
 
 	/*
 	 * Traverse the last txgs to make sure the pool was left off in a safe
 	 * state. When performing an extreme rewind, we verify the whole pool,
 	 * which can take a very long time.
 	 */
 	spa_import_progress_set_notes(spa, "Verifying pool data");
 	error = spa_ld_verify_pool_data(spa);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Calculate the deflated space for the pool. This must be done before
 	 * we write anything to the pool because we'd need to update the space
 	 * accounting using the deflated sizes.
 	 */
 	spa_import_progress_set_notes(spa, "Calculating deflated space");
 	spa_update_dspace(spa);
 
 	/*
 	 * We have now retrieved all the information we needed to open the
 	 * pool. If we are importing the pool in read-write mode, a few
 	 * additional steps must be performed to finish the import.
 	 */
 	spa_import_progress_set_notes(spa, "Starting import");
 	if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
 	    spa->spa_load_max_txg == UINT64_MAX)) {
 		uint64_t config_cache_txg = spa->spa_config_txg;
 
 		ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
 
 		/*
 		 * Before we do any zio_write's, complete the raidz expansion
 		 * scratch space copying, if necessary.
 		 */
 		if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID)
 			vdev_raidz_reflow_copy_scratch(spa);
 
 		/*
 		 * In case of a checkpoint rewind, log the original txg
 		 * of the checkpointed uberblock.
 		 */
 		if (checkpoint_rewind) {
 			spa_history_log_internal(spa, "checkpoint rewind",
 			    NULL, "rewound state to txg=%llu",
 			    (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
 		}
 
 		spa_import_progress_set_notes(spa, "Claiming ZIL blocks");
 		/*
 		 * Traverse the ZIL and claim all blocks.
 		 */
 		spa_ld_claim_log_blocks(spa);
 
 		/*
 		 * Kick-off the syncing thread.
 		 */
 		spa->spa_sync_on = B_TRUE;
 		txg_sync_start(spa->spa_dsl_pool);
 		mmp_thread_start(spa);
 
 		/*
 		 * Wait for all claims to sync.  We sync up to the highest
 		 * claimed log block birth time so that claimed log blocks
 		 * don't appear to be from the future.  spa_claim_max_txg
 		 * will have been set for us by ZIL traversal operations
 		 * performed above.
 		 */
 		spa_import_progress_set_notes(spa, "Syncing ZIL claims");
 		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
 
 		/*
 		 * Check if we need to request an update of the config. On the
 		 * next sync, we would update the config stored in vdev labels
 		 * and the cachefile (by default /etc/zfs/zpool.cache).
 		 */
 		spa_import_progress_set_notes(spa, "Updating configs");
 		spa_ld_check_for_config_update(spa, config_cache_txg,
 		    update_config_cache);
 
 		/*
 		 * Check if a rebuild was in progress and if so resume it.
 		 * Then check all DTLs to see if anything needs resilvering.
 		 * The resilver will be deferred if a rebuild was started.
 		 */
 		spa_import_progress_set_notes(spa, "Starting resilvers");
 		if (vdev_rebuild_active(spa->spa_root_vdev)) {
 			vdev_rebuild_restart(spa);
 		} else if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
 		    vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
 			spa_async_request(spa, SPA_ASYNC_RESILVER);
 		}
 
 		/*
 		 * Log the fact that we booted up (so that we can detect if
 		 * we rebooted in the middle of an operation).
 		 */
 		spa_history_log_version(spa, "open", NULL);
 
 		spa_import_progress_set_notes(spa,
 		    "Restarting device removals");
 		spa_restart_removal(spa);
 		spa_spawn_aux_threads(spa);
 
 		/*
 		 * Delete any inconsistent datasets.
 		 *
 		 * Note:
 		 * Since we may be issuing deletes for clones here,
 		 * we make sure to do so after we've spawned all the
 		 * auxiliary threads above (from which the livelist
 		 * deletion zthr is part of).
 		 */
 		spa_import_progress_set_notes(spa,
 		    "Cleaning up inconsistent objsets");
 		(void) dmu_objset_find(spa_name(spa),
 		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
 
 		/*
 		 * Clean up any stale temporary dataset userrefs.
 		 */
 		spa_import_progress_set_notes(spa,
 		    "Cleaning up temporary userrefs");
 		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
 
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		spa_import_progress_set_notes(spa, "Restarting initialize");
 		vdev_initialize_restart(spa->spa_root_vdev);
 		spa_import_progress_set_notes(spa, "Restarting TRIM");
 		vdev_trim_restart(spa->spa_root_vdev);
 		vdev_autotrim_restart(spa);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		spa_import_progress_set_notes(spa, "Finished importing");
 	}
 
 	spa_import_progress_remove(spa_guid(spa));
 	spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
 
 	spa_load_note(spa, "LOADED");
 
 	return (0);
 }
 
 static int
 spa_load_retry(spa_t *spa, spa_load_state_t state)
 {
 	spa_mode_t mode = spa->spa_mode;
 
 	spa_unload(spa);
 	spa_deactivate(spa);
 
 	spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
 
 	spa_activate(spa, mode);
 	spa_async_suspend(spa);
 
 	spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
 	    (u_longlong_t)spa->spa_load_max_txg);
 
 	return (spa_load(spa, state, SPA_IMPORT_EXISTING));
 }
 
 /*
  * If spa_load() fails this function will try loading prior txg's. If
  * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
  * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
  * function will not rewind the pool and will return the same error as
  * spa_load().
  */
 static int
 spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
     int rewind_flags)
 {
 	nvlist_t *loadinfo = NULL;
 	nvlist_t *config = NULL;
 	int load_error, rewind_error;
 	uint64_t safe_rewind_txg;
 	uint64_t min_txg;
 
 	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
 		spa->spa_load_max_txg = spa->spa_load_txg;
 		spa_set_log_state(spa, SPA_LOG_CLEAR);
 	} else {
 		spa->spa_load_max_txg = max_request;
 		if (max_request != UINT64_MAX)
 			spa->spa_extreme_rewind = B_TRUE;
 	}
 
 	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
 	if (load_error == 0)
 		return (0);
 	if (load_error == ZFS_ERR_NO_CHECKPOINT) {
 		/*
 		 * When attempting checkpoint-rewind on a pool with no
 		 * checkpoint, we should not attempt to load uberblocks
 		 * from previous txgs when spa_load fails.
 		 */
 		ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
 		spa_import_progress_remove(spa_guid(spa));
 		return (load_error);
 	}
 
 	if (spa->spa_root_vdev != NULL)
 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 
 	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
 	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
 
 	if (rewind_flags & ZPOOL_NEVER_REWIND) {
 		nvlist_free(config);
 		spa_import_progress_remove(spa_guid(spa));
 		return (load_error);
 	}
 
 	if (state == SPA_LOAD_RECOVER) {
 		/* Price of rolling back is discarding txgs, including log */
 		spa_set_log_state(spa, SPA_LOG_CLEAR);
 	} else {
 		/*
 		 * If we aren't rolling back save the load info from our first
 		 * import attempt so that we can restore it after attempting
 		 * to rewind.
 		 */
 		loadinfo = spa->spa_load_info;
 		spa->spa_load_info = fnvlist_alloc();
 	}
 
 	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
 	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
 	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
 	    TXG_INITIAL : safe_rewind_txg;
 
 	/*
 	 * Continue as long as we're finding errors, we're still within
 	 * the acceptable rewind range, and we're still finding uberblocks
 	 */
 	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
 	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
 		if (spa->spa_load_max_txg < safe_rewind_txg)
 			spa->spa_extreme_rewind = B_TRUE;
 		rewind_error = spa_load_retry(spa, state);
 	}
 
 	spa->spa_extreme_rewind = B_FALSE;
 	spa->spa_load_max_txg = UINT64_MAX;
 
 	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
 		spa_config_set(spa, config);
 	else
 		nvlist_free(config);
 
 	if (state == SPA_LOAD_RECOVER) {
 		ASSERT3P(loadinfo, ==, NULL);
 		spa_import_progress_remove(spa_guid(spa));
 		return (rewind_error);
 	} else {
 		/* Store the rewind info as part of the initial load info */
 		fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
 		    spa->spa_load_info);
 
 		/* Restore the initial load info */
 		fnvlist_free(spa->spa_load_info);
 		spa->spa_load_info = loadinfo;
 
 		spa_import_progress_remove(spa_guid(spa));
 		return (load_error);
 	}
 }
 
 /*
  * Pool Open/Import
  *
  * The import case is identical to an open except that the configuration is sent
  * down from userland, instead of grabbed from the configuration cache.  For the
  * case of an open, the pool configuration will exist in the
  * POOL_STATE_UNINITIALIZED state.
  *
  * The stats information (gen/count/ustats) is used to gather vdev statistics at
  * the same time open the pool, without having to keep around the spa_t in some
  * ambiguous state.
  */
 static int
 spa_open_common(const char *pool, spa_t **spapp, const void *tag,
     nvlist_t *nvpolicy, nvlist_t **config)
 {
 	spa_t *spa;
 	spa_load_state_t state = SPA_LOAD_OPEN;
 	int error;
 	int locked = B_FALSE;
 	int firstopen = B_FALSE;
 
 	*spapp = NULL;
 
 	/*
 	 * As disgusting as this is, we need to support recursive calls to this
 	 * function because dsl_dir_open() is called during spa_load(), and ends
 	 * up calling spa_open() again.  The real fix is to figure out how to
 	 * avoid dsl_dir_open() calling this in the first place.
 	 */
 	if (MUTEX_NOT_HELD(&spa_namespace_lock)) {
 		mutex_enter(&spa_namespace_lock);
 		locked = B_TRUE;
 	}
 
 	if ((spa = spa_lookup(pool)) == NULL) {
 		if (locked)
 			mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(ENOENT));
 	}
 
 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
 		zpool_load_policy_t policy;
 
 		firstopen = B_TRUE;
 
 		zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
 		    &policy);
 		if (policy.zlp_rewind & ZPOOL_DO_REWIND)
 			state = SPA_LOAD_RECOVER;
 
 		spa_activate(spa, spa_mode_global);
 
 		if (state != SPA_LOAD_RECOVER)
 			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
 		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
 
 		zfs_dbgmsg("spa_open_common: opening %s", pool);
 		error = spa_load_best(spa, state, policy.zlp_txg,
 		    policy.zlp_rewind);
 
 		if (error == EBADF) {
 			/*
 			 * If vdev_validate() returns failure (indicated by
 			 * EBADF), it indicates that one of the vdevs indicates
 			 * that the pool has been exported or destroyed.  If
 			 * this is the case, the config cache is out of sync and
 			 * we should remove the pool from the namespace.
 			 */
 			spa_unload(spa);
 			spa_deactivate(spa);
 			spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
 			spa_remove(spa);
 			if (locked)
 				mutex_exit(&spa_namespace_lock);
 			return (SET_ERROR(ENOENT));
 		}
 
 		if (error) {
 			/*
 			 * We can't open the pool, but we still have useful
 			 * information: the state of each vdev after the
 			 * attempted vdev_open().  Return this to the user.
 			 */
 			if (config != NULL && spa->spa_config) {
 				*config = fnvlist_dup(spa->spa_config);
 				fnvlist_add_nvlist(*config,
 				    ZPOOL_CONFIG_LOAD_INFO,
 				    spa->spa_load_info);
 			}
 			spa_unload(spa);
 			spa_deactivate(spa);
 			spa->spa_last_open_failed = error;
 			if (locked)
 				mutex_exit(&spa_namespace_lock);
 			*spapp = NULL;
 			return (error);
 		}
 	}
 
 	spa_open_ref(spa, tag);
 
 	if (config != NULL)
 		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 
 	/*
 	 * If we've recovered the pool, pass back any information we
 	 * gathered while doing the load.
 	 */
 	if (state == SPA_LOAD_RECOVER && config != NULL) {
 		fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
 		    spa->spa_load_info);
 	}
 
 	if (locked) {
 		spa->spa_last_open_failed = 0;
 		spa->spa_last_ubsync_txg = 0;
 		spa->spa_load_txg = 0;
 		mutex_exit(&spa_namespace_lock);
 	}
 
 	if (firstopen)
 		zvol_create_minors_recursive(spa_name(spa));
 
 	*spapp = spa;
 
 	return (0);
 }
 
 int
 spa_open_rewind(const char *name, spa_t **spapp, const void *tag,
     nvlist_t *policy, nvlist_t **config)
 {
 	return (spa_open_common(name, spapp, tag, policy, config));
 }
 
 int
 spa_open(const char *name, spa_t **spapp, const void *tag)
 {
 	return (spa_open_common(name, spapp, tag, NULL, NULL));
 }
 
 /*
  * Lookup the given spa_t, incrementing the inject count in the process,
  * preventing it from being exported or destroyed.
  */
 spa_t *
 spa_inject_addref(char *name)
 {
 	spa_t *spa;
 
 	mutex_enter(&spa_namespace_lock);
 	if ((spa = spa_lookup(name)) == NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (NULL);
 	}
 	spa->spa_inject_ref++;
 	mutex_exit(&spa_namespace_lock);
 
 	return (spa);
 }
 
 void
 spa_inject_delref(spa_t *spa)
 {
 	mutex_enter(&spa_namespace_lock);
 	spa->spa_inject_ref--;
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * Add spares device information to the nvlist.
  */
 static void
 spa_add_spares(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t **spares;
 	uint_t i, nspares;
 	nvlist_t *nvroot;
 	uint64_t guid;
 	vdev_stat_t *vs;
 	uint_t vsc;
 	uint64_t pool;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 
 	if (spa->spa_spares.sav_count == 0)
 		return;
 
 	nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
 	VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, &spares, &nspares));
 	if (nspares != 0) {
 		fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 		    (const nvlist_t * const *)spares, nspares);
 		VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 		    &spares, &nspares));
 
 		/*
 		 * Go through and find any spares which have since been
 		 * repurposed as an active spare.  If this is the case, update
 		 * their status appropriately.
 		 */
 		for (i = 0; i < nspares; i++) {
 			guid = fnvlist_lookup_uint64(spares[i],
 			    ZPOOL_CONFIG_GUID);
 			VERIFY0(nvlist_lookup_uint64_array(spares[i],
 			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc));
 			if (spa_spare_exists(guid, &pool, NULL) &&
 			    pool != 0ULL) {
 				vs->vs_state = VDEV_STATE_CANT_OPEN;
 				vs->vs_aux = VDEV_AUX_SPARED;
 			} else {
 				vs->vs_state =
 				    spa->spa_spares.sav_vdevs[i]->vdev_state;
 			}
 		}
 	}
 }
 
 /*
  * Add l2cache device information to the nvlist, including vdev stats.
  */
 static void
 spa_add_l2cache(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t **l2cache;
 	uint_t i, j, nl2cache;
 	nvlist_t *nvroot;
 	uint64_t guid;
 	vdev_t *vd;
 	vdev_stat_t *vs;
 	uint_t vsc;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 
 	if (spa->spa_l2cache.sav_count == 0)
 		return;
 
 	nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
 	VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache));
 	if (nl2cache != 0) {
 		fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 		    (const nvlist_t * const *)l2cache, nl2cache);
 		VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 		    &l2cache, &nl2cache));
 
 		/*
 		 * Update level 2 cache device stats.
 		 */
 
 		for (i = 0; i < nl2cache; i++) {
 			guid = fnvlist_lookup_uint64(l2cache[i],
 			    ZPOOL_CONFIG_GUID);
 
 			vd = NULL;
 			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
 				if (guid ==
 				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
 					vd = spa->spa_l2cache.sav_vdevs[j];
 					break;
 				}
 			}
 			ASSERT(vd != NULL);
 
 			VERIFY0(nvlist_lookup_uint64_array(l2cache[i],
 			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc));
 			vdev_get_stats(vd, vs);
 			vdev_config_generate_stats(vd, l2cache[i]);
 
 		}
 	}
 }
 
 static void
 spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features)
 {
 	zap_cursor_t zc;
 	zap_attribute_t za;
 
 	if (spa->spa_feat_for_read_obj != 0) {
 		for (zap_cursor_init(&zc, spa->spa_meta_objset,
 		    spa->spa_feat_for_read_obj);
 		    zap_cursor_retrieve(&zc, &za) == 0;
 		    zap_cursor_advance(&zc)) {
 			ASSERT(za.za_integer_length == sizeof (uint64_t) &&
 			    za.za_num_integers == 1);
 			VERIFY0(nvlist_add_uint64(features, za.za_name,
 			    za.za_first_integer));
 		}
 		zap_cursor_fini(&zc);
 	}
 
 	if (spa->spa_feat_for_write_obj != 0) {
 		for (zap_cursor_init(&zc, spa->spa_meta_objset,
 		    spa->spa_feat_for_write_obj);
 		    zap_cursor_retrieve(&zc, &za) == 0;
 		    zap_cursor_advance(&zc)) {
 			ASSERT(za.za_integer_length == sizeof (uint64_t) &&
 			    za.za_num_integers == 1);
 			VERIFY0(nvlist_add_uint64(features, za.za_name,
 			    za.za_first_integer));
 		}
 		zap_cursor_fini(&zc);
 	}
 }
 
 static void
 spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features)
 {
 	int i;
 
 	for (i = 0; i < SPA_FEATURES; i++) {
 		zfeature_info_t feature = spa_feature_table[i];
 		uint64_t refcount;
 
 		if (feature_get_refcount(spa, &feature, &refcount) != 0)
 			continue;
 
 		VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount));
 	}
 }
 
 /*
  * Store a list of pool features and their reference counts in the
  * config.
  *
  * The first time this is called on a spa, allocate a new nvlist, fetch
  * the pool features and reference counts from disk, then save the list
  * in the spa. In subsequent calls on the same spa use the saved nvlist
  * and refresh its values from the cached reference counts.  This
  * ensures we don't block here on I/O on a suspended pool so 'zpool
  * clear' can resume the pool.
  */
 static void
 spa_add_feature_stats(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t *features;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 
 	mutex_enter(&spa->spa_feat_stats_lock);
 	features = spa->spa_feat_stats;
 
 	if (features != NULL) {
 		spa_feature_stats_from_cache(spa, features);
 	} else {
 		VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP));
 		spa->spa_feat_stats = features;
 		spa_feature_stats_from_disk(spa, features);
 	}
 
 	VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
 	    features));
 
 	mutex_exit(&spa->spa_feat_stats_lock);
 }
 
 int
 spa_get_stats(const char *name, nvlist_t **config,
     char *altroot, size_t buflen)
 {
 	int error;
 	spa_t *spa;
 
 	*config = NULL;
 	error = spa_open_common(name, &spa, FTAG, NULL, config);
 
 	if (spa != NULL) {
 		/*
 		 * This still leaves a window of inconsistency where the spares
 		 * or l2cache devices could change and the config would be
 		 * self-inconsistent.
 		 */
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 		if (*config != NULL) {
 			uint64_t loadtimes[2];
 
 			loadtimes[0] = spa->spa_loaded_ts.tv_sec;
 			loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
 			fnvlist_add_uint64_array(*config,
 			    ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2);
 
 			fnvlist_add_uint64(*config,
 			    ZPOOL_CONFIG_ERRCOUNT,
 			    spa_approx_errlog_size(spa));
 
 			if (spa_suspended(spa)) {
 				fnvlist_add_uint64(*config,
 				    ZPOOL_CONFIG_SUSPENDED,
 				    spa->spa_failmode);
 				fnvlist_add_uint64(*config,
 				    ZPOOL_CONFIG_SUSPENDED_REASON,
 				    spa->spa_suspended);
 			}
 
 			spa_add_spares(spa, *config);
 			spa_add_l2cache(spa, *config);
 			spa_add_feature_stats(spa, *config);
 		}
 	}
 
 	/*
 	 * We want to get the alternate root even for faulted pools, so we cheat
 	 * and call spa_lookup() directly.
 	 */
 	if (altroot) {
 		if (spa == NULL) {
 			mutex_enter(&spa_namespace_lock);
 			spa = spa_lookup(name);
 			if (spa)
 				spa_altroot(spa, altroot, buflen);
 			else
 				altroot[0] = '\0';
 			spa = NULL;
 			mutex_exit(&spa_namespace_lock);
 		} else {
 			spa_altroot(spa, altroot, buflen);
 		}
 	}
 
 	if (spa != NULL) {
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		spa_close(spa, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Validate that the auxiliary device array is well formed.  We must have an
  * array of nvlists, each which describes a valid leaf vdev.  If this is an
  * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
  * specified, as long as they are well-formed.
  */
 static int
 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
     spa_aux_vdev_t *sav, const char *config, uint64_t version,
     vdev_labeltype_t label)
 {
 	nvlist_t **dev;
 	uint_t i, ndev;
 	vdev_t *vd;
 	int error;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/*
 	 * It's acceptable to have no devs specified.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
 		return (0);
 
 	if (ndev == 0)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Make sure the pool is formatted with a version that supports this
 	 * device type.
 	 */
 	if (spa_version(spa) < version)
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * Set the pending device list so we correctly handle device in-use
 	 * checking.
 	 */
 	sav->sav_pending = dev;
 	sav->sav_npending = ndev;
 
 	for (i = 0; i < ndev; i++) {
 		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
 		    mode)) != 0)
 			goto out;
 
 		if (!vd->vdev_ops->vdev_op_leaf) {
 			vdev_free(vd);
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
 
 		vd->vdev_top = vd;
 
 		if ((error = vdev_open(vd)) == 0 &&
 		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
 			fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
 			    vd->vdev_guid);
 		}
 
 		vdev_free(vd);
 
 		if (error &&
 		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
 			goto out;
 		else
 			error = 0;
 	}
 
 out:
 	sav->sav_pending = NULL;
 	sav->sav_npending = 0;
 	return (error);
 }
 
 static int
 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
 {
 	int error;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
 	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
 	    VDEV_LABEL_SPARE)) != 0) {
 		return (error);
 	}
 
 	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
 	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
 	    VDEV_LABEL_L2CACHE));
 }
 
 static void
 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
     const char *config)
 {
 	int i;
 
 	if (sav->sav_config != NULL) {
 		nvlist_t **olddevs;
 		uint_t oldndevs;
 		nvlist_t **newdevs;
 
 		/*
 		 * Generate new dev list by concatenating with the
 		 * current dev list.
 		 */
 		VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config,
 		    &olddevs, &oldndevs));
 
 		newdevs = kmem_alloc(sizeof (void *) *
 		    (ndevs + oldndevs), KM_SLEEP);
 		for (i = 0; i < oldndevs; i++)
 			newdevs[i] = fnvlist_dup(olddevs[i]);
 		for (i = 0; i < ndevs; i++)
 			newdevs[i + oldndevs] = fnvlist_dup(devs[i]);
 
 		fnvlist_remove(sav->sav_config, config);
 
 		fnvlist_add_nvlist_array(sav->sav_config, config,
 		    (const nvlist_t * const *)newdevs, ndevs + oldndevs);
 		for (i = 0; i < oldndevs + ndevs; i++)
 			nvlist_free(newdevs[i]);
 		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
 	} else {
 		/*
 		 * Generate a new dev list.
 		 */
 		sav->sav_config = fnvlist_alloc();
 		fnvlist_add_nvlist_array(sav->sav_config, config,
 		    (const nvlist_t * const *)devs, ndevs);
 	}
 }
 
 /*
  * Stop and drop level 2 ARC devices
  */
 void
 spa_l2cache_drop(spa_t *spa)
 {
 	vdev_t *vd;
 	int i;
 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
 
 	for (i = 0; i < sav->sav_count; i++) {
 		uint64_t pool;
 
 		vd = sav->sav_vdevs[i];
 		ASSERT(vd != NULL);
 
 		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
 		    pool != 0ULL && l2arc_vdev_present(vd))
 			l2arc_remove_vdev(vd);
 	}
 }
 
 /*
  * Verify encryption parameters for spa creation. If we are encrypting, we must
  * have the encryption feature flag enabled.
  */
 static int
 spa_create_check_encryption_params(dsl_crypto_params_t *dcp,
     boolean_t has_encryption)
 {
 	if (dcp->cp_crypt != ZIO_CRYPT_OFF &&
 	    dcp->cp_crypt != ZIO_CRYPT_INHERIT &&
 	    !has_encryption)
 		return (SET_ERROR(ENOTSUP));
 
 	return (dmu_objset_create_crypt_check(NULL, dcp, NULL));
 }
 
 /*
  * Pool Creation
  */
 int
 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
     nvlist_t *zplprops, dsl_crypto_params_t *dcp)
 {
 	spa_t *spa;
 	const char *altroot = NULL;
 	vdev_t *rvd;
 	dsl_pool_t *dp;
 	dmu_tx_t *tx;
 	int error = 0;
 	uint64_t txg = TXG_INITIAL;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 	uint64_t version, obj, ndraid = 0;
 	boolean_t has_features;
 	boolean_t has_encryption;
 	boolean_t has_allocclass;
 	spa_feature_t feat;
 	const char *feat_name;
 	const char *poolname;
 	nvlist_t *nvl;
 
 	if (props == NULL ||
-	    nvlist_lookup_string(props, "tname", &poolname) != 0)
+	    nvlist_lookup_string(props,
+	    zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0)
 		poolname = (char *)pool;
 
 	/*
 	 * If this pool already exists, return failure.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	if (spa_lookup(poolname) != NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(EEXIST));
 	}
 
 	/*
 	 * Allocate a new spa_t structure.
 	 */
 	nvl = fnvlist_alloc();
 	fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool);
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 	spa = spa_add(poolname, nvl, altroot);
 	fnvlist_free(nvl);
 	spa_activate(spa, spa_mode_global);
 
 	if (props && (error = spa_prop_validate(spa, props))) {
 		spa_deactivate(spa);
 		spa_remove(spa);
 		mutex_exit(&spa_namespace_lock);
 		return (error);
 	}
 
 	/*
 	 * Temporary pool names should never be written to disk.
 	 */
 	if (poolname != pool)
 		spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
 
 	has_features = B_FALSE;
 	has_encryption = B_FALSE;
 	has_allocclass = B_FALSE;
 	for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
 	    elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
 		if (zpool_prop_feature(nvpair_name(elem))) {
 			has_features = B_TRUE;
 
 			feat_name = strchr(nvpair_name(elem), '@') + 1;
 			VERIFY0(zfeature_lookup_name(feat_name, &feat));
 			if (feat == SPA_FEATURE_ENCRYPTION)
 				has_encryption = B_TRUE;
 			if (feat == SPA_FEATURE_ALLOCATION_CLASSES)
 				has_allocclass = B_TRUE;
 		}
 	}
 
 	/* verify encryption params, if they were provided */
 	if (dcp != NULL) {
 		error = spa_create_check_encryption_params(dcp, has_encryption);
 		if (error != 0) {
 			spa_deactivate(spa);
 			spa_remove(spa);
 			mutex_exit(&spa_namespace_lock);
 			return (error);
 		}
 	}
 	if (!has_allocclass && zfs_special_devs(nvroot, NULL)) {
 		spa_deactivate(spa);
 		spa_remove(spa);
 		mutex_exit(&spa_namespace_lock);
 		return (ENOTSUP);
 	}
 
 	if (has_features || nvlist_lookup_uint64(props,
 	    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
 		version = SPA_VERSION;
 	}
 	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
 
 	spa->spa_first_txg = txg;
 	spa->spa_uberblock.ub_txg = txg - 1;
 	spa->spa_uberblock.ub_version = version;
 	spa->spa_ubsync = spa->spa_uberblock;
 	spa->spa_load_state = SPA_LOAD_CREATE;
 	spa->spa_removing_phys.sr_state = DSS_NONE;
 	spa->spa_removing_phys.sr_removing_vdev = -1;
 	spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
 	spa->spa_indirect_vdevs_loaded = B_TRUE;
 
 	/*
 	 * Create "The Godfather" zio to hold all async IOs
 	 */
 	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
 	    KM_SLEEP);
 	for (int i = 0; i < max_ncpus; i++) {
 		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 		    ZIO_FLAG_GODFATHER);
 	}
 
 	/*
 	 * Create the root vdev.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
 
 	ASSERT(error != 0 || rvd != NULL);
 	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
 
 	if (error == 0 && !zfs_allocatable_devs(nvroot))
 		error = SET_ERROR(EINVAL);
 
 	if (error == 0 &&
 	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
 	    (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 &&
 	    (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) {
 		/*
 		 * instantiate the metaslab groups (this will dirty the vdevs)
 		 * we can no longer error exit past this point
 		 */
 		for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
 			vdev_t *vd = rvd->vdev_child[c];
 
 			vdev_metaslab_set_size(vd);
 			vdev_expand(vd, txg);
 		}
 	}
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (error != 0) {
 		spa_unload(spa);
 		spa_deactivate(spa);
 		spa_remove(spa);
 		mutex_exit(&spa_namespace_lock);
 		return (error);
 	}
 
 	/*
 	 * Get the list of spares, if specified.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares) == 0) {
 		spa->spa_spares.sav_config = fnvlist_alloc();
 		fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
 		    ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
 		    nspares);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_spares(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * Get the list of level 2 cache devices, if specified.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 	    &l2cache, &nl2cache) == 0) {
 		VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config,
 		    NV_UNIQUE_NAME, KM_SLEEP));
 		fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache,
 		    nl2cache);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_l2cache(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	spa->spa_is_initializing = B_TRUE;
 	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg);
 	spa->spa_is_initializing = B_FALSE;
 
 	/*
 	 * Create DDTs (dedup tables).
 	 */
 	ddt_create(spa);
 	/*
 	 * Create BRT table and BRT table object.
 	 */
 	brt_create(spa);
 
 	spa_update_dspace(spa);
 
 	tx = dmu_tx_create_assigned(dp, txg);
 
 	/*
 	 * Create the pool's history object.
 	 */
 	if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history)
 		spa_history_create_obj(spa, tx);
 
 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
 	spa_history_log_version(spa, "create", tx);
 
 	/*
 	 * Create the pool config object.
 	 */
 	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
 	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
 	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
 
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
 	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add pool config");
 	}
 
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
 	    sizeof (uint64_t), 1, &version, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add pool version");
 	}
 
 	/* Newly created pools with the right version are always deflated. */
 	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
 		spa->spa_deflate = TRUE;
 		if (zap_add(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
 		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
 			cmn_err(CE_PANIC, "failed to add deflate");
 		}
 	}
 
 	/*
 	 * Create the deferred-free bpobj.  Turn off compression
 	 * because sync-to-convergence takes longer if the blocksize
 	 * keeps changing.
 	 */
 	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
 	dmu_object_set_compress(spa->spa_meta_objset, obj,
 	    ZIO_COMPRESS_OFF, tx);
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
 	    sizeof (uint64_t), 1, &obj, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add bpobj");
 	}
 	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
 	    spa->spa_meta_objset, obj));
 
 	/*
 	 * Generate some random noise for salted checksums to operate on.
 	 */
 	(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
 	    sizeof (spa->spa_cksum_salt.zcs_bytes));
 
 	/*
 	 * Set pool properties.
 	 */
 	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
 	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
 	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
 	spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
 	spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM);
 
 	if (props != NULL) {
 		spa_configfile_set(spa, props, B_FALSE);
 		spa_sync_props(props, tx);
 	}
 
 	for (int i = 0; i < ndraid; i++)
 		spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
 
 	dmu_tx_commit(tx);
 
 	spa->spa_sync_on = B_TRUE;
 	txg_sync_start(dp);
 	mmp_thread_start(spa);
 	txg_wait_synced(dp, txg);
 
 	spa_spawn_aux_threads(spa);
 
 	spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
 
 	/*
 	 * Don't count references from objsets that are already closed
 	 * and are making their way through the eviction process.
 	 */
 	spa_evicting_os_wait(spa);
 	spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
 	spa->spa_load_state = SPA_LOAD_NONE;
 
 	spa_import_os(spa);
 
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
 
 /*
  * Import a non-root pool into the system.
  */
 int
 spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
 {
 	spa_t *spa;
 	const char *altroot = NULL;
 	spa_load_state_t state = SPA_LOAD_IMPORT;
 	zpool_load_policy_t policy;
 	spa_mode_t mode = spa_mode_global;
 	uint64_t readonly = B_FALSE;
 	int error;
 	nvlist_t *nvroot;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 
 	/*
 	 * If a pool with this name exists, return failure.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	if (spa_lookup(pool) != NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(EEXIST));
 	}
 
 	/*
 	 * Create and initialize the spa structure.
 	 */
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 	(void) nvlist_lookup_uint64(props,
 	    zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
 	if (readonly)
 		mode = SPA_MODE_READ;
 	spa = spa_add(pool, config, altroot);
 	spa->spa_import_flags = flags;
 
 	/*
 	 * Verbatim import - Take a pool and insert it into the namespace
 	 * as if it had been loaded at boot.
 	 */
 	if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
 		if (props != NULL)
 			spa_configfile_set(spa, props, B_FALSE);
 
 		spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE);
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
 		zfs_dbgmsg("spa_import: verbatim import of %s", pool);
 		mutex_exit(&spa_namespace_lock);
 		return (0);
 	}
 
 	spa_activate(spa, mode);
 
 	/*
 	 * Don't start async tasks until we know everything is healthy.
 	 */
 	spa_async_suspend(spa);
 
 	zpool_get_load_policy(config, &policy);
 	if (policy.zlp_rewind & ZPOOL_DO_REWIND)
 		state = SPA_LOAD_RECOVER;
 
 	spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT;
 
 	if (state != SPA_LOAD_RECOVER) {
 		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
 		zfs_dbgmsg("spa_import: importing %s", pool);
 	} else {
 		zfs_dbgmsg("spa_import: importing %s, max_txg=%lld "
 		    "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg);
 	}
 	error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind);
 
 	/*
 	 * Propagate anything learned while loading the pool and pass it
 	 * back to caller (i.e. rewind info, missing devices, etc).
 	 */
 	fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	/*
 	 * Toss any existing sparelist, as it doesn't have any validity
 	 * anymore, and conflicts with spa_has_spare().
 	 */
 	if (spa->spa_spares.sav_config) {
 		nvlist_free(spa->spa_spares.sav_config);
 		spa->spa_spares.sav_config = NULL;
 		spa_load_spares(spa);
 	}
 	if (spa->spa_l2cache.sav_config) {
 		nvlist_free(spa->spa_l2cache.sav_config);
 		spa->spa_l2cache.sav_config = NULL;
 		spa_load_l2cache(spa);
 	}
 
 	nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (props != NULL)
 		spa_configfile_set(spa, props, B_FALSE);
 
 	if (error != 0 || (props && spa_writeable(spa) &&
 	    (error = spa_prop_set(spa, props)))) {
 		spa_unload(spa);
 		spa_deactivate(spa);
 		spa_remove(spa);
 		mutex_exit(&spa_namespace_lock);
 		return (error);
 	}
 
 	spa_async_resume(spa);
 
 	/*
 	 * Override any spares and level 2 cache devices as specified by
 	 * the user, as these may have correct device names/devids, etc.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares) == 0) {
 		if (spa->spa_spares.sav_config)
 			fnvlist_remove(spa->spa_spares.sav_config,
 			    ZPOOL_CONFIG_SPARES);
 		else
 			spa->spa_spares.sav_config = fnvlist_alloc();
 		fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
 		    ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
 		    nspares);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_spares(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 	    &l2cache, &nl2cache) == 0) {
 		if (spa->spa_l2cache.sav_config)
 			fnvlist_remove(spa->spa_l2cache.sav_config,
 			    ZPOOL_CONFIG_L2CACHE);
 		else
 			spa->spa_l2cache.sav_config = fnvlist_alloc();
 		fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache,
 		    nl2cache);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_l2cache(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * Check for any removed devices.
 	 */
 	if (spa->spa_autoreplace) {
 		spa_aux_check_removed(&spa->spa_spares);
 		spa_aux_check_removed(&spa->spa_l2cache);
 	}
 
 	if (spa_writeable(spa)) {
 		/*
 		 * Update the config cache to include the newly-imported pool.
 		 */
 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 	}
 
 	/*
 	 * It's possible that the pool was expanded while it was exported.
 	 * We kick off an async task to handle this for us.
 	 */
 	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
 
 	spa_history_log_version(spa, "import", NULL);
 
 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
 
 	mutex_exit(&spa_namespace_lock);
 
 	zvol_create_minors_recursive(pool);
 
 	spa_import_os(spa);
 
 	return (0);
 }
 
 nvlist_t *
 spa_tryimport(nvlist_t *tryconfig)
 {
 	nvlist_t *config = NULL;
 	const char *poolname, *cachefile;
 	spa_t *spa;
 	uint64_t state;
 	int error;
 	zpool_load_policy_t policy;
 
 	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
 		return (NULL);
 
 	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
 		return (NULL);
 
 	/*
 	 * Create and initialize the spa structure.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
 	spa_activate(spa, SPA_MODE_READ);
 
 	/*
 	 * Rewind pool if a max txg was provided.
 	 */
 	zpool_get_load_policy(spa->spa_config, &policy);
 	if (policy.zlp_txg != UINT64_MAX) {
 		spa->spa_load_max_txg = policy.zlp_txg;
 		spa->spa_extreme_rewind = B_TRUE;
 		zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
 		    poolname, (longlong_t)policy.zlp_txg);
 	} else {
 		zfs_dbgmsg("spa_tryimport: importing %s", poolname);
 	}
 
 	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
 	    == 0) {
 		zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
 		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
 	} else {
 		spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
 	}
 
 	/*
 	 * spa_import() relies on a pool config fetched by spa_try_import()
 	 * for spare/cache devices. Import flags are not passed to
 	 * spa_tryimport(), which makes it return early due to a missing log
 	 * device and missing retrieving the cache device and spare eventually.
 	 * Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch
 	 * the correct configuration regardless of the missing log device.
 	 */
 	spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG;
 
 	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
 
 	/*
 	 * If 'tryconfig' was at least parsable, return the current config.
 	 */
 	if (spa->spa_root_vdev != NULL) {
 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 		fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname);
 		fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state);
 		fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
 		    spa->spa_uberblock.ub_timestamp);
 		fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
 		    spa->spa_load_info);
 		fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA,
 		    spa->spa_errata);
 
 		/*
 		 * If the bootfs property exists on this pool then we
 		 * copy it out so that external consumers can tell which
 		 * pools are bootable.
 		 */
 		if ((!error || error == EEXIST) && spa->spa_bootfs) {
 			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 
 			/*
 			 * We have to play games with the name since the
 			 * pool was opened as TRYIMPORT_NAME.
 			 */
 			if (dsl_dsobj_to_dsname(spa_name(spa),
 			    spa->spa_bootfs, tmpname) == 0) {
 				char *cp;
 				char *dsname;
 
 				dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 
 				cp = strchr(tmpname, '/');
 				if (cp == NULL) {
 					(void) strlcpy(dsname, tmpname,
 					    MAXPATHLEN);
 				} else {
 					(void) snprintf(dsname, MAXPATHLEN,
 					    "%s/%s", poolname, ++cp);
 				}
 				fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS,
 				    dsname);
 				kmem_free(dsname, MAXPATHLEN);
 			}
 			kmem_free(tmpname, MAXPATHLEN);
 		}
 
 		/*
 		 * Add the list of hot spares and level 2 cache devices.
 		 */
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		spa_add_spares(spa, config);
 		spa_add_l2cache(spa, config);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 	}
 
 	spa_unload(spa);
 	spa_deactivate(spa);
 	spa_remove(spa);
 	mutex_exit(&spa_namespace_lock);
 
 	return (config);
 }
 
 /*
  * Pool export/destroy
  *
  * The act of destroying or exporting a pool is very simple.  We make sure there
  * is no more pending I/O and any references to the pool are gone.  Then, we
  * update the pool state and sync all the labels to disk, removing the
  * configuration from the cache afterwards. If the 'hardforce' flag is set, then
  * we don't sync the labels or remove the configuration cache.
  */
 static int
 spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
     boolean_t force, boolean_t hardforce)
 {
 	int error;
 	spa_t *spa;
 
 	if (oldconfig)
 		*oldconfig = NULL;
 
 	if (!(spa_mode_global & SPA_MODE_WRITE))
 		return (SET_ERROR(EROFS));
 
 	mutex_enter(&spa_namespace_lock);
 	if ((spa = spa_lookup(pool)) == NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(ENOENT));
 	}
 
 	if (spa->spa_is_exporting) {
 		/* the pool is being exported by another thread */
 		mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS));
 	}
 	spa->spa_is_exporting = B_TRUE;
 
 	/*
 	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
 	 * reacquire the namespace lock, and see if we can export.
 	 */
 	spa_open_ref(spa, FTAG);
 	mutex_exit(&spa_namespace_lock);
 	spa_async_suspend(spa);
 	if (spa->spa_zvol_taskq) {
 		zvol_remove_minors(spa, spa_name(spa), B_TRUE);
 		taskq_wait(spa->spa_zvol_taskq);
 	}
 	mutex_enter(&spa_namespace_lock);
 	spa_close(spa, FTAG);
 
 	if (spa->spa_state == POOL_STATE_UNINITIALIZED)
 		goto export_spa;
 	/*
 	 * The pool will be in core if it's openable, in which case we can
 	 * modify its state.  Objsets may be open only because they're dirty,
 	 * so we have to force it to sync before checking spa_refcnt.
 	 */
 	if (spa->spa_sync_on) {
 		txg_wait_synced(spa->spa_dsl_pool, 0);
 		spa_evicting_os_wait(spa);
 	}
 
 	/*
 	 * A pool cannot be exported or destroyed if there are active
 	 * references.  If we are resetting a pool, allow references by
 	 * fault injection handlers.
 	 */
 	if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) {
 		error = SET_ERROR(EBUSY);
 		goto fail;
 	}
 
 	if (spa->spa_sync_on) {
 		vdev_t *rvd = spa->spa_root_vdev;
 		/*
 		 * A pool cannot be exported if it has an active shared spare.
 		 * This is to prevent other pools stealing the active spare
 		 * from an exported pool. At user's own will, such pool can
 		 * be forcedly exported.
 		 */
 		if (!force && new_state == POOL_STATE_EXPORTED &&
 		    spa_has_active_shared_spare(spa)) {
 			error = SET_ERROR(EXDEV);
 			goto fail;
 		}
 
 		/*
 		 * We're about to export or destroy this pool. Make sure
 		 * we stop all initialization and trim activity here before
 		 * we set the spa_final_txg. This will ensure that all
 		 * dirty data resulting from the initialization is
 		 * committed to disk before we unload the pool.
 		 */
 		vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
 		vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
 		vdev_autotrim_stop_all(spa);
 		vdev_rebuild_stop_all(spa);
 
 		/*
 		 * We want this to be reflected on every label,
 		 * so mark them all dirty.  spa_unload() will do the
 		 * final sync that pushes these changes out.
 		 */
 		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 			spa->spa_state = new_state;
 			vdev_config_dirty(rvd);
 			spa_config_exit(spa, SCL_ALL, FTAG);
 		}
 
 		/*
 		 * If the log space map feature is enabled and the pool is
 		 * getting exported (but not destroyed), we want to spend some
 		 * time flushing as many metaslabs as we can in an attempt to
 		 * destroy log space maps and save import time. This has to be
 		 * done before we set the spa_final_txg, otherwise
 		 * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs.
 		 * spa_should_flush_logs_on_unload() should be called after
 		 * spa_state has been set to the new_state.
 		 */
 		if (spa_should_flush_logs_on_unload(spa))
 			spa_unload_log_sm_flush_all(spa);
 
 		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 			spa->spa_final_txg = spa_last_synced_txg(spa) +
 			    TXG_DEFER_SIZE + 1;
 			spa_config_exit(spa, SCL_ALL, FTAG);
 		}
 	}
 
 export_spa:
 	spa_export_os(spa);
 
 	if (new_state == POOL_STATE_DESTROYED)
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
 	else if (new_state == POOL_STATE_EXPORTED)
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT);
 
 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
 		spa_unload(spa);
 		spa_deactivate(spa);
 	}
 
 	if (oldconfig && spa->spa_config)
 		*oldconfig = fnvlist_dup(spa->spa_config);
 
 	if (new_state != POOL_STATE_UNINITIALIZED) {
 		if (!hardforce)
 			spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
 		spa_remove(spa);
 	} else {
 		/*
 		 * If spa_remove() is not called for this spa_t and
 		 * there is any possibility that it can be reused,
 		 * we make sure to reset the exporting flag.
 		 */
 		spa->spa_is_exporting = B_FALSE;
 	}
 
 	mutex_exit(&spa_namespace_lock);
 	return (0);
 
 fail:
 	spa->spa_is_exporting = B_FALSE;
 	spa_async_resume(spa);
 	mutex_exit(&spa_namespace_lock);
 	return (error);
 }
 
 /*
  * Destroy a storage pool.
  */
 int
 spa_destroy(const char *pool)
 {
 	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
 	    B_FALSE, B_FALSE));
 }
 
 /*
  * Export a storage pool.
  */
 int
 spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force,
     boolean_t hardforce)
 {
 	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
 	    force, hardforce));
 }
 
 /*
  * Similar to spa_export(), this unloads the spa_t without actually removing it
  * from the namespace in any way.
  */
 int
 spa_reset(const char *pool)
 {
 	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
 	    B_FALSE, B_FALSE));
 }
 
 /*
  * ==========================================================================
  * Device manipulation
  * ==========================================================================
  */
 
 /*
  * This is called as a synctask to increment the draid feature flag
  */
 static void
 spa_draid_feature_incr(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	int draid = (int)(uintptr_t)arg;
 
 	for (int c = 0; c < draid; c++)
 		spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
 }
 
 /*
  * Add a device to a storage pool.
  */
 int
 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 {
 	uint64_t txg, ndraid = 0;
 	int error;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd, *tvd;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
 	    VDEV_ALLOC_ADD)) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, error));
 
 	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */
 
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
 	    &nspares) != 0)
 		nspares = 0;
 
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
 	    &nl2cache) != 0)
 		nl2cache = 0;
 
 	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
 		return (spa_vdev_exit(spa, vd, txg, EINVAL));
 
 	if (vd->vdev_children != 0 &&
 	    (error = vdev_create(vd, txg, B_FALSE)) != 0) {
 		return (spa_vdev_exit(spa, vd, txg, error));
 	}
 
 	/*
 	 * The virtual dRAID spares must be added after vdev tree is created
 	 * and the vdev guids are generated.  The guid of their associated
 	 * dRAID is stored in the config and used when opening the spare.
 	 */
 	if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid,
 	    rvd->vdev_children)) == 0) {
 		if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot,
 		    ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)
 			nspares = 0;
 	} else {
 		return (spa_vdev_exit(spa, vd, txg, error));
 	}
 
 	/*
 	 * We must validate the spares and l2cache devices after checking the
 	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
 	 */
 	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
 		return (spa_vdev_exit(spa, vd, txg, error));
 
 	/*
 	 * If we are in the middle of a device removal, we can only add
 	 * devices which match the existing devices in the pool.
 	 * If we are in the middle of a removal, or have some indirect
 	 * vdevs, we can not add raidz or dRAID top levels.
 	 */
 	if (spa->spa_vdev_removal != NULL ||
 	    spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
 		for (int c = 0; c < vd->vdev_children; c++) {
 			tvd = vd->vdev_child[c];
 			if (spa->spa_vdev_removal != NULL &&
 			    tvd->vdev_ashift != spa->spa_max_ashift) {
 				return (spa_vdev_exit(spa, vd, txg, EINVAL));
 			}
 			/* Fail if top level vdev is raidz or a dRAID */
 			if (vdev_get_nparity(tvd) != 0)
 				return (spa_vdev_exit(spa, vd, txg, EINVAL));
 
 			/*
 			 * Need the top level mirror to be
 			 * a mirror of leaf vdevs only
 			 */
 			if (tvd->vdev_ops == &vdev_mirror_ops) {
 				for (uint64_t cid = 0;
 				    cid < tvd->vdev_children; cid++) {
 					vdev_t *cvd = tvd->vdev_child[cid];
 					if (!cvd->vdev_ops->vdev_op_leaf) {
 						return (spa_vdev_exit(spa, vd,
 						    txg, EINVAL));
 					}
 				}
 			}
 		}
 	}
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		tvd = vd->vdev_child[c];
 		vdev_remove_child(vd, tvd);
 		tvd->vdev_id = rvd->vdev_children;
 		vdev_add_child(rvd, tvd);
 		vdev_config_dirty(tvd);
 	}
 
 	if (nspares != 0) {
 		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
 		    ZPOOL_CONFIG_SPARES);
 		spa_load_spares(spa);
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 
 	if (nl2cache != 0) {
 		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
 		    ZPOOL_CONFIG_L2CACHE);
 		spa_load_l2cache(spa);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * We can't increment a feature while holding spa_vdev so we
 	 * have to do it in a synctask.
 	 */
 	if (ndraid != 0) {
 		dmu_tx_t *tx;
 
 		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 		dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr,
 		    (void *)(uintptr_t)ndraid, tx);
 		dmu_tx_commit(tx);
 	}
 
 	/*
 	 * We have to be careful when adding new vdevs to an existing pool.
 	 * If other threads start allocating from these vdevs before we
 	 * sync the config cache, and we lose power, then upon reboot we may
 	 * fail to open the pool because there are DVAs that the config cache
 	 * can't translate.  Therefore, we first add the vdevs without
 	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
 	 * and then let spa_config_update() initialize the new metaslabs.
 	 *
 	 * spa_load() checks for added-but-not-initialized vdevs, so that
 	 * if we lose power at any point in this sequence, the remaining
 	 * steps will be completed the next time we load the pool.
 	 */
 	(void) spa_vdev_exit(spa, vd, txg, 0);
 
 	mutex_enter(&spa_namespace_lock);
 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
 
 /*
  * Attach a device to a vdev specified by its guid.  The vdev type can be
  * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a
  * single device). When the vdev is a single device, a mirror vdev will be
  * automatically inserted.
  *
  * If 'replacing' is specified, the new device is intended to replace the
  * existing device; in this case the two devices are made into their own
  * mirror using the 'replacing' vdev, which is functionally identical to
  * the mirror vdev (it actually reuses all the same ops) but has a few
  * extra rules: you can't attach to it after it's been created, and upon
  * completion of resilvering, the first disk (the one being replaced)
  * is automatically detached.
  *
  * If 'rebuild' is specified, then sequential reconstruction (a.ka. rebuild)
  * should be performed instead of traditional healing reconstruction.  From
  * an administrators perspective these are both resilver operations.
  */
 int
 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
     int rebuild)
 {
 	uint64_t txg, dtl_max_txg;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
 	vdev_ops_t *pvops;
 	char *oldvdpath, *newvdpath;
 	int newvd_isspare = B_FALSE;
 	int error;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 		error = (spa_has_checkpoint(spa)) ?
 		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
 		return (spa_vdev_exit(spa, NULL, txg, error));
 	}
 
 	if (rebuild) {
 		if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
 			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 		if (dsl_scan_resilvering(spa_get_dsl(spa)) ||
 		    dsl_scan_resilver_scheduled(spa_get_dsl(spa))) {
 			return (spa_vdev_exit(spa, NULL, txg,
 			    ZFS_ERR_RESILVER_IN_PROGRESS));
 		}
 	} else {
 		if (vdev_rebuild_active(rvd))
 			return (spa_vdev_exit(spa, NULL, txg,
 			    ZFS_ERR_REBUILD_IN_PROGRESS));
 	}
 
 	if (spa->spa_vdev_removal != NULL) {
 		return (spa_vdev_exit(spa, NULL, txg,
 		    ZFS_ERR_DEVRM_IN_PROGRESS));
 	}
 
 	if (oldvd == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
 	boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops;
 
 	if (raidz) {
 		if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION))
 			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 		/*
 		 * Can't expand a raidz while prior expand is in progress.
 		 */
 		if (spa->spa_raidz_expand != NULL) {
 			return (spa_vdev_exit(spa, NULL, txg,
 			    ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS));
 		}
 	} else if (!oldvd->vdev_ops->vdev_op_leaf) {
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 	}
 
 	if (raidz)
 		pvd = oldvd;
 	else
 		pvd = oldvd->vdev_parent;
 
 	if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
 	    VDEV_ALLOC_ATTACH) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	if (newrootvd->vdev_children != 1)
 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
 
 	newvd = newrootvd->vdev_child[0];
 
 	if (!newvd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
 
 	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
 		return (spa_vdev_exit(spa, newrootvd, txg, error));
 
 	/*
 	 * log, dedup and special vdevs should not be replaced by spares.
 	 */
 	if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE ||
 	    oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) {
 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 	}
 
 	/*
 	 * A dRAID spare can only replace a child of its parent dRAID vdev.
 	 */
 	if (newvd->vdev_ops == &vdev_draid_spare_ops &&
 	    oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) {
 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 	}
 
 	if (rebuild) {
 		/*
 		 * For rebuilds, the top vdev must support reconstruction
 		 * using only space maps.  This means the only allowable
 		 * vdevs types are the root vdev, a mirror, or dRAID.
 		 */
 		tvd = pvd;
 		if (pvd->vdev_top != NULL)
 			tvd = pvd->vdev_top;
 
 		if (tvd->vdev_ops != &vdev_mirror_ops &&
 		    tvd->vdev_ops != &vdev_root_ops &&
 		    tvd->vdev_ops != &vdev_draid_ops) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 		}
 	}
 
 	if (!replacing) {
 		/*
 		 * For attach, the only allowable parent is a mirror or
 		 * the root vdev. A raidz vdev can be attached to, but
 		 * you cannot attach to a raidz child.
 		 */
 		if (pvd->vdev_ops != &vdev_mirror_ops &&
 		    pvd->vdev_ops != &vdev_root_ops &&
 		    !raidz)
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 		pvops = &vdev_mirror_ops;
 	} else {
 		/*
 		 * Active hot spares can only be replaced by inactive hot
 		 * spares.
 		 */
 		if (pvd->vdev_ops == &vdev_spare_ops &&
 		    oldvd->vdev_isspare &&
 		    !spa_has_spare(spa, newvd->vdev_guid))
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 		/*
 		 * If the source is a hot spare, and the parent isn't already a
 		 * spare, then we want to create a new hot spare.  Otherwise, we
 		 * want to create a replacing vdev.  The user is not allowed to
 		 * attach to a spared vdev child unless the 'isspare' state is
 		 * the same (spare replaces spare, non-spare replaces
 		 * non-spare).
 		 */
 		if (pvd->vdev_ops == &vdev_replacing_ops &&
 		    spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 		} else if (pvd->vdev_ops == &vdev_spare_ops &&
 		    newvd->vdev_isspare != oldvd->vdev_isspare) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 		}
 
 		if (newvd->vdev_isspare)
 			pvops = &vdev_spare_ops;
 		else
 			pvops = &vdev_replacing_ops;
 	}
 
 	/*
 	 * Make sure the new device is big enough.
 	 */
 	vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd;
 	if (newvd->vdev_asize < vdev_get_min_asize(min_vdev))
 		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
 
 	/*
 	 * The new device cannot have a higher alignment requirement
 	 * than the top-level vdev.
 	 */
 	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 	/*
 	 * RAIDZ-expansion-specific checks.
 	 */
 	if (raidz) {
 		if (vdev_raidz_attach_check(newvd) != 0)
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 		/*
 		 * Fail early if a child is not healthy or being replaced
 		 */
 		for (int i = 0; i < oldvd->vdev_children; i++) {
 			if (vdev_is_dead(oldvd->vdev_child[i]) ||
 			    !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) {
 				return (spa_vdev_exit(spa, newrootvd, txg,
 				    ENXIO));
 			}
 			/* Also fail if reserved boot area is in-use */
 			if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i])
 			    != 0) {
 				return (spa_vdev_exit(spa, newrootvd, txg,
 				    EADDRINUSE));
 			}
 		}
 	}
 
 	if (raidz) {
 		/*
 		 * Note: oldvdpath is freed by spa_strfree(),  but
 		 * kmem_asprintf() is freed by kmem_strfree(), so we have to
 		 * move it to a spa_strdup-ed string.
 		 */
 		char *tmp = kmem_asprintf("raidz%u-%u",
 		    (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id);
 		oldvdpath = spa_strdup(tmp);
 		kmem_strfree(tmp);
 	} else {
 		oldvdpath = spa_strdup(oldvd->vdev_path);
 	}
 	newvdpath = spa_strdup(newvd->vdev_path);
 
 	/*
 	 * If this is an in-place replacement, update oldvd's path and devid
 	 * to make it distinguishable from newvd, and unopenable from now on.
 	 */
 	if (strcmp(oldvdpath, newvdpath) == 0) {
 		spa_strfree(oldvd->vdev_path);
 		oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5,
 		    KM_SLEEP);
 		(void) sprintf(oldvd->vdev_path, "%s/old",
 		    newvdpath);
 		if (oldvd->vdev_devid != NULL) {
 			spa_strfree(oldvd->vdev_devid);
 			oldvd->vdev_devid = NULL;
 		}
 		spa_strfree(oldvdpath);
 		oldvdpath = spa_strdup(oldvd->vdev_path);
 	}
 
 	/*
 	 * If the parent is not a mirror, or if we're replacing, insert the new
 	 * mirror/replacing/spare vdev above oldvd.
 	 */
 	if (!raidz && pvd->vdev_ops != pvops) {
 		pvd = vdev_add_parent(oldvd, pvops);
 		ASSERT(pvd->vdev_ops == pvops);
 		ASSERT(oldvd->vdev_parent == pvd);
 	}
 
 	ASSERT(pvd->vdev_top->vdev_parent == rvd);
 
 	/*
 	 * Extract the new device from its root and add it to pvd.
 	 */
 	vdev_remove_child(newrootvd, newvd);
 	newvd->vdev_id = pvd->vdev_children;
 	newvd->vdev_crtxg = oldvd->vdev_crtxg;
 	vdev_add_child(pvd, newvd);
 
 	/*
 	 * Reevaluate the parent vdev state.
 	 */
 	vdev_propagate_state(pvd);
 
 	tvd = newvd->vdev_top;
 	ASSERT(pvd->vdev_top == tvd);
 	ASSERT(tvd->vdev_parent == rvd);
 
 	vdev_config_dirty(tvd);
 
 	/*
 	 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
 	 * for any dmu_sync-ed blocks.  It will propagate upward when
 	 * spa_vdev_exit() calls vdev_dtl_reassess().
 	 */
 	dtl_max_txg = txg + TXG_CONCURRENT_STATES;
 
 	if (raidz) {
 		/*
 		 * Wait for the youngest allocations and frees to sync,
 		 * and then wait for the deferral of those frees to finish.
 		 */
 		spa_vdev_config_exit(spa, NULL,
 		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
 
 		vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE);
 		vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE);
 		vdev_autotrim_stop_wait(tvd);
 
 		dtl_max_txg = spa_vdev_config_enter(spa);
 
 		tvd->vdev_rz_expanding = B_TRUE;
 
 		vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg);
 		vdev_config_dirty(tvd);
 
 		dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
 		    dtl_max_txg);
 		dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync,
 		    newvd, tx);
 		dmu_tx_commit(tx);
 	} else {
 		vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
 		    dtl_max_txg - TXG_INITIAL);
 
 		if (newvd->vdev_isspare) {
 			spa_spare_activate(newvd);
 			spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
 		}
 
 		newvd_isspare = newvd->vdev_isspare;
 
 		/*
 		 * Mark newvd's DTL dirty in this txg.
 		 */
 		vdev_dirty(tvd, VDD_DTL, newvd, txg);
 
 		/*
 		 * Schedule the resilver or rebuild to restart in the future.
 		 * We do this to ensure that dmu_sync-ed blocks have been
 		 * stitched into the respective datasets.
 		 */
 		if (rebuild) {
 			newvd->vdev_rebuild_txg = txg;
 
 			vdev_rebuild(tvd);
 		} else {
 			newvd->vdev_resilver_txg = txg;
 
 			if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
 			    spa_feature_is_enabled(spa,
 			    SPA_FEATURE_RESILVER_DEFER)) {
 				vdev_defer_resilver(newvd);
 			} else {
 				dsl_scan_restart_resilver(spa->spa_dsl_pool,
 				    dtl_max_txg);
 			}
 		}
 	}
 
 	if (spa->spa_bootfs)
 		spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
 
 	spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);
 
 	/*
 	 * Commit the config
 	 */
 	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
 
 	spa_history_log_internal(spa, "vdev attach", NULL,
 	    "%s vdev=%s %s vdev=%s",
 	    replacing && newvd_isspare ? "spare in" :
 	    replacing ? "replace" : "attach", newvdpath,
 	    replacing ? "for" : "to", oldvdpath);
 
 	spa_strfree(oldvdpath);
 	spa_strfree(newvdpath);
 
 	return (0);
 }
 
 /*
  * Detach a device from a mirror or replacing vdev.
  *
  * If 'replace_done' is specified, only detach if the parent
  * is a replacing or a spare vdev.
  */
 int
 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 {
 	uint64_t txg;
 	int error;
 	vdev_t *rvd __maybe_unused = spa->spa_root_vdev;
 	vdev_t *vd, *pvd, *cvd, *tvd;
 	boolean_t unspare = B_FALSE;
 	uint64_t unspare_guid = 0;
 	char *vdpath;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_detach_enter(spa, guid);
 
 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	/*
 	 * Besides being called directly from the userland through the
 	 * ioctl interface, spa_vdev_detach() can be potentially called
 	 * at the end of spa_vdev_resilver_done().
 	 *
 	 * In the regular case, when we have a checkpoint this shouldn't
 	 * happen as we never empty the DTLs of a vdev during the scrub
 	 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done()
 	 * should never get here when we have a checkpoint.
 	 *
 	 * That said, even in a case when we checkpoint the pool exactly
 	 * as spa_vdev_resilver_done() calls this function everything
 	 * should be fine as the resilver will return right away.
 	 */
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 		error = (spa_has_checkpoint(spa)) ?
 		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
 		return (spa_vdev_exit(spa, NULL, txg, error));
 	}
 
 	if (vd == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	pvd = vd->vdev_parent;
 
 	/*
 	 * If the parent/child relationship is not as expected, don't do it.
 	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
 	 * vdev that's replacing B with C.  The user's intent in replacing
 	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
 	 * the replace by detaching C, the expected behavior is to end up
 	 * M(A,B).  But suppose that right after deciding to detach C,
 	 * the replacement of B completes.  We would have M(A,C), and then
 	 * ask to detach C, which would leave us with just A -- not what
 	 * the user wanted.  To prevent this, we make sure that the
 	 * parent/child relationship hasn't changed -- in this example,
 	 * that C's parent is still the replacing vdev R.
 	 */
 	if (pvd->vdev_guid != pguid && pguid != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
 	/*
 	 * Only 'replacing' or 'spare' vdevs can be replaced.
 	 */
 	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
 	    pvd->vdev_ops != &vdev_spare_ops)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
 	    spa_version(spa) >= SPA_VERSION_SPARES);
 
 	/*
 	 * Only mirror, replacing, and spare vdevs support detach.
 	 */
 	if (pvd->vdev_ops != &vdev_replacing_ops &&
 	    pvd->vdev_ops != &vdev_mirror_ops &&
 	    pvd->vdev_ops != &vdev_spare_ops)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	/*
 	 * If this device has the only valid copy of some data,
 	 * we cannot safely detach it.
 	 */
 	if (vdev_dtl_required(vd))
 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
 	ASSERT(pvd->vdev_children >= 2);
 
 	/*
 	 * If we are detaching the second disk from a replacing vdev, then
 	 * check to see if we changed the original vdev's path to have "/old"
 	 * at the end in spa_vdev_attach().  If so, undo that change now.
 	 */
 	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
 	    vd->vdev_path != NULL) {
 		size_t len = strlen(vd->vdev_path);
 
 		for (int c = 0; c < pvd->vdev_children; c++) {
 			cvd = pvd->vdev_child[c];
 
 			if (cvd == vd || cvd->vdev_path == NULL)
 				continue;
 
 			if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
 			    strcmp(cvd->vdev_path + len, "/old") == 0) {
 				spa_strfree(cvd->vdev_path);
 				cvd->vdev_path = spa_strdup(vd->vdev_path);
 				break;
 			}
 		}
 	}
 
 	/*
 	 * If we are detaching the original disk from a normal spare, then it
 	 * implies that the spare should become a real disk, and be removed
 	 * from the active spare list for the pool.  dRAID spares on the
 	 * other hand are coupled to the pool and thus should never be removed
 	 * from the spares list.
 	 */
 	if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) {
 		vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1];
 
 		if (last_cvd->vdev_isspare &&
 		    last_cvd->vdev_ops != &vdev_draid_spare_ops) {
 			unspare = B_TRUE;
 		}
 	}
 
 	/*
 	 * Erase the disk labels so the disk can be used for other things.
 	 * This must be done after all other error cases are handled,
 	 * but before we disembowel vd (so we can still do I/O to it).
 	 * But if we can't do it, don't treat the error as fatal --
 	 * it may be that the unwritability of the disk is the reason
 	 * it's being detached!
 	 */
 	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
 
 	/*
 	 * Remove vd from its parent and compact the parent's children.
 	 */
 	vdev_remove_child(pvd, vd);
 	vdev_compact_children(pvd);
 
 	/*
 	 * Remember one of the remaining children so we can get tvd below.
 	 */
 	cvd = pvd->vdev_child[pvd->vdev_children - 1];
 
 	/*
 	 * If we need to remove the remaining child from the list of hot spares,
 	 * do it now, marking the vdev as no longer a spare in the process.
 	 * We must do this before vdev_remove_parent(), because that can
 	 * change the GUID if it creates a new toplevel GUID.  For a similar
 	 * reason, we must remove the spare now, in the same txg as the detach;
 	 * otherwise someone could attach a new sibling, change the GUID, and
 	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
 	 */
 	if (unspare) {
 		ASSERT(cvd->vdev_isspare);
 		spa_spare_remove(cvd);
 		unspare_guid = cvd->vdev_guid;
 		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
 		cvd->vdev_unspare = B_TRUE;
 	}
 
 	/*
 	 * If the parent mirror/replacing vdev only has one child,
 	 * the parent is no longer needed.  Remove it from the tree.
 	 */
 	if (pvd->vdev_children == 1) {
 		if (pvd->vdev_ops == &vdev_spare_ops)
 			cvd->vdev_unspare = B_FALSE;
 		vdev_remove_parent(cvd);
 	}
 
 	/*
 	 * We don't set tvd until now because the parent we just removed
 	 * may have been the previous top-level vdev.
 	 */
 	tvd = cvd->vdev_top;
 	ASSERT(tvd->vdev_parent == rvd);
 
 	/*
 	 * Reevaluate the parent vdev state.
 	 */
 	vdev_propagate_state(cvd);
 
 	/*
 	 * If the 'autoexpand' property is set on the pool then automatically
 	 * try to expand the size of the pool. For example if the device we
 	 * just detached was smaller than the others, it may be possible to
 	 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
 	 * first so that we can obtain the updated sizes of the leaf vdevs.
 	 */
 	if (spa->spa_autoexpand) {
 		vdev_reopen(tvd);
 		vdev_expand(tvd, txg);
 	}
 
 	vdev_config_dirty(tvd);
 
 	/*
 	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
 	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
 	 * But first make sure we're not on any *other* txg's DTL list, to
 	 * prevent vd from being accessed after it's freed.
 	 */
 	vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none");
 	for (int t = 0; t < TXG_SIZE; t++)
 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
 	vd->vdev_detached = B_TRUE;
 	vdev_dirty(tvd, VDD_DTL, vd, txg);
 
 	spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
 	spa_notify_waiters(spa);
 
 	/* hang on to the spa before we release the lock */
 	spa_open_ref(spa, FTAG);
 
 	error = spa_vdev_exit(spa, vd, txg, 0);
 
 	spa_history_log_internal(spa, "detach", NULL,
 	    "vdev=%s", vdpath);
 	spa_strfree(vdpath);
 
 	/*
 	 * If this was the removal of the original device in a hot spare vdev,
 	 * then we want to go through and remove the device from the hot spare
 	 * list of every other pool.
 	 */
 	if (unspare) {
 		spa_t *altspa = NULL;
 
 		mutex_enter(&spa_namespace_lock);
 		while ((altspa = spa_next(altspa)) != NULL) {
 			if (altspa->spa_state != POOL_STATE_ACTIVE ||
 			    altspa == spa)
 				continue;
 
 			spa_open_ref(altspa, FTAG);
 			mutex_exit(&spa_namespace_lock);
 			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
 			mutex_enter(&spa_namespace_lock);
 			spa_close(altspa, FTAG);
 		}
 		mutex_exit(&spa_namespace_lock);
 
 		/* search the rest of the vdevs for spares to remove */
 		spa_vdev_resilver_done(spa);
 	}
 
 	/* all done with the spa; OK to release */
 	mutex_enter(&spa_namespace_lock);
 	spa_close(spa, FTAG);
 	mutex_exit(&spa_namespace_lock);
 
 	return (error);
 }
 
 static int
 spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
     list_t *vd_list)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 
 	/* Look up vdev and ensure it's a leaf. */
 	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 	if (vd == NULL || vd->vdev_detached) {
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		return (SET_ERROR(ENODEV));
 	} else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		return (SET_ERROR(EINVAL));
 	} else if (!vdev_writeable(vd)) {
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		return (SET_ERROR(EROFS));
 	}
 	mutex_enter(&vd->vdev_initialize_lock);
 	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 
 	/*
 	 * When we activate an initialize action we check to see
 	 * if the vdev_initialize_thread is NULL. We do this instead
 	 * of using the vdev_initialize_state since there might be
 	 * a previous initialization process which has completed but
 	 * the thread is not exited.
 	 */
 	if (cmd_type == POOL_INITIALIZE_START &&
 	    (vd->vdev_initialize_thread != NULL ||
 	    vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) {
 		mutex_exit(&vd->vdev_initialize_lock);
 		return (SET_ERROR(EBUSY));
 	} else if (cmd_type == POOL_INITIALIZE_CANCEL &&
 	    (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
 	    vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
 		mutex_exit(&vd->vdev_initialize_lock);
 		return (SET_ERROR(ESRCH));
 	} else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
 	    vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
 		mutex_exit(&vd->vdev_initialize_lock);
 		return (SET_ERROR(ESRCH));
 	} else if (cmd_type == POOL_INITIALIZE_UNINIT &&
 	    vd->vdev_initialize_thread != NULL) {
 		mutex_exit(&vd->vdev_initialize_lock);
 		return (SET_ERROR(EBUSY));
 	}
 
 	switch (cmd_type) {
 	case POOL_INITIALIZE_START:
 		vdev_initialize(vd);
 		break;
 	case POOL_INITIALIZE_CANCEL:
 		vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list);
 		break;
 	case POOL_INITIALIZE_SUSPEND:
 		vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list);
 		break;
 	case POOL_INITIALIZE_UNINIT:
 		vdev_uninitialize(vd);
 		break;
 	default:
 		panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
 	}
 	mutex_exit(&vd->vdev_initialize_lock);
 
 	return (0);
 }
 
 int
 spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
     nvlist_t *vdev_errlist)
 {
 	int total_errors = 0;
 	list_t vd_list;
 
 	list_create(&vd_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_initialize_node));
 
 	/*
 	 * We hold the namespace lock through the whole function
 	 * to prevent any changes to the pool while we're starting or
 	 * stopping initialization. The config and state locks are held so that
 	 * we can properly assess the vdev state before we commit to
 	 * the initializing operation.
 	 */
 	mutex_enter(&spa_namespace_lock);
 
 	for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
 		uint64_t vdev_guid = fnvpair_value_uint64(pair);
 
 		int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type,
 		    &vd_list);
 		if (error != 0) {
 			char guid_as_str[MAXNAMELEN];
 
 			(void) snprintf(guid_as_str, sizeof (guid_as_str),
 			    "%llu", (unsigned long long)vdev_guid);
 			fnvlist_add_int64(vdev_errlist, guid_as_str, error);
 			total_errors++;
 		}
 	}
 
 	/* Wait for all initialize threads to stop. */
 	vdev_initialize_stop_wait(spa, &vd_list);
 
 	/* Sync out the initializing state */
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 	mutex_exit(&spa_namespace_lock);
 
 	list_destroy(&vd_list);
 
 	return (total_errors);
 }
 
 static int
 spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
     uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 
 	/* Look up vdev and ensure it's a leaf. */
 	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 	if (vd == NULL || vd->vdev_detached) {
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		return (SET_ERROR(ENODEV));
 	} else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		return (SET_ERROR(EINVAL));
 	} else if (!vdev_writeable(vd)) {
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		return (SET_ERROR(EROFS));
 	} else if (!vd->vdev_has_trim) {
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		return (SET_ERROR(EOPNOTSUPP));
 	} else if (secure && !vd->vdev_has_securetrim) {
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		return (SET_ERROR(EOPNOTSUPP));
 	}
 	mutex_enter(&vd->vdev_trim_lock);
 	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 
 	/*
 	 * When we activate a TRIM action we check to see if the
 	 * vdev_trim_thread is NULL. We do this instead of using the
 	 * vdev_trim_state since there might be a previous TRIM process
 	 * which has completed but the thread is not exited.
 	 */
 	if (cmd_type == POOL_TRIM_START &&
 	    (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing ||
 	    vd->vdev_top->vdev_rz_expanding)) {
 		mutex_exit(&vd->vdev_trim_lock);
 		return (SET_ERROR(EBUSY));
 	} else if (cmd_type == POOL_TRIM_CANCEL &&
 	    (vd->vdev_trim_state != VDEV_TRIM_ACTIVE &&
 	    vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) {
 		mutex_exit(&vd->vdev_trim_lock);
 		return (SET_ERROR(ESRCH));
 	} else if (cmd_type == POOL_TRIM_SUSPEND &&
 	    vd->vdev_trim_state != VDEV_TRIM_ACTIVE) {
 		mutex_exit(&vd->vdev_trim_lock);
 		return (SET_ERROR(ESRCH));
 	}
 
 	switch (cmd_type) {
 	case POOL_TRIM_START:
 		vdev_trim(vd, rate, partial, secure);
 		break;
 	case POOL_TRIM_CANCEL:
 		vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list);
 		break;
 	case POOL_TRIM_SUSPEND:
 		vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list);
 		break;
 	default:
 		panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
 	}
 	mutex_exit(&vd->vdev_trim_lock);
 
 	return (0);
 }
 
 /*
  * Initiates a manual TRIM for the requested vdevs. This kicks off individual
  * TRIM threads for each child vdev.  These threads pass over all of the free
  * space in the vdev's metaslabs and issues TRIM commands for that space.
  */
 int
 spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate,
     boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist)
 {
 	int total_errors = 0;
 	list_t vd_list;
 
 	list_create(&vd_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_trim_node));
 
 	/*
 	 * We hold the namespace lock through the whole function
 	 * to prevent any changes to the pool while we're starting or
 	 * stopping TRIM. The config and state locks are held so that
 	 * we can properly assess the vdev state before we commit to
 	 * the TRIM operation.
 	 */
 	mutex_enter(&spa_namespace_lock);
 
 	for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
 		uint64_t vdev_guid = fnvpair_value_uint64(pair);
 
 		int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type,
 		    rate, partial, secure, &vd_list);
 		if (error != 0) {
 			char guid_as_str[MAXNAMELEN];
 
 			(void) snprintf(guid_as_str, sizeof (guid_as_str),
 			    "%llu", (unsigned long long)vdev_guid);
 			fnvlist_add_int64(vdev_errlist, guid_as_str, error);
 			total_errors++;
 		}
 	}
 
 	/* Wait for all TRIM threads to stop. */
 	vdev_trim_stop_wait(spa, &vd_list);
 
 	/* Sync out the TRIM state */
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 	mutex_exit(&spa_namespace_lock);
 
 	list_destroy(&vd_list);
 
 	return (total_errors);
 }
 
 /*
  * Split a set of devices from their mirrors, and create a new pool from them.
  */
 int
 spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config,
     nvlist_t *props, boolean_t exp)
 {
 	int error = 0;
 	uint64_t txg, *glist;
 	spa_t *newspa;
 	uint_t c, children, lastlog;
 	nvlist_t **child, *nvl, *tmp;
 	dmu_tx_t *tx;
 	const char *altroot = NULL;
 	vdev_t *rvd, **vml = NULL;			/* vdev modify list */
 	boolean_t activate_slog;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 		error = (spa_has_checkpoint(spa)) ?
 		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
 		return (spa_vdev_exit(spa, NULL, txg, error));
 	}
 
 	/* clear the log and flush everything up to now */
 	activate_slog = spa_passivate_log(spa);
 	(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
 	error = spa_reset_logs(spa);
 	txg = spa_vdev_config_enter(spa);
 
 	if (activate_slog)
 		spa_activate_log(spa);
 
 	if (error != 0)
 		return (spa_vdev_exit(spa, NULL, txg, error));
 
 	/* check new spa name before going any further */
 	if (spa_lookup(newname) != NULL)
 		return (spa_vdev_exit(spa, NULL, txg, EEXIST));
 
 	/*
 	 * scan through all the children to ensure they're all mirrors
 	 */
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
 	    nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
 	    &children) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	/* first, check to ensure we've got the right child count */
 	rvd = spa->spa_root_vdev;
 	lastlog = 0;
 	for (c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
 
 		/* don't count the holes & logs as children */
 		if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops &&
 		    !vdev_is_concrete(vd))) {
 			if (lastlog == 0)
 				lastlog = c;
 			continue;
 		}
 
 		lastlog = 0;
 	}
 	if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	/* next, ensure no spare or cache devices are part of the split */
 	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
 	    nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
 	glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
 
 	/* then, loop over each vdev and validate it */
 	for (c = 0; c < children; c++) {
 		uint64_t is_hole = 0;
 
 		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
 		    &is_hole);
 
 		if (is_hole != 0) {
 			if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
 			    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
 				continue;
 			} else {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 		}
 
 		/* deal with indirect vdevs */
 		if (spa->spa_root_vdev->vdev_child[c]->vdev_ops ==
 		    &vdev_indirect_ops)
 			continue;
 
 		/* which disk is going to be split? */
 		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
 		    &glist[c]) != 0) {
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		/* look it up in the spa */
 		vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
 		if (vml[c] == NULL) {
 			error = SET_ERROR(ENODEV);
 			break;
 		}
 
 		/* make sure there's nothing stopping the split */
 		if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
 		    vml[c]->vdev_islog ||
 		    !vdev_is_concrete(vml[c]) ||
 		    vml[c]->vdev_isspare ||
 		    vml[c]->vdev_isl2cache ||
 		    !vdev_writeable(vml[c]) ||
 		    vml[c]->vdev_children != 0 ||
 		    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
 		    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		if (vdev_dtl_required(vml[c]) ||
 		    vdev_resilver_needed(vml[c], NULL, NULL)) {
 			error = SET_ERROR(EBUSY);
 			break;
 		}
 
 		/* we need certain info from the top level */
 		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
 		    vml[c]->vdev_top->vdev_ms_array);
 		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
 		    vml[c]->vdev_top->vdev_ms_shift);
 		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
 		    vml[c]->vdev_top->vdev_asize);
 		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
 		    vml[c]->vdev_top->vdev_ashift);
 
 		/* transfer per-vdev ZAPs */
 		ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
 		VERIFY0(nvlist_add_uint64(child[c],
 		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
 
 		ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
 		VERIFY0(nvlist_add_uint64(child[c],
 		    ZPOOL_CONFIG_VDEV_TOP_ZAP,
 		    vml[c]->vdev_parent->vdev_top_zap));
 	}
 
 	if (error != 0) {
 		kmem_free(vml, children * sizeof (vdev_t *));
 		kmem_free(glist, children * sizeof (uint64_t));
 		return (spa_vdev_exit(spa, NULL, txg, error));
 	}
 
 	/* stop writers from using the disks */
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL)
 			vml[c]->vdev_offline = B_TRUE;
 	}
 	vdev_reopen(spa->spa_root_vdev);
 
 	/*
 	 * Temporarily record the splitting vdevs in the spa config.  This
 	 * will disappear once the config is regenerated.
 	 */
 	nvl = fnvlist_alloc();
 	fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children);
 	kmem_free(glist, children * sizeof (uint64_t));
 
 	mutex_enter(&spa->spa_props_lock);
 	fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl);
 	mutex_exit(&spa->spa_props_lock);
 	spa->spa_config_splitting = nvl;
 	vdev_config_dirty(spa->spa_root_vdev);
 
 	/* configure and create the new pool */
 	fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname);
 	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
 	    exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE);
 	fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa));
 	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg);
 	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
 	    spa_generate_guid(NULL));
 	VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 
 	/* add the new pool to the namespace */
 	newspa = spa_add(newname, config, altroot);
 	newspa->spa_avz_action = AVZ_ACTION_REBUILD;
 	newspa->spa_config_txg = spa->spa_config_txg;
 	spa_set_log_state(newspa, SPA_LOG_CLEAR);
 
 	/* release the spa config lock, retaining the namespace lock */
 	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
 
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, FTAG, 1);
 
 	spa_activate(newspa, spa_mode_global);
 	spa_async_suspend(newspa);
 
 	/*
 	 * Temporarily stop the initializing and TRIM activity.  We set the
 	 * state to ACTIVE so that we know to resume initializing or TRIM
 	 * once the split has completed.
 	 */
 	list_t vd_initialize_list;
 	list_create(&vd_initialize_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_initialize_node));
 
 	list_t vd_trim_list;
 	list_create(&vd_trim_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_trim_node));
 
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
 			mutex_enter(&vml[c]->vdev_initialize_lock);
 			vdev_initialize_stop(vml[c],
 			    VDEV_INITIALIZE_ACTIVE, &vd_initialize_list);
 			mutex_exit(&vml[c]->vdev_initialize_lock);
 
 			mutex_enter(&vml[c]->vdev_trim_lock);
 			vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list);
 			mutex_exit(&vml[c]->vdev_trim_lock);
 		}
 	}
 
 	vdev_initialize_stop_wait(spa, &vd_initialize_list);
 	vdev_trim_stop_wait(spa, &vd_trim_list);
 
 	list_destroy(&vd_initialize_list);
 	list_destroy(&vd_trim_list);
 
 	newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
 	newspa->spa_is_splitting = B_TRUE;
 
 	/* create the new pool from the disks of the original pool */
 	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE);
 	if (error)
 		goto out;
 
 	/* if that worked, generate a real config for the new pool */
 	if (newspa->spa_root_vdev != NULL) {
 		newspa->spa_config_splitting = fnvlist_alloc();
 		fnvlist_add_uint64(newspa->spa_config_splitting,
 		    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa));
 		spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
 		    B_TRUE));
 	}
 
 	/* set the props */
 	if (props != NULL) {
 		spa_configfile_set(newspa, props, B_FALSE);
 		error = spa_prop_set(newspa, props);
 		if (error)
 			goto out;
 	}
 
 	/* flush everything */
 	txg = spa_vdev_config_enter(newspa);
 	vdev_config_dirty(newspa->spa_root_vdev);
 	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
 
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, FTAG, 2);
 
 	spa_async_resume(newspa);
 
 	/* finally, update the original pool's config */
 	txg = spa_vdev_config_enter(spa);
 	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error != 0)
 		dmu_tx_abort(tx);
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
 			vdev_t *tvd = vml[c]->vdev_top;
 
 			/*
 			 * Need to be sure the detachable VDEV is not
 			 * on any *other* txg's DTL list to prevent it
 			 * from being accessed after it's freed.
 			 */
 			for (int t = 0; t < TXG_SIZE; t++) {
 				(void) txg_list_remove_this(
 				    &tvd->vdev_dtl_list, vml[c], t);
 			}
 
 			vdev_split(vml[c]);
 			if (error == 0)
 				spa_history_log_internal(spa, "detach", tx,
 				    "vdev=%s", vml[c]->vdev_path);
 
 			vdev_free(vml[c]);
 		}
 	}
 	spa->spa_avz_action = AVZ_ACTION_REBUILD;
 	vdev_config_dirty(spa->spa_root_vdev);
 	spa->spa_config_splitting = NULL;
 	nvlist_free(nvl);
 	if (error == 0)
 		dmu_tx_commit(tx);
 	(void) spa_vdev_exit(spa, NULL, txg, 0);
 
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, FTAG, 3);
 
 	/* split is complete; log a history record */
 	spa_history_log_internal(newspa, "split", NULL,
 	    "from pool %s", spa_name(spa));
 
 	newspa->spa_is_splitting = B_FALSE;
 	kmem_free(vml, children * sizeof (vdev_t *));
 
 	/* if we're not going to mount the filesystems in userland, export */
 	if (exp)
 		error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
 		    B_FALSE, B_FALSE);
 
 	return (error);
 
 out:
 	spa_unload(newspa);
 	spa_deactivate(newspa);
 	spa_remove(newspa);
 
 	txg = spa_vdev_config_enter(spa);
 
 	/* re-online all offlined disks */
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL)
 			vml[c]->vdev_offline = B_FALSE;
 	}
 
 	/* restart initializing or trimming disks as necessary */
 	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
 	spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
 	spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
 
 	vdev_reopen(spa->spa_root_vdev);
 
 	nvlist_free(spa->spa_config_splitting);
 	spa->spa_config_splitting = NULL;
 	(void) spa_vdev_exit(spa, NULL, txg, error);
 
 	kmem_free(vml, children * sizeof (vdev_t *));
 	return (error);
 }
 
 /*
  * Find any device that's done replacing, or a vdev marked 'unspare' that's
  * currently spared, so we can detach it.
  */
 static vdev_t *
 spa_vdev_resilver_done_hunt(vdev_t *vd)
 {
 	vdev_t *newvd, *oldvd;
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
 		if (oldvd != NULL)
 			return (oldvd);
 	}
 
 	/*
 	 * Check for a completed replacement.  We always consider the first
 	 * vdev in the list to be the oldest vdev, and the last one to be
 	 * the newest (see spa_vdev_attach() for how that works).  In
 	 * the case where the newest vdev is faulted, we will not automatically
 	 * remove it after a resilver completes.  This is OK as it will require
 	 * user intervention to determine which disk the admin wishes to keep.
 	 */
 	if (vd->vdev_ops == &vdev_replacing_ops) {
 		ASSERT(vd->vdev_children > 1);
 
 		newvd = vd->vdev_child[vd->vdev_children - 1];
 		oldvd = vd->vdev_child[0];
 
 		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
 		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
 		    !vdev_dtl_required(oldvd))
 			return (oldvd);
 	}
 
 	/*
 	 * Check for a completed resilver with the 'unspare' flag set.
 	 * Also potentially update faulted state.
 	 */
 	if (vd->vdev_ops == &vdev_spare_ops) {
 		vdev_t *first = vd->vdev_child[0];
 		vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
 
 		if (last->vdev_unspare) {
 			oldvd = first;
 			newvd = last;
 		} else if (first->vdev_unspare) {
 			oldvd = last;
 			newvd = first;
 		} else {
 			oldvd = NULL;
 		}
 
 		if (oldvd != NULL &&
 		    vdev_dtl_empty(newvd, DTL_MISSING) &&
 		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
 		    !vdev_dtl_required(oldvd))
 			return (oldvd);
 
 		vdev_propagate_state(vd);
 
 		/*
 		 * If there are more than two spares attached to a disk,
 		 * and those spares are not required, then we want to
 		 * attempt to free them up now so that they can be used
 		 * by other pools.  Once we're back down to a single
 		 * disk+spare, we stop removing them.
 		 */
 		if (vd->vdev_children > 2) {
 			newvd = vd->vdev_child[1];
 
 			if (newvd->vdev_isspare && last->vdev_isspare &&
 			    vdev_dtl_empty(last, DTL_MISSING) &&
 			    vdev_dtl_empty(last, DTL_OUTAGE) &&
 			    !vdev_dtl_required(newvd))
 				return (newvd);
 		}
 	}
 
 	return (NULL);
 }
 
 static void
 spa_vdev_resilver_done(spa_t *spa)
 {
 	vdev_t *vd, *pvd, *ppvd;
 	uint64_t guid, sguid, pguid, ppguid;
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
 		pvd = vd->vdev_parent;
 		ppvd = pvd->vdev_parent;
 		guid = vd->vdev_guid;
 		pguid = pvd->vdev_guid;
 		ppguid = ppvd->vdev_guid;
 		sguid = 0;
 		/*
 		 * If we have just finished replacing a hot spared device, then
 		 * we need to detach the parent's first child (the original hot
 		 * spare) as well.
 		 */
 		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
 		    ppvd->vdev_children == 2) {
 			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
 			sguid = ppvd->vdev_child[1]->vdev_guid;
 		}
 		ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
 
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
 			return;
 		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
 			return;
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	}
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/*
 	 * If a detach was not performed above replace waiters will not have
 	 * been notified.  In which case we must do so now.
 	 */
 	spa_notify_waiters(spa);
 }
 
 /*
  * Update the stored path or FRU for this vdev.
  */
 static int
 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
     boolean_t ispath)
 {
 	vdev_t *vd;
 	boolean_t sync = B_FALSE;
 
 	ASSERT(spa_writeable(spa));
 
 	spa_vdev_state_enter(spa, SCL_ALL);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENOENT));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
 	if (ispath) {
 		if (strcmp(value, vd->vdev_path) != 0) {
 			spa_strfree(vd->vdev_path);
 			vd->vdev_path = spa_strdup(value);
 			sync = B_TRUE;
 		}
 	} else {
 		if (vd->vdev_fru == NULL) {
 			vd->vdev_fru = spa_strdup(value);
 			sync = B_TRUE;
 		} else if (strcmp(value, vd->vdev_fru) != 0) {
 			spa_strfree(vd->vdev_fru);
 			vd->vdev_fru = spa_strdup(value);
 			sync = B_TRUE;
 		}
 	}
 
 	return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
 }
 
 int
 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
 {
 	return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
 }
 
 int
 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
 {
 	return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
 }
 
 /*
  * ==========================================================================
  * SPA Scanning
  * ==========================================================================
  */
 int
 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd)
 {
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 
 	if (dsl_scan_resilvering(spa->spa_dsl_pool))
 		return (SET_ERROR(EBUSY));
 
 	return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd));
 }
 
 int
 spa_scan_stop(spa_t *spa)
 {
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 	if (dsl_scan_resilvering(spa->spa_dsl_pool))
 		return (SET_ERROR(EBUSY));
 
 	return (dsl_scan_cancel(spa->spa_dsl_pool));
 }
 
 int
 spa_scan(spa_t *spa, pool_scan_func_t func)
 {
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 
 	if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
 		return (SET_ERROR(ENOTSUP));
 
 	if (func == POOL_SCAN_RESILVER &&
 	    !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * If a resilver was requested, but there is no DTL on a
 	 * writeable leaf device, we have nothing to do.
 	 */
 	if (func == POOL_SCAN_RESILVER &&
 	    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
 		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 		return (0);
 	}
 
 	if (func == POOL_SCAN_ERRORSCRUB &&
 	    !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG))
 		return (SET_ERROR(ENOTSUP));
 
 	return (dsl_scan(spa->spa_dsl_pool, func));
 }
 
 /*
  * ==========================================================================
  * SPA async task processing
  * ==========================================================================
  */
 
 static void
 spa_async_remove(spa_t *spa, vdev_t *vd)
 {
 	if (vd->vdev_remove_wanted) {
 		vd->vdev_remove_wanted = B_FALSE;
 		vd->vdev_delayed_close = B_FALSE;
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
 
 		/*
 		 * We want to clear the stats, but we don't want to do a full
 		 * vdev_clear() as that will cause us to throw away
 		 * degraded/faulted state as well as attempt to reopen the
 		 * device, all of which is a waste.
 		 */
 		vd->vdev_stat.vs_read_errors = 0;
 		vd->vdev_stat.vs_write_errors = 0;
 		vd->vdev_stat.vs_checksum_errors = 0;
 
 		vdev_state_dirty(vd->vdev_top);
 
 		/* Tell userspace that the vdev is gone. */
 		zfs_post_remove(spa, vd);
 	}
 
 	for (int c = 0; c < vd->vdev_children; c++)
 		spa_async_remove(spa, vd->vdev_child[c]);
 }
 
 static void
 spa_async_probe(spa_t *spa, vdev_t *vd)
 {
 	if (vd->vdev_probe_wanted) {
 		vd->vdev_probe_wanted = B_FALSE;
 		vdev_reopen(vd);	/* vdev_open() does the actual probe */
 	}
 
 	for (int c = 0; c < vd->vdev_children; c++)
 		spa_async_probe(spa, vd->vdev_child[c]);
 }
 
 static void
 spa_async_autoexpand(spa_t *spa, vdev_t *vd)
 {
 	if (!spa->spa_autoexpand)
 		return;
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 		spa_async_autoexpand(spa, cvd);
 	}
 
 	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
 		return;
 
 	spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND);
 }
 
 static __attribute__((noreturn)) void
 spa_async_thread(void *arg)
 {
 	spa_t *spa = (spa_t *)arg;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	int tasks;
 
 	ASSERT(spa->spa_sync_on);
 
 	mutex_enter(&spa->spa_async_lock);
 	tasks = spa->spa_async_tasks;
 	spa->spa_async_tasks = 0;
 	mutex_exit(&spa->spa_async_lock);
 
 	/*
 	 * See if the config needs to be updated.
 	 */
 	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
 		uint64_t old_space, new_space;
 
 		mutex_enter(&spa_namespace_lock);
 		old_space = metaslab_class_get_space(spa_normal_class(spa));
 		old_space += metaslab_class_get_space(spa_special_class(spa));
 		old_space += metaslab_class_get_space(spa_dedup_class(spa));
 		old_space += metaslab_class_get_space(
 		    spa_embedded_log_class(spa));
 
 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 
 		new_space = metaslab_class_get_space(spa_normal_class(spa));
 		new_space += metaslab_class_get_space(spa_special_class(spa));
 		new_space += metaslab_class_get_space(spa_dedup_class(spa));
 		new_space += metaslab_class_get_space(
 		    spa_embedded_log_class(spa));
 		mutex_exit(&spa_namespace_lock);
 
 		/*
 		 * If the pool grew as a result of the config update,
 		 * then log an internal history event.
 		 */
 		if (new_space != old_space) {
 			spa_history_log_internal(spa, "vdev online", NULL,
 			    "pool '%s' size: %llu(+%llu)",
 			    spa_name(spa), (u_longlong_t)new_space,
 			    (u_longlong_t)(new_space - old_space));
 		}
 	}
 
 	/*
 	 * See if any devices need to be marked REMOVED.
 	 */
 	if (tasks & SPA_ASYNC_REMOVE) {
 		spa_vdev_state_enter(spa, SCL_NONE);
 		spa_async_remove(spa, spa->spa_root_vdev);
 		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
 			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
 		for (int i = 0; i < spa->spa_spares.sav_count; i++)
 			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 	}
 
 	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		spa_async_autoexpand(spa, spa->spa_root_vdev);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 	}
 
 	/*
 	 * See if any devices need to be probed.
 	 */
 	if (tasks & SPA_ASYNC_PROBE) {
 		spa_vdev_state_enter(spa, SCL_NONE);
 		spa_async_probe(spa, spa->spa_root_vdev);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 	}
 
 	/*
 	 * If any devices are done replacing, detach them.
 	 */
 	if (tasks & SPA_ASYNC_RESILVER_DONE ||
 	    tasks & SPA_ASYNC_REBUILD_DONE ||
 	    tasks & SPA_ASYNC_DETACH_SPARE) {
 		spa_vdev_resilver_done(spa);
 	}
 
 	/*
 	 * Kick off a resilver.
 	 */
 	if (tasks & SPA_ASYNC_RESILVER &&
 	    !vdev_rebuild_active(spa->spa_root_vdev) &&
 	    (!dsl_scan_resilvering(dp) ||
 	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
 		dsl_scan_restart_resilver(dp, 0);
 
 	if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
 		mutex_enter(&spa_namespace_lock);
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		vdev_initialize_restart(spa->spa_root_vdev);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		mutex_exit(&spa_namespace_lock);
 	}
 
 	if (tasks & SPA_ASYNC_TRIM_RESTART) {
 		mutex_enter(&spa_namespace_lock);
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		vdev_trim_restart(spa->spa_root_vdev);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		mutex_exit(&spa_namespace_lock);
 	}
 
 	if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) {
 		mutex_enter(&spa_namespace_lock);
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		vdev_autotrim_restart(spa);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		mutex_exit(&spa_namespace_lock);
 	}
 
 	/*
 	 * Kick off L2 cache whole device TRIM.
 	 */
 	if (tasks & SPA_ASYNC_L2CACHE_TRIM) {
 		mutex_enter(&spa_namespace_lock);
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		vdev_trim_l2arc(spa);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		mutex_exit(&spa_namespace_lock);
 	}
 
 	/*
 	 * Kick off L2 cache rebuilding.
 	 */
 	if (tasks & SPA_ASYNC_L2CACHE_REBUILD) {
 		mutex_enter(&spa_namespace_lock);
 		spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER);
 		l2arc_spa_rebuild_start(spa);
 		spa_config_exit(spa, SCL_L2ARC, FTAG);
 		mutex_exit(&spa_namespace_lock);
 	}
 
 	/*
 	 * Let the world know that we're done.
 	 */
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_thread = NULL;
 	cv_broadcast(&spa->spa_async_cv);
 	mutex_exit(&spa->spa_async_lock);
 	thread_exit();
 }
 
 void
 spa_async_suspend(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_suspended++;
 	while (spa->spa_async_thread != NULL)
 		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
 	mutex_exit(&spa->spa_async_lock);
 
 	spa_vdev_remove_suspend(spa);
 
 	zthr_t *condense_thread = spa->spa_condense_zthr;
 	if (condense_thread != NULL)
 		zthr_cancel(condense_thread);
 
 	zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr;
 	if (raidz_expand_thread != NULL)
 		zthr_cancel(raidz_expand_thread);
 
 	zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
 	if (discard_thread != NULL)
 		zthr_cancel(discard_thread);
 
 	zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
 	if (ll_delete_thread != NULL)
 		zthr_cancel(ll_delete_thread);
 
 	zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
 	if (ll_condense_thread != NULL)
 		zthr_cancel(ll_condense_thread);
 }
 
 void
 spa_async_resume(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 	ASSERT(spa->spa_async_suspended != 0);
 	spa->spa_async_suspended--;
 	mutex_exit(&spa->spa_async_lock);
 	spa_restart_removal(spa);
 
 	zthr_t *condense_thread = spa->spa_condense_zthr;
 	if (condense_thread != NULL)
 		zthr_resume(condense_thread);
 
 	zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr;
 	if (raidz_expand_thread != NULL)
 		zthr_resume(raidz_expand_thread);
 
 	zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
 	if (discard_thread != NULL)
 		zthr_resume(discard_thread);
 
 	zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
 	if (ll_delete_thread != NULL)
 		zthr_resume(ll_delete_thread);
 
 	zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
 	if (ll_condense_thread != NULL)
 		zthr_resume(ll_condense_thread);
 }
 
 static boolean_t
 spa_async_tasks_pending(spa_t *spa)
 {
 	uint_t non_config_tasks;
 	uint_t config_task;
 	boolean_t config_task_suspended;
 
 	non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
 	config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
 	if (spa->spa_ccw_fail_time == 0) {
 		config_task_suspended = B_FALSE;
 	} else {
 		config_task_suspended =
 		    (gethrtime() - spa->spa_ccw_fail_time) <
 		    ((hrtime_t)zfs_ccw_retry_interval * NANOSEC);
 	}
 
 	return (non_config_tasks || (config_task && !config_task_suspended));
 }
 
 static void
 spa_async_dispatch(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 	if (spa_async_tasks_pending(spa) &&
 	    !spa->spa_async_suspended &&
 	    spa->spa_async_thread == NULL)
 		spa->spa_async_thread = thread_create(NULL, 0,
 		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
 	mutex_exit(&spa->spa_async_lock);
 }
 
 void
 spa_async_request(spa_t *spa, int task)
 {
 	zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_tasks |= task;
 	mutex_exit(&spa->spa_async_lock);
 }
 
 int
 spa_async_tasks(spa_t *spa)
 {
 	return (spa->spa_async_tasks);
 }
 
 /*
  * ==========================================================================
  * SPA syncing routines
  * ==========================================================================
  */
 
 
 static int
 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	bpobj_t *bpo = arg;
 	bpobj_enqueue(bpo, bp, bp_freed, tx);
 	return (0);
 }
 
 int
 bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx));
 }
 
 int
 bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx));
 }
 
 static int
 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	zio_t *pio = arg;
 
 	zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp,
 	    pio->io_flags));
 	return (0);
 }
 
 static int
 bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	ASSERT(!bp_freed);
 	return (spa_free_sync_cb(arg, bp, tx));
 }
 
 /*
  * Note: this simple function is not inlined to make it easier to dtrace the
  * amount of time spent syncing frees.
  */
 static void
 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
 {
 	zio_t *zio = zio_root(spa, NULL, NULL, 0);
 	bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
 	VERIFY(zio_wait(zio) == 0);
 }
 
 /*
  * Note: this simple function is not inlined to make it easier to dtrace the
  * amount of time spent syncing deferred frees.
  */
 static void
 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
 {
 	if (spa_sync_pass(spa) != 1)
 		return;
 
 	/*
 	 * Note:
 	 * If the log space map feature is active, we stop deferring
 	 * frees to the next TXG and therefore running this function
 	 * would be considered a no-op as spa_deferred_bpobj should
 	 * not have any entries.
 	 *
 	 * That said we run this function anyway (instead of returning
 	 * immediately) for the edge-case scenario where we just
 	 * activated the log space map feature in this TXG but we have
 	 * deferred frees from the previous TXG.
 	 */
 	zio_t *zio = zio_root(spa, NULL, NULL, 0);
 	VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
 	    bpobj_spa_free_sync_cb, zio, tx), ==, 0);
 	VERIFY0(zio_wait(zio));
 }
 
 static void
 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
 {
 	char *packed = NULL;
 	size_t bufsize;
 	size_t nvsize = 0;
 	dmu_buf_t *db;
 
 	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
 
 	/*
 	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
 	 * information.  This avoids the dmu_buf_will_dirty() path and
 	 * saves us a pre-read to get data we don't actually care about.
 	 */
 	bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
 	packed = vmem_alloc(bufsize, KM_SLEEP);
 
 	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
 	    KM_SLEEP) == 0);
 	memset(packed + nvsize, 0, bufsize - nvsize);
 
 	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
 
 	vmem_free(packed, bufsize);
 
 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
 	dmu_buf_will_dirty(db, tx);
 	*(uint64_t *)db->db_data = nvsize;
 	dmu_buf_rele(db, FTAG);
 }
 
 static void
 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
     const char *config, const char *entry)
 {
 	nvlist_t *nvroot;
 	nvlist_t **list;
 	int i;
 
 	if (!sav->sav_sync)
 		return;
 
 	/*
 	 * Update the MOS nvlist describing the list of available devices.
 	 * spa_validate_aux() will have already made sure this nvlist is
 	 * valid and the vdevs are labeled appropriately.
 	 */
 	if (sav->sav_object == 0) {
 		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
 		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
 		    sizeof (uint64_t), tx);
 		VERIFY(zap_update(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
 		    &sav->sav_object, tx) == 0);
 	}
 
 	nvroot = fnvlist_alloc();
 	if (sav->sav_count == 0) {
 		fnvlist_add_nvlist_array(nvroot, config,
 		    (const nvlist_t * const *)NULL, 0);
 	} else {
 		list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP);
 		for (i = 0; i < sav->sav_count; i++)
 			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
 			    B_FALSE, VDEV_CONFIG_L2CACHE);
 		fnvlist_add_nvlist_array(nvroot, config,
 		    (const nvlist_t * const *)list, sav->sav_count);
 		for (i = 0; i < sav->sav_count; i++)
 			nvlist_free(list[i]);
 		kmem_free(list, sav->sav_count * sizeof (void *));
 	}
 
 	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
 	nvlist_free(nvroot);
 
 	sav->sav_sync = B_FALSE;
 }
 
 /*
  * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
  * The all-vdev ZAP must be empty.
  */
 static void
 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	if (vd->vdev_root_zap != 0 &&
 	    spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) {
 		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
 		    vd->vdev_root_zap, tx));
 	}
 	if (vd->vdev_top_zap != 0) {
 		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
 		    vd->vdev_top_zap, tx));
 	}
 	if (vd->vdev_leaf_zap != 0) {
 		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
 		    vd->vdev_leaf_zap, tx));
 	}
 	for (uint64_t i = 0; i < vd->vdev_children; i++) {
 		spa_avz_build(vd->vdev_child[i], avz, tx);
 	}
 }
 
 static void
 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
 {
 	nvlist_t *config;
 
 	/*
 	 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
 	 * its config may not be dirty but we still need to build per-vdev ZAPs.
 	 * Similarly, if the pool is being assembled (e.g. after a split), we
 	 * need to rebuild the AVZ although the config may not be dirty.
 	 */
 	if (list_is_empty(&spa->spa_config_dirty_list) &&
 	    spa->spa_avz_action == AVZ_ACTION_NONE)
 		return;
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
 	ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
 	    spa->spa_avz_action == AVZ_ACTION_INITIALIZE ||
 	    spa->spa_all_vdev_zaps != 0);
 
 	if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
 		/* Make and build the new AVZ */
 		uint64_t new_avz = zap_create(spa->spa_meta_objset,
 		    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
 		spa_avz_build(spa->spa_root_vdev, new_avz, tx);
 
 		/* Diff old AVZ with new one */
 		zap_cursor_t zc;
 		zap_attribute_t za;
 
 		for (zap_cursor_init(&zc, spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps);
 		    zap_cursor_retrieve(&zc, &za) == 0;
 		    zap_cursor_advance(&zc)) {
 			uint64_t vdzap = za.za_first_integer;
 			if (zap_lookup_int(spa->spa_meta_objset, new_avz,
 			    vdzap) == ENOENT) {
 				/*
 				 * ZAP is listed in old AVZ but not in new one;
 				 * destroy it
 				 */
 				VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
 				    tx));
 			}
 		}
 
 		zap_cursor_fini(&zc);
 
 		/* Destroy the old AVZ */
 		VERIFY0(zap_destroy(spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps, tx));
 
 		/* Replace the old AVZ in the dir obj with the new one */
 		VERIFY0(zap_update(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
 		    sizeof (new_avz), 1, &new_avz, tx));
 
 		spa->spa_all_vdev_zaps = new_avz;
 	} else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
 		zap_cursor_t zc;
 		zap_attribute_t za;
 
 		/* Walk through the AVZ and destroy all listed ZAPs */
 		for (zap_cursor_init(&zc, spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps);
 		    zap_cursor_retrieve(&zc, &za) == 0;
 		    zap_cursor_advance(&zc)) {
 			uint64_t zap = za.za_first_integer;
 			VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
 		}
 
 		zap_cursor_fini(&zc);
 
 		/* Destroy and unlink the AVZ itself */
 		VERIFY0(zap_destroy(spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps, tx));
 		VERIFY0(zap_remove(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
 		spa->spa_all_vdev_zaps = 0;
 	}
 
 	if (spa->spa_all_vdev_zaps == 0) {
 		spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
 		    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_VDEV_ZAP_MAP, tx);
 	}
 	spa->spa_avz_action = AVZ_ACTION_NONE;
 
 	/* Create ZAPs for vdevs that don't have them. */
 	vdev_construct_zaps(spa->spa_root_vdev, tx);
 
 	config = spa_config_generate(spa, spa->spa_root_vdev,
 	    dmu_tx_get_txg(tx), B_FALSE);
 
 	/*
 	 * If we're upgrading the spa version then make sure that
 	 * the config object gets updated with the correct version.
 	 */
 	if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
 		fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
 		    spa->spa_uberblock.ub_version);
 
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	nvlist_free(spa->spa_config_syncing);
 	spa->spa_config_syncing = config;
 
 	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
 }
 
 static void
 spa_sync_version(void *arg, dmu_tx_t *tx)
 {
 	uint64_t *versionp = arg;
 	uint64_t version = *versionp;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 
 	/*
 	 * Setting the version is special cased when first creating the pool.
 	 */
 	ASSERT(tx->tx_txg != TXG_INITIAL);
 
 	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
 	ASSERT(version >= spa_version(spa));
 
 	spa->spa_uberblock.ub_version = version;
 	vdev_config_dirty(spa->spa_root_vdev);
 	spa_history_log_internal(spa, "set", tx, "version=%lld",
 	    (longlong_t)version);
 }
 
 /*
  * Set zpool properties.
  */
 static void
 spa_sync_props(void *arg, dmu_tx_t *tx)
 {
 	nvlist_t *nvp = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	nvpair_t *elem = NULL;
 
 	mutex_enter(&spa->spa_props_lock);
 
 	while ((elem = nvlist_next_nvpair(nvp, elem))) {
 		uint64_t intval;
 		const char *strval, *fname;
 		zpool_prop_t prop;
 		const char *propname;
 		const char *elemname = nvpair_name(elem);
 		zprop_type_t proptype;
 		spa_feature_t fid;
 
 		switch (prop = zpool_name_to_prop(elemname)) {
 		case ZPOOL_PROP_VERSION:
 			intval = fnvpair_value_uint64(elem);
 			/*
 			 * The version is synced separately before other
 			 * properties and should be correct by now.
 			 */
 			ASSERT3U(spa_version(spa), >=, intval);
 			break;
 
 		case ZPOOL_PROP_ALTROOT:
 			/*
 			 * 'altroot' is a non-persistent property. It should
 			 * have been set temporarily at creation or import time.
 			 */
 			ASSERT(spa->spa_root != NULL);
 			break;
 
 		case ZPOOL_PROP_READONLY:
 		case ZPOOL_PROP_CACHEFILE:
 			/*
 			 * 'readonly' and 'cachefile' are also non-persistent
 			 * properties.
 			 */
 			break;
 		case ZPOOL_PROP_COMMENT:
 			strval = fnvpair_value_string(elem);
 			if (spa->spa_comment != NULL)
 				spa_strfree(spa->spa_comment);
 			spa->spa_comment = spa_strdup(strval);
 			/*
 			 * We need to dirty the configuration on all the vdevs
 			 * so that their labels get updated.  We also need to
 			 * update the cache file to keep it in sync with the
 			 * MOS version. It's unnecessary to do this for pool
 			 * creation since the vdev's configuration has already
 			 * been dirtied.
 			 */
 			if (tx->tx_txg != TXG_INITIAL) {
 				vdev_config_dirty(spa->spa_root_vdev);
 				spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 			}
 			spa_history_log_internal(spa, "set", tx,
 			    "%s=%s", elemname, strval);
 			break;
 		case ZPOOL_PROP_COMPATIBILITY:
 			strval = fnvpair_value_string(elem);
 			if (spa->spa_compatibility != NULL)
 				spa_strfree(spa->spa_compatibility);
 			spa->spa_compatibility = spa_strdup(strval);
 			/*
 			 * Dirty the configuration on vdevs as above.
 			 */
 			if (tx->tx_txg != TXG_INITIAL) {
 				vdev_config_dirty(spa->spa_root_vdev);
 				spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 			}
 
 			spa_history_log_internal(spa, "set", tx,
 			    "%s=%s", nvpair_name(elem), strval);
 			break;
 
 		case ZPOOL_PROP_INVAL:
 			if (zpool_prop_feature(elemname)) {
 				fname = strchr(elemname, '@') + 1;
 				VERIFY0(zfeature_lookup_name(fname, &fid));
 
 				spa_feature_enable(spa, fid, tx);
 				spa_history_log_internal(spa, "set", tx,
 				    "%s=enabled", elemname);
 				break;
 			} else if (!zfs_prop_user(elemname)) {
 				ASSERT(zpool_prop_feature(elemname));
 				break;
 			}
 			zfs_fallthrough;
 		default:
 			/*
 			 * Set pool property values in the poolprops mos object.
 			 */
 			if (spa->spa_pool_props_object == 0) {
 				spa->spa_pool_props_object =
 				    zap_create_link(mos, DMU_OT_POOL_PROPS,
 				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
 				    tx);
 			}
 
 			/* normalize the property name */
 			if (prop == ZPOOL_PROP_INVAL) {
 				propname = elemname;
 				proptype = PROP_TYPE_STRING;
 			} else {
 				propname = zpool_prop_to_name(prop);
 				proptype = zpool_prop_get_type(prop);
 			}
 
 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
 				ASSERT(proptype == PROP_TYPE_STRING);
 				strval = fnvpair_value_string(elem);
 				VERIFY0(zap_update(mos,
 				    spa->spa_pool_props_object, propname,
 				    1, strlen(strval) + 1, strval, tx));
 				spa_history_log_internal(spa, "set", tx,
 				    "%s=%s", elemname, strval);
 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
 				intval = fnvpair_value_uint64(elem);
 
 				if (proptype == PROP_TYPE_INDEX) {
 					const char *unused;
 					VERIFY0(zpool_prop_index_to_string(
 					    prop, intval, &unused));
 				}
 				VERIFY0(zap_update(mos,
 				    spa->spa_pool_props_object, propname,
 				    8, 1, &intval, tx));
 				spa_history_log_internal(spa, "set", tx,
 				    "%s=%lld", elemname,
 				    (longlong_t)intval);
 
 				switch (prop) {
 				case ZPOOL_PROP_DELEGATION:
 					spa->spa_delegation = intval;
 					break;
 				case ZPOOL_PROP_BOOTFS:
 					spa->spa_bootfs = intval;
 					break;
 				case ZPOOL_PROP_FAILUREMODE:
 					spa->spa_failmode = intval;
 					break;
 				case ZPOOL_PROP_AUTOTRIM:
 					spa->spa_autotrim = intval;
 					spa_async_request(spa,
 					    SPA_ASYNC_AUTOTRIM_RESTART);
 					break;
 				case ZPOOL_PROP_AUTOEXPAND:
 					spa->spa_autoexpand = intval;
 					if (tx->tx_txg != TXG_INITIAL)
 						spa_async_request(spa,
 						    SPA_ASYNC_AUTOEXPAND);
 					break;
 				case ZPOOL_PROP_MULTIHOST:
 					spa->spa_multihost = intval;
 					break;
 				default:
 					break;
 				}
 			} else {
 				ASSERT(0); /* not allowed */
 			}
 		}
 
 	}
 
 	mutex_exit(&spa->spa_props_lock);
 }
 
 /*
  * Perform one-time upgrade on-disk changes.  spa_version() does not
  * reflect the new version this txg, so there must be no changes this
  * txg to anything that the upgrade code depends on after it executes.
  * Therefore this must be called after dsl_pool_sync() does the sync
  * tasks.
  */
 static void
 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
 {
 	if (spa_sync_pass(spa) != 1)
 		return;
 
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
 		dsl_pool_create_origin(dp, tx);
 
 		/* Keeping the origin open increases spa_minref */
 		spa->spa_minref += 3;
 	}
 
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
 		dsl_pool_upgrade_clones(dp, tx);
 	}
 
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
 		dsl_pool_upgrade_dir_clones(dp, tx);
 
 		/* Keeping the freedir open increases spa_minref */
 		spa->spa_minref += 3;
 	}
 
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
 		spa_feature_create_zap_objects(spa, tx);
 	}
 
 	/*
 	 * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
 	 * when possibility to use lz4 compression for metadata was added
 	 * Old pools that have this feature enabled must be upgraded to have
 	 * this feature active
 	 */
 	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
 		boolean_t lz4_en = spa_feature_is_enabled(spa,
 		    SPA_FEATURE_LZ4_COMPRESS);
 		boolean_t lz4_ac = spa_feature_is_active(spa,
 		    SPA_FEATURE_LZ4_COMPRESS);
 
 		if (lz4_en && !lz4_ac)
 			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
 	}
 
 	/*
 	 * If we haven't written the salt, do so now.  Note that the
 	 * feature may not be activated yet, but that's fine since
 	 * the presence of this ZAP entry is backwards compatible.
 	 */
 	if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_CHECKSUM_SALT) == ENOENT) {
 		VERIFY0(zap_add(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
 		    sizeof (spa->spa_cksum_salt.zcs_bytes),
 		    spa->spa_cksum_salt.zcs_bytes, tx));
 	}
 
 	rrw_exit(&dp->dp_config_rwlock, FTAG);
 }
 
 static void
 vdev_indirect_state_sync_verify(vdev_t *vd)
 {
 	vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping;
 	vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births;
 
 	if (vd->vdev_ops == &vdev_indirect_ops) {
 		ASSERT(vim != NULL);
 		ASSERT(vib != NULL);
 	}
 
 	uint64_t obsolete_sm_object = 0;
 	ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
 	if (obsolete_sm_object != 0) {
 		ASSERT(vd->vdev_obsolete_sm != NULL);
 		ASSERT(vd->vdev_removing ||
 		    vd->vdev_ops == &vdev_indirect_ops);
 		ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
 		ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
 		ASSERT3U(obsolete_sm_object, ==,
 		    space_map_object(vd->vdev_obsolete_sm));
 		ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
 		    space_map_allocated(vd->vdev_obsolete_sm));
 	}
 	ASSERT(vd->vdev_obsolete_segments != NULL);
 
 	/*
 	 * Since frees / remaps to an indirect vdev can only
 	 * happen in syncing context, the obsolete segments
 	 * tree must be empty when we start syncing.
 	 */
 	ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
 }
 
 /*
  * Set the top-level vdev's max queue depth. Evaluate each top-level's
  * async write queue depth in case it changed. The max queue depth will
  * not change in the middle of syncing out this txg.
  */
 static void
 spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
 {
 	ASSERT(spa_writeable(spa));
 
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
 	    zfs_vdev_queue_depth_pct / 100;
 	metaslab_class_t *normal = spa_normal_class(spa);
 	metaslab_class_t *special = spa_special_class(spa);
 	metaslab_class_t *dedup = spa_dedup_class(spa);
 
 	uint64_t slots_per_allocator = 0;
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 
 		metaslab_group_t *mg = tvd->vdev_mg;
 		if (mg == NULL || !metaslab_group_initialized(mg))
 			continue;
 
 		metaslab_class_t *mc = mg->mg_class;
 		if (mc != normal && mc != special && mc != dedup)
 			continue;
 
 		/*
 		 * It is safe to do a lock-free check here because only async
 		 * allocations look at mg_max_alloc_queue_depth, and async
 		 * allocations all happen from spa_sync().
 		 */
 		for (int i = 0; i < mg->mg_allocators; i++) {
 			ASSERT0(zfs_refcount_count(
 			    &(mg->mg_allocator[i].mga_alloc_queue_depth)));
 		}
 		mg->mg_max_alloc_queue_depth = max_queue_depth;
 
 		for (int i = 0; i < mg->mg_allocators; i++) {
 			mg->mg_allocator[i].mga_cur_max_alloc_queue_depth =
 			    zfs_vdev_def_queue_depth;
 		}
 		slots_per_allocator += zfs_vdev_def_queue_depth;
 	}
 
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
 		ASSERT0(zfs_refcount_count(&normal->mc_allocator[i].
 		    mca_alloc_slots));
 		ASSERT0(zfs_refcount_count(&special->mc_allocator[i].
 		    mca_alloc_slots));
 		ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i].
 		    mca_alloc_slots));
 		normal->mc_allocator[i].mca_alloc_max_slots =
 		    slots_per_allocator;
 		special->mc_allocator[i].mca_alloc_max_slots =
 		    slots_per_allocator;
 		dedup->mc_allocator[i].mca_alloc_max_slots =
 		    slots_per_allocator;
 	}
 	normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
 	special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
 	dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
 }
 
 static void
 spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx)
 {
 	ASSERT(spa_writeable(spa));
 
 	vdev_t *rvd = spa->spa_root_vdev;
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
 		vdev_indirect_state_sync_verify(vd);
 
 		if (vdev_indirect_should_condense(vd)) {
 			spa_condense_indirect_start_sync(vd, tx);
 			break;
 		}
 	}
 }
 
 static void
 spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	uint64_t txg = tx->tx_txg;
 	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
 
 	do {
 		int pass = ++spa->spa_sync_pass;
 
 		spa_sync_config_object(spa, tx);
 		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
 		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
 		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
 		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
 		spa_errlog_sync(spa, txg);
 		dsl_pool_sync(dp, txg);
 
 		if (pass < zfs_sync_pass_deferred_free ||
 		    spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
 			/*
 			 * If the log space map feature is active we don't
 			 * care about deferred frees and the deferred bpobj
 			 * as the log space map should effectively have the
 			 * same results (i.e. appending only to one object).
 			 */
 			spa_sync_frees(spa, free_bpl, tx);
 		} else {
 			/*
 			 * We can not defer frees in pass 1, because
 			 * we sync the deferred frees later in pass 1.
 			 */
 			ASSERT3U(pass, >, 1);
 			bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb,
 			    &spa->spa_deferred_bpobj, tx);
 		}
 
 		brt_sync(spa, txg);
 		ddt_sync(spa, txg);
 		dsl_scan_sync(dp, tx);
 		dsl_errorscrub_sync(dp, tx);
 		svr_sync(spa, tx);
 		spa_sync_upgrades(spa, tx);
 
 		spa_flush_metaslabs(spa, tx);
 
 		vdev_t *vd = NULL;
 		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
 		    != NULL)
 			vdev_sync(vd, txg);
 
 		if (pass == 1) {
 			/*
 			 * dsl_pool_sync() -> dp_sync_tasks may have dirtied
 			 * the config. If that happens, this txg should not
 			 * be a no-op. So we must sync the config to the MOS
 			 * before checking for no-op.
 			 *
 			 * Note that when the config is dirty, it will
 			 * be written to the MOS (i.e. the MOS will be
 			 * dirtied) every time we call spa_sync_config_object()
 			 * in this txg.  Therefore we can't call this after
 			 * dsl_pool_sync() every pass, because it would
 			 * prevent us from converging, since we'd dirty
 			 * the MOS every pass.
 			 *
 			 * Sync tasks can only be processed in pass 1, so
 			 * there's no need to do this in later passes.
 			 */
 			spa_sync_config_object(spa, tx);
 		}
 
 		/*
 		 * Note: We need to check if the MOS is dirty because we could
 		 * have marked the MOS dirty without updating the uberblock
 		 * (e.g. if we have sync tasks but no dirty user data). We need
 		 * to check the uberblock's rootbp because it is updated if we
 		 * have synced out dirty data (though in this case the MOS will
 		 * most likely also be dirty due to second order effects, we
 		 * don't want to rely on that here).
 		 */
 		if (pass == 1 &&
-		    spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+		    BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg &&
 		    !dmu_objset_is_dirty(mos, txg)) {
 			/*
 			 * Nothing changed on the first pass, therefore this
 			 * TXG is a no-op. Avoid syncing deferred frees, so
 			 * that we can keep this TXG as a no-op.
 			 */
 			ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
 			ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
 			ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
 			ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg));
 			break;
 		}
 
 		spa_sync_deferred_frees(spa, tx);
 	} while (dmu_objset_is_dirty(mos, txg));
 }
 
 /*
  * Rewrite the vdev configuration (which includes the uberblock) to
  * commit the transaction group.
  *
  * If there are no dirty vdevs, we sync the uberblock to a few random
  * top-level vdevs that are known to be visible in the config cache
  * (see spa_vdev_add() for a complete description). If there *are* dirty
  * vdevs, sync the uberblock to all vdevs.
  */
 static void
 spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t txg = tx->tx_txg;
 
 	for (;;) {
 		int error = 0;
 
 		/*
 		 * We hold SCL_STATE to prevent vdev open/close/etc.
 		 * while we're attempting to write the vdev labels.
 		 */
 		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
 		if (list_is_empty(&spa->spa_config_dirty_list)) {
 			vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
 			int svdcount = 0;
 			int children = rvd->vdev_children;
 			int c0 = random_in_range(children);
 
 			for (int c = 0; c < children; c++) {
 				vdev_t *vd =
 				    rvd->vdev_child[(c0 + c) % children];
 
 				/* Stop when revisiting the first vdev */
 				if (c > 0 && svd[0] == vd)
 					break;
 
 				if (vd->vdev_ms_array == 0 ||
 				    vd->vdev_islog ||
 				    !vdev_is_concrete(vd))
 					continue;
 
 				svd[svdcount++] = vd;
 				if (svdcount == SPA_SYNC_MIN_VDEVS)
 					break;
 			}
 			error = vdev_config_sync(svd, svdcount, txg);
 		} else {
 			error = vdev_config_sync(rvd->vdev_child,
 			    rvd->vdev_children, txg);
 		}
 
 		if (error == 0)
 			spa->spa_last_synced_guid = rvd->vdev_guid;
 
 		spa_config_exit(spa, SCL_STATE, FTAG);
 
 		if (error == 0)
 			break;
 		zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
 		zio_resume_wait(spa);
 	}
 }
 
 /*
  * Sync the specified transaction group.  New blocks may be dirtied as
  * part of the process, so we iterate until it converges.
  */
 void
 spa_sync(spa_t *spa, uint64_t txg)
 {
 	vdev_t *vd = NULL;
 
 	VERIFY(spa_writeable(spa));
 
 	/*
 	 * Wait for i/os issued in open context that need to complete
 	 * before this txg syncs.
 	 */
 	(void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
 	spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
 	    ZIO_FLAG_CANFAIL);
 
 	/*
 	 * Now that there can be no more cloning in this transaction group,
 	 * but we are still before issuing frees, we can process pending BRT
 	 * updates.
 	 */
 	brt_pending_apply(spa, txg);
 
 	/*
 	 * Lock out configuration changes.
 	 */
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	spa->spa_syncing_txg = txg;
 	spa->spa_sync_pass = 0;
 
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
 		mutex_enter(&spa->spa_allocs[i].spaa_lock);
 		VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
 		mutex_exit(&spa->spa_allocs[i].spaa_lock);
 	}
 
 	/*
 	 * If there are any pending vdev state changes, convert them
 	 * into config changes that go out with this transaction group.
 	 */
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
 		/* Avoid holding the write lock unless actually necessary */
 		if (vd->vdev_aux == NULL) {
 			vdev_state_clean(vd);
 			vdev_config_dirty(vd);
 			continue;
 		}
 		/*
 		 * We need the write lock here because, for aux vdevs,
 		 * calling vdev_config_dirty() modifies sav_config.
 		 * This is ugly and will become unnecessary when we
 		 * eliminate the aux vdev wart by integrating all vdevs
 		 * into the root vdev tree.
 		 */
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
 		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
 			vdev_state_clean(vd);
 			vdev_config_dirty(vd);
 		}
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 	}
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
 
 	spa->spa_sync_starttime = gethrtime();
 	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
 	spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
 	    spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
 	    NSEC_TO_TICK(spa->spa_deadman_synctime));
 
 	/*
 	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
 	 * set spa_deflate if we have no raid-z vdevs.
 	 */
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
 		vdev_t *rvd = spa->spa_root_vdev;
 
 		int i;
 		for (i = 0; i < rvd->vdev_children; i++) {
 			vd = rvd->vdev_child[i];
 			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
 				break;
 		}
 		if (i == rvd->vdev_children) {
 			spa->spa_deflate = TRUE;
 			VERIFY0(zap_add(spa->spa_meta_objset,
 			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
 			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
 		}
 	}
 
 	spa_sync_adjust_vdev_max_queue_depth(spa);
 
 	spa_sync_condense_indirect(spa, tx);
 
 	spa_sync_iterate_to_convergence(spa, tx);
 
 #ifdef ZFS_DEBUG
 	if (!list_is_empty(&spa->spa_config_dirty_list)) {
 	/*
 	 * Make sure that the number of ZAPs for all the vdevs matches
 	 * the number of ZAPs in the per-vdev ZAP list. This only gets
 	 * called if the config is dirty; otherwise there may be
 	 * outstanding AVZ operations that weren't completed in
 	 * spa_sync_config_object.
 	 */
 		uint64_t all_vdev_zap_entry_count;
 		ASSERT0(zap_count(spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
 		ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
 		    all_vdev_zap_entry_count);
 	}
 #endif
 
 	if (spa->spa_vdev_removal != NULL) {
 		ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
 	}
 
 	spa_sync_rewrite_vdev_config(spa, tx);
 	dmu_tx_commit(tx);
 
 	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
 	spa->spa_deadman_tqid = 0;
 
 	/*
 	 * Clear the dirty config list.
 	 */
 	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
 		vdev_config_clean(vd);
 
 	/*
 	 * Now that the new config has synced transactionally,
 	 * let it become visible to the config cache.
 	 */
 	if (spa->spa_config_syncing != NULL) {
 		spa_config_set(spa, spa->spa_config_syncing);
 		spa->spa_config_txg = txg;
 		spa->spa_config_syncing = NULL;
 	}
 
 	dsl_pool_sync_done(dp, txg);
 
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
 		mutex_enter(&spa->spa_allocs[i].spaa_lock);
 		VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
 		mutex_exit(&spa->spa_allocs[i].spaa_lock);
 	}
 
 	/*
 	 * Update usable space statistics.
 	 */
 	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
 	    != NULL)
 		vdev_sync_done(vd, txg);
 
 	metaslab_class_evict_old(spa->spa_normal_class, txg);
 	metaslab_class_evict_old(spa->spa_log_class, txg);
 
 	spa_sync_close_syncing_log_sm(spa);
 
 	spa_update_dspace(spa);
 
 	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON)
 		vdev_autotrim_kick(spa);
 
 	/*
 	 * It had better be the case that we didn't dirty anything
 	 * since vdev_config_sync().
 	 */
 	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
 	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
 	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
 
 	while (zfs_pause_spa_sync)
 		delay(1);
 
 	spa->spa_sync_pass = 0;
 
 	/*
 	 * Update the last synced uberblock here. We want to do this at
 	 * the end of spa_sync() so that consumers of spa_last_synced_txg()
 	 * will be guaranteed that all the processing associated with
 	 * that txg has been completed.
 	 */
 	spa->spa_ubsync = spa->spa_uberblock;
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	spa_handle_ignored_writes(spa);
 
 	/*
 	 * If any async tasks have been requested, kick them off.
 	 */
 	spa_async_dispatch(spa);
 }
 
 /*
  * Sync all pools.  We don't want to hold the namespace lock across these
  * operations, so we take a reference on the spa_t and drop the lock during the
  * sync.
  */
 void
 spa_sync_allpools(void)
 {
 	spa_t *spa = NULL;
 	mutex_enter(&spa_namespace_lock);
 	while ((spa = spa_next(spa)) != NULL) {
 		if (spa_state(spa) != POOL_STATE_ACTIVE ||
 		    !spa_writeable(spa) || spa_suspended(spa))
 			continue;
 		spa_open_ref(spa, FTAG);
 		mutex_exit(&spa_namespace_lock);
 		txg_wait_synced(spa_get_dsl(spa), 0);
 		mutex_enter(&spa_namespace_lock);
 		spa_close(spa, FTAG);
 	}
 	mutex_exit(&spa_namespace_lock);
 }
 
 taskq_t *
 spa_sync_tq_create(spa_t *spa, const char *name)
 {
 	kthread_t **kthreads;
 
 	ASSERT(spa->spa_sync_tq == NULL);
 	ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus);
 
 	/*
 	 * - do not allow more allocators than cpus.
 	 * - there may be more cpus than allocators.
 	 * - do not allow more sync taskq threads than allocators or cpus.
 	 */
 	int nthreads = spa->spa_alloc_count;
 	spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) *
 	    nthreads, KM_SLEEP);
 
 	spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri,
 	    nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads);
 	VERIFY(spa->spa_sync_tq != NULL);
 	VERIFY(kthreads != NULL);
 
 	spa_taskqs_t *tqs =
 	    &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE];
 
 	spa_syncthread_info_t *ti = spa->spa_syncthreads;
 	for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) {
 		ti->sti_thread = kthreads[i];
 		if (w == tqs->stqs_count) {
 			w = 0;
 		}
 		ti->sti_wr_iss_tq = tqs->stqs_taskq[w];
 	}
 
 	kmem_free(kthreads, sizeof (*kthreads) * nthreads);
 	return (spa->spa_sync_tq);
 }
 
 void
 spa_sync_tq_destroy(spa_t *spa)
 {
 	ASSERT(spa->spa_sync_tq != NULL);
 
 	taskq_wait(spa->spa_sync_tq);
 	taskq_destroy(spa->spa_sync_tq);
 	kmem_free(spa->spa_syncthreads,
 	    sizeof (spa_syncthread_info_t) * spa->spa_alloc_count);
 	spa->spa_sync_tq = NULL;
 }
 
 void
 spa_select_allocator(zio_t *zio)
 {
 	zbookmark_phys_t *bm = &zio->io_bookmark;
 	spa_t *spa = zio->io_spa;
 
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 
 	/*
 	 * A gang block (for example) may have inherited its parent's
 	 * allocator, in which case there is nothing further to do here.
 	 */
 	if (ZIO_HAS_ALLOCATOR(zio))
 		return;
 
 	ASSERT(spa != NULL);
 	ASSERT(bm != NULL);
 
 	/*
 	 * First try to use an allocator assigned to the syncthread, and set
 	 * the corresponding write issue taskq for the allocator.
 	 * Note, we must have an open pool to do this.
 	 */
 	if (spa->spa_sync_tq != NULL) {
 		spa_syncthread_info_t *ti = spa->spa_syncthreads;
 		for (int i = 0; i < spa->spa_alloc_count; i++, ti++) {
 			if (ti->sti_thread == curthread) {
 				zio->io_allocator = i;
 				zio->io_wr_iss_tq = ti->sti_wr_iss_tq;
 				return;
 			}
 		}
 	}
 
 	/*
 	 * We want to try to use as many allocators as possible to help improve
 	 * performance, but we also want logically adjacent IOs to be physically
 	 * adjacent to improve sequential read performance. We chunk each object
 	 * into 2^20 block regions, and then hash based on the objset, object,
 	 * level, and region to accomplish both of these goals.
 	 */
 	uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level,
 	    bm->zb_blkid >> 20);
 
 	zio->io_allocator = (uint_t)hv % spa->spa_alloc_count;
 	zio->io_wr_iss_tq = NULL;
 }
 
 /*
  * ==========================================================================
  * Miscellaneous routines
  * ==========================================================================
  */
 
 /*
  * Remove all pools in the system.
  */
 void
 spa_evict_all(void)
 {
 	spa_t *spa;
 
 	/*
 	 * Remove all cached state.  All pools should be closed now,
 	 * so every spa in the AVL tree should be unreferenced.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	while ((spa = spa_next(NULL)) != NULL) {
 		/*
 		 * Stop async tasks.  The async thread may need to detach
 		 * a device that's been replaced, which requires grabbing
 		 * spa_namespace_lock, so we must drop it here.
 		 */
 		spa_open_ref(spa, FTAG);
 		mutex_exit(&spa_namespace_lock);
 		spa_async_suspend(spa);
 		mutex_enter(&spa_namespace_lock);
 		spa_close(spa, FTAG);
 
 		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
 			spa_unload(spa);
 			spa_deactivate(spa);
 		}
 		spa_remove(spa);
 	}
 	mutex_exit(&spa_namespace_lock);
 }
 
 vdev_t *
 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
 {
 	vdev_t *vd;
 	int i;
 
 	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
 		return (vd);
 
 	if (aux) {
 		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
 			vd = spa->spa_l2cache.sav_vdevs[i];
 			if (vd->vdev_guid == guid)
 				return (vd);
 		}
 
 		for (i = 0; i < spa->spa_spares.sav_count; i++) {
 			vd = spa->spa_spares.sav_vdevs[i];
 			if (vd->vdev_guid == guid)
 				return (vd);
 		}
 	}
 
 	return (NULL);
 }
 
 void
 spa_upgrade(spa_t *spa, uint64_t version)
 {
 	ASSERT(spa_writeable(spa));
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	/*
 	 * This should only be called for a non-faulted pool, and since a
 	 * future version would result in an unopenable pool, this shouldn't be
 	 * possible.
 	 */
 	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
 	ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
 
 	spa->spa_uberblock.ub_version = version;
 	vdev_config_dirty(spa->spa_root_vdev);
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	txg_wait_synced(spa_get_dsl(spa), 0);
 }
 
 static boolean_t
 spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav)
 {
 	(void) spa;
 	int i;
 	uint64_t vdev_guid;
 
 	for (i = 0; i < sav->sav_count; i++)
 		if (sav->sav_vdevs[i]->vdev_guid == guid)
 			return (B_TRUE);
 
 	for (i = 0; i < sav->sav_npending; i++) {
 		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
 		    &vdev_guid) == 0 && vdev_guid == guid)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 boolean_t
 spa_has_l2cache(spa_t *spa, uint64_t guid)
 {
 	return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache));
 }
 
 boolean_t
 spa_has_spare(spa_t *spa, uint64_t guid)
 {
 	return (spa_has_aux_vdev(spa, guid, &spa->spa_spares));
 }
 
 /*
  * Check if a pool has an active shared spare device.
  * Note: reference count of an active spare is 2, as a spare and as a replace
  */
 static boolean_t
 spa_has_active_shared_spare(spa_t *spa)
 {
 	int i, refcnt;
 	uint64_t pool;
 	spa_aux_vdev_t *sav = &spa->spa_spares;
 
 	for (i = 0; i < sav->sav_count; i++) {
 		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
 		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
 		    refcnt > 2)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 uint64_t
 spa_total_metaslabs(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	uint64_t m = 0;
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
 		if (!vdev_is_concrete(vd))
 			continue;
 		m += vd->vdev_ms_count;
 	}
 	return (m);
 }
 
 /*
  * Notify any waiting threads that some activity has switched from being in-
  * progress to not-in-progress so that the thread can wake up and determine
  * whether it is finished waiting.
  */
 void
 spa_notify_waiters(spa_t *spa)
 {
 	/*
 	 * Acquiring spa_activities_lock here prevents the cv_broadcast from
 	 * happening between the waiting thread's check and cv_wait.
 	 */
 	mutex_enter(&spa->spa_activities_lock);
 	cv_broadcast(&spa->spa_activities_cv);
 	mutex_exit(&spa->spa_activities_lock);
 }
 
 /*
  * Notify any waiting threads that the pool is exporting, and then block until
  * they are finished using the spa_t.
  */
 void
 spa_wake_waiters(spa_t *spa)
 {
 	mutex_enter(&spa->spa_activities_lock);
 	spa->spa_waiters_cancel = B_TRUE;
 	cv_broadcast(&spa->spa_activities_cv);
 	while (spa->spa_waiters != 0)
 		cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock);
 	spa->spa_waiters_cancel = B_FALSE;
 	mutex_exit(&spa->spa_activities_lock);
 }
 
 /* Whether the vdev or any of its descendants are being initialized/trimmed. */
 static boolean_t
 spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER));
 	ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
 	ASSERT(activity == ZPOOL_WAIT_INITIALIZE ||
 	    activity == ZPOOL_WAIT_TRIM);
 
 	kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ?
 	    &vd->vdev_initialize_lock : &vd->vdev_trim_lock;
 
 	mutex_exit(&spa->spa_activities_lock);
 	mutex_enter(lock);
 	mutex_enter(&spa->spa_activities_lock);
 
 	boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ?
 	    (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) :
 	    (vd->vdev_trim_state == VDEV_TRIM_ACTIVE);
 	mutex_exit(lock);
 
 	if (in_progress)
 		return (B_TRUE);
 
 	for (int i = 0; i < vd->vdev_children; i++) {
 		if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i],
 		    activity))
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * If use_guid is true, this checks whether the vdev specified by guid is
  * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool
  * is being initialized/trimmed. The caller must hold the config lock and
  * spa_activities_lock.
  */
 static int
 spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid,
     zpool_wait_activity_t activity, boolean_t *in_progress)
 {
 	mutex_exit(&spa->spa_activities_lock);
 	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 	mutex_enter(&spa->spa_activities_lock);
 
 	vdev_t *vd;
 	if (use_guid) {
 		vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 		if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) {
 			spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 			return (EINVAL);
 		}
 	} else {
 		vd = spa->spa_root_vdev;
 	}
 
 	*in_progress = spa_vdev_activity_in_progress_impl(vd, activity);
 
 	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 	return (0);
 }
 
 /*
  * Locking for waiting threads
  * ---------------------------
  *
  * Waiting threads need a way to check whether a given activity is in progress,
  * and then, if it is, wait for it to complete. Each activity will have some
  * in-memory representation of the relevant on-disk state which can be used to
  * determine whether or not the activity is in progress. The in-memory state and
  * the locking used to protect it will be different for each activity, and may
  * not be suitable for use with a cvar (e.g., some state is protected by the
  * config lock). To allow waiting threads to wait without any races, another
  * lock, spa_activities_lock, is used.
  *
  * When the state is checked, both the activity-specific lock (if there is one)
  * and spa_activities_lock are held. In some cases, the activity-specific lock
  * is acquired explicitly (e.g. the config lock). In others, the locking is
  * internal to some check (e.g. bpobj_is_empty). After checking, the waiting
  * thread releases the activity-specific lock and, if the activity is in
  * progress, then cv_waits using spa_activities_lock.
  *
  * The waiting thread is woken when another thread, one completing some
  * activity, updates the state of the activity and then calls
  * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only
  * needs to hold its activity-specific lock when updating the state, and this
  * lock can (but doesn't have to) be dropped before calling spa_notify_waiters.
  *
  * Because spa_notify_waiters acquires spa_activities_lock before broadcasting,
  * and because it is held when the waiting thread checks the state of the
  * activity, it can never be the case that the completing thread both updates
  * the activity state and cv_broadcasts in between the waiting thread's check
  * and cv_wait. Thus, a waiting thread can never miss a wakeup.
  *
  * In order to prevent deadlock, when the waiting thread does its check, in some
  * cases it will temporarily drop spa_activities_lock in order to acquire the
  * activity-specific lock. The order in which spa_activities_lock and the
  * activity specific lock are acquired in the waiting thread is determined by
  * the order in which they are acquired in the completing thread; if the
  * completing thread calls spa_notify_waiters with the activity-specific lock
  * held, then the waiting thread must also acquire the activity-specific lock
  * first.
  */
 
 static int
 spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
     boolean_t use_tag, uint64_t tag, boolean_t *in_progress)
 {
 	int error = 0;
 
 	ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
 
 	switch (activity) {
 	case ZPOOL_WAIT_CKPT_DISCARD:
 		*in_progress =
 		    (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) &&
 		    zap_contains(spa_meta_objset(spa),
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) ==
 		    ENOENT);
 		break;
 	case ZPOOL_WAIT_FREE:
 		*in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS &&
 		    !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) ||
 		    spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) ||
 		    spa_livelist_delete_check(spa));
 		break;
 	case ZPOOL_WAIT_INITIALIZE:
 	case ZPOOL_WAIT_TRIM:
 		error = spa_vdev_activity_in_progress(spa, use_tag, tag,
 		    activity, in_progress);
 		break;
 	case ZPOOL_WAIT_REPLACE:
 		mutex_exit(&spa->spa_activities_lock);
 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 		mutex_enter(&spa->spa_activities_lock);
 
 		*in_progress = vdev_replace_in_progress(spa->spa_root_vdev);
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		break;
 	case ZPOOL_WAIT_REMOVE:
 		*in_progress = (spa->spa_removing_phys.sr_state ==
 		    DSS_SCANNING);
 		break;
 	case ZPOOL_WAIT_RESILVER:
 		*in_progress = vdev_rebuild_active(spa->spa_root_vdev);
 		if (*in_progress)
 			break;
 		zfs_fallthrough;
 	case ZPOOL_WAIT_SCRUB:
 	{
 		boolean_t scanning, paused, is_scrub;
 		dsl_scan_t *scn =  spa->spa_dsl_pool->dp_scan;
 
 		is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB);
 		scanning = (scn->scn_phys.scn_state == DSS_SCANNING);
 		paused = dsl_scan_is_paused_scrub(scn);
 		*in_progress = (scanning && !paused &&
 		    is_scrub == (activity == ZPOOL_WAIT_SCRUB));
 		break;
 	}
 	case ZPOOL_WAIT_RAIDZ_EXPAND:
 	{
 		vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 		*in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING);
 		break;
 	}
 	default:
 		panic("unrecognized value for activity %d", activity);
 	}
 
 	return (error);
 }
 
 static int
 spa_wait_common(const char *pool, zpool_wait_activity_t activity,
     boolean_t use_tag, uint64_t tag, boolean_t *waited)
 {
 	/*
 	 * The tag is used to distinguish between instances of an activity.
 	 * 'initialize' and 'trim' are the only activities that we use this for.
 	 * The other activities can only have a single instance in progress in a
 	 * pool at one time, making the tag unnecessary.
 	 *
 	 * There can be multiple devices being replaced at once, but since they
 	 * all finish once resilvering finishes, we don't bother keeping track
 	 * of them individually, we just wait for them all to finish.
 	 */
 	if (use_tag && activity != ZPOOL_WAIT_INITIALIZE &&
 	    activity != ZPOOL_WAIT_TRIM)
 		return (EINVAL);
 
 	if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES)
 		return (EINVAL);
 
 	spa_t *spa;
 	int error = spa_open(pool, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Increment the spa's waiter count so that we can call spa_close and
 	 * still ensure that the spa_t doesn't get freed before this thread is
 	 * finished with it when the pool is exported. We want to call spa_close
 	 * before we start waiting because otherwise the additional ref would
 	 * prevent the pool from being exported or destroyed throughout the
 	 * potentially long wait.
 	 */
 	mutex_enter(&spa->spa_activities_lock);
 	spa->spa_waiters++;
 	spa_close(spa, FTAG);
 
 	*waited = B_FALSE;
 	for (;;) {
 		boolean_t in_progress;
 		error = spa_activity_in_progress(spa, activity, use_tag, tag,
 		    &in_progress);
 
 		if (error || !in_progress || spa->spa_waiters_cancel)
 			break;
 
 		*waited = B_TRUE;
 
 		if (cv_wait_sig(&spa->spa_activities_cv,
 		    &spa->spa_activities_lock) == 0) {
 			error = EINTR;
 			break;
 		}
 	}
 
 	spa->spa_waiters--;
 	cv_signal(&spa->spa_waiters_cv);
 	mutex_exit(&spa->spa_activities_lock);
 
 	return (error);
 }
 
 /*
  * Wait for a particular instance of the specified activity to complete, where
  * the instance is identified by 'tag'
  */
 int
 spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag,
     boolean_t *waited)
 {
 	return (spa_wait_common(pool, activity, B_TRUE, tag, waited));
 }
 
 /*
  * Wait for all instances of the specified activity complete
  */
 int
 spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited)
 {
 
 	return (spa_wait_common(pool, activity, B_FALSE, 0, waited));
 }
 
 sysevent_t *
 spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
 {
 	sysevent_t *ev = NULL;
 #ifdef _KERNEL
 	nvlist_t *resource;
 
 	resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl);
 	if (resource) {
 		ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP);
 		ev->resource = resource;
 	}
 #else
 	(void) spa, (void) vd, (void) hist_nvl, (void) name;
 #endif
 	return (ev);
 }
 
 void
 spa_event_post(sysevent_t *ev)
 {
 #ifdef _KERNEL
 	if (ev) {
 		zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb);
 		kmem_free(ev, sizeof (*ev));
 	}
 #else
 	(void) ev;
 #endif
 }
 
 /*
  * Post a zevent corresponding to the given sysevent.   The 'name' must be one
  * of the event definitions in sys/sysevent/eventdefs.h.  The payload will be
  * filled in from the spa and (optionally) the vdev.  This doesn't do anything
  * in the userland libzpool, as we don't want consumers to misinterpret ztest
  * or zdb as real changes.
  */
 void
 spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
 {
 	spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
 }
 
 /* state manipulation functions */
 EXPORT_SYMBOL(spa_open);
 EXPORT_SYMBOL(spa_open_rewind);
 EXPORT_SYMBOL(spa_get_stats);
 EXPORT_SYMBOL(spa_create);
 EXPORT_SYMBOL(spa_import);
 EXPORT_SYMBOL(spa_tryimport);
 EXPORT_SYMBOL(spa_destroy);
 EXPORT_SYMBOL(spa_export);
 EXPORT_SYMBOL(spa_reset);
 EXPORT_SYMBOL(spa_async_request);
 EXPORT_SYMBOL(spa_async_suspend);
 EXPORT_SYMBOL(spa_async_resume);
 EXPORT_SYMBOL(spa_inject_addref);
 EXPORT_SYMBOL(spa_inject_delref);
 EXPORT_SYMBOL(spa_scan_stat_init);
 EXPORT_SYMBOL(spa_scan_get_stats);
 
 /* device manipulation */
 EXPORT_SYMBOL(spa_vdev_add);
 EXPORT_SYMBOL(spa_vdev_attach);
 EXPORT_SYMBOL(spa_vdev_detach);
 EXPORT_SYMBOL(spa_vdev_setpath);
 EXPORT_SYMBOL(spa_vdev_setfru);
 EXPORT_SYMBOL(spa_vdev_split_mirror);
 
 /* spare statech is global across all pools) */
 EXPORT_SYMBOL(spa_spare_add);
 EXPORT_SYMBOL(spa_spare_remove);
 EXPORT_SYMBOL(spa_spare_exists);
 EXPORT_SYMBOL(spa_spare_activate);
 
 /* L2ARC statech is global across all pools) */
 EXPORT_SYMBOL(spa_l2cache_add);
 EXPORT_SYMBOL(spa_l2cache_remove);
 EXPORT_SYMBOL(spa_l2cache_exists);
 EXPORT_SYMBOL(spa_l2cache_activate);
 EXPORT_SYMBOL(spa_l2cache_drop);
 
 /* scanning */
 EXPORT_SYMBOL(spa_scan);
 EXPORT_SYMBOL(spa_scan_stop);
 
 /* spa syncing */
 EXPORT_SYMBOL(spa_sync); /* only for DMU use */
 EXPORT_SYMBOL(spa_sync_allpools);
 
 /* properties */
 EXPORT_SYMBOL(spa_prop_set);
 EXPORT_SYMBOL(spa_prop_get);
 EXPORT_SYMBOL(spa_prop_clear_bootfs);
 
 /* asynchronous event notification */
 EXPORT_SYMBOL(spa_event_notify);
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW,
 	"Percentage of CPUs to run a metaslab preload taskq");
 
 /* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW,
 	"log2 fraction of arc that can be used by inflight I/Os when "
 	"verifying pool during import");
 /* END CSTYLED */
 
 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW,
 	"Set to traverse metadata on pool import");
 
 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
 	"Set to traverse data on pool import");
 
 ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
 	"Print vdev tree to zfs_dbgmsg during pool import");
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
 	"Percentage of CPUs to run an IO worker thread");
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD,
 	"Number of threads per IO worker taskqueue");
 
 /* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW,
 	"Allow importing pool with up to this number of missing top-level "
 	"vdevs (in read-only mode)");
 /* END CSTYLED */
 
 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT,
 	ZMOD_RW, "Set the livelist condense zthr to pause");
 
 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT,
 	ZMOD_RW, "Set the livelist condense synctask to pause");
 
 /* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel,
 	INT, ZMOD_RW,
 	"Whether livelist condensing was canceled in the synctask");
 
 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel,
 	INT, ZMOD_RW,
 	"Whether livelist condensing was canceled in the zthr function");
 
 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
 	ZMOD_RW,
 	"Whether extra ALLOC blkptrs were added to a livelist entry while it "
 	"was being condensed");
 
 #ifdef _KERNEL
 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
 	spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD,
 	"Configure IO queues for read IO");
 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
 	spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD,
 	"Configure IO queues for write IO");
 #endif
 /* END CSTYLED */
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW,
 	"Number of CPUs to run write issue taskqs");
diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c
index 244b4d264212..62d7b4fa2df2 100644
--- a/module/zfs/spa_errlog.c
+++ b/module/zfs/spa_errlog.c
@@ -1,1500 +1,1490 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013, 2014, Delphix. All rights reserved.
  * Copyright (c) 2019 Datto Inc.
  * Copyright (c) 2021, 2022, George Amanakis. All rights reserved.
  */
 
 /*
  * Routines to manage the on-disk persistent error log.
  *
  * Each pool stores a log of all logical data errors seen during normal
  * operation.  This is actually the union of two distinct logs: the last log,
  * and the current log.  All errors seen are logged to the current log.  When a
  * scrub completes, the current log becomes the last log, the last log is thrown
  * out, and the current log is reinitialized.  This way, if an error is somehow
  * corrected, a new scrub will show that it no longer exists, and will be
  * deleted from the log when the scrub completes.
  *
  * The log is stored using a ZAP object whose key is a string form of the
  * zbookmark_phys tuple (objset, object, level, blkid), and whose contents is an
  * optional 'objset:object' human-readable string describing the data.  When an
  * error is first logged, this string will be empty, indicating that no name is
  * known.  This prevents us from having to issue a potentially large amount of
  * I/O to discover the object name during an error path.  Instead, we do the
  * calculation when the data is requested, storing the result so future queries
  * will be faster.
  *
  * If the head_errlog feature is enabled, a different on-disk format is used.
  * The error log of each head dataset is stored separately in the zap object
  * and keyed by the head id. This enables listing every dataset affected in
  * userland. In order to be able to track whether an error block has been
  * modified or added to snapshots since it was marked as an error, a new tuple
  * is introduced: zbookmark_err_phys_t. It allows the storage of the birth
  * transaction group of an error block on-disk. The birth transaction group is
  * used by check_filesystem() to assess whether this block was freed,
  * re-written or added to a snapshot since its marking as an error.
  *
  * This log is then shipped into an nvlist where the key is the dataset name and
  * the value is the object name.  Userland is then responsible for uniquifying
  * this list and displaying it to the user.
  */
 
 #include <sys/dmu_tx.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/zap.h>
 #include <sys/zio.h>
 #include <sys/dsl_dir.h>
 #include <sys/dmu_objset.h>
 #include <sys/dbuf.h>
 #include <sys/zfs_znode.h>
 
 #define	NAME_MAX_LEN 64
 
 typedef struct clones {
 	uint64_t clone_ds;
 	list_node_t node;
 } clones_t;
 
 /*
  * spa_upgrade_errlog_limit : A zfs module parameter that controls the number
  *		of on-disk error log entries that will be converted to the new
  *		format when enabling head_errlog. Defaults to 0 which converts
  *		all log entries.
  */
 static uint_t spa_upgrade_errlog_limit = 0;
 
 /*
  * Convert a bookmark to a string.
  */
 static void
 bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len)
 {
 	(void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
 	    (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
 	    (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid);
 }
 
 /*
  * Convert an err_phys to a string.
  */
 static void
 errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len)
 {
 	(void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
 	    (u_longlong_t)zep->zb_object, (u_longlong_t)zep->zb_level,
 	    (u_longlong_t)zep->zb_blkid, (u_longlong_t)zep->zb_birth);
 }
 
 /*
  * Convert a string to a err_phys.
  */
 void
 name_to_errphys(char *buf, zbookmark_err_phys_t *zep)
 {
 	zep->zb_object = zfs_strtonum(buf, &buf);
 	ASSERT(*buf == ':');
 	zep->zb_level = (int)zfs_strtonum(buf + 1, &buf);
 	ASSERT(*buf == ':');
 	zep->zb_blkid = zfs_strtonum(buf + 1, &buf);
 	ASSERT(*buf == ':');
 	zep->zb_birth = zfs_strtonum(buf + 1, &buf);
 	ASSERT(*buf == '\0');
 }
 
 /*
  * Convert a string to a bookmark.
  */
 static void
 name_to_bookmark(char *buf, zbookmark_phys_t *zb)
 {
 	zb->zb_objset = zfs_strtonum(buf, &buf);
 	ASSERT(*buf == ':');
 	zb->zb_object = zfs_strtonum(buf + 1, &buf);
 	ASSERT(*buf == ':');
 	zb->zb_level = (int)zfs_strtonum(buf + 1, &buf);
 	ASSERT(*buf == ':');
 	zb->zb_blkid = zfs_strtonum(buf + 1, &buf);
 	ASSERT(*buf == '\0');
 }
 
 void
 zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb)
 {
 	zb->zb_objset = dataset;
 	zb->zb_object = zep->zb_object;
 	zb->zb_level = zep->zb_level;
 	zb->zb_blkid = zep->zb_blkid;
 }
 
 static void
 name_to_object(char *buf, uint64_t *obj)
 {
 	*obj = zfs_strtonum(buf, &buf);
 	ASSERT(*buf == '\0');
 }
 
 /*
  * Retrieve the head filesystem.
  */
 static int get_head_ds(spa_t *spa, uint64_t dsobj, uint64_t *head_ds)
 {
 	dsl_dataset_t *ds;
 	int error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool,
 	    dsobj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
 
 	if (error != 0)
 		return (error);
 
 	ASSERT(head_ds);
 	*head_ds = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
 	dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 
 	return (error);
 }
 
 /*
  * Log an uncorrectable error to the persistent error log.  We add it to the
  * spa's list of pending errors.  The changes are actually synced out to disk
  * during spa_errlog_sync().
  */
 void
-spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth)
+spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t birth)
 {
 	spa_error_entry_t search;
 	spa_error_entry_t *new;
 	avl_tree_t *tree;
 	avl_index_t where;
 
 	/*
 	 * If we are trying to import a pool, ignore any errors, as we won't be
 	 * writing to the pool any time soon.
 	 */
 	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
 		return;
 
 	mutex_enter(&spa->spa_errlist_lock);
 
 	/*
 	 * If we have had a request to rotate the log, log it to the next list
 	 * instead of the current one.
 	 */
 	if (spa->spa_scrub_active || spa->spa_scrub_finished)
 		tree = &spa->spa_errlist_scrub;
 	else
 		tree = &spa->spa_errlist_last;
 
 	search.se_bookmark = *zb;
 	if (avl_find(tree, &search, &where) != NULL) {
 		mutex_exit(&spa->spa_errlist_lock);
 		return;
 	}
 
 	new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
 	new->se_bookmark = *zb;
 
 	/*
 	 * If the head_errlog feature is enabled, store the birth txg now. In
 	 * case the file is deleted before spa_errlog_sync() runs, we will not
 	 * be able to retrieve the birth txg.
 	 */
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 		new->se_zep.zb_object = zb->zb_object;
 		new->se_zep.zb_level = zb->zb_level;
 		new->se_zep.zb_blkid = zb->zb_blkid;
-
-		/*
-		 * birth may end up being NULL, e.g. in zio_done(). We
-		 * will handle this in process_error_block().
-		 */
-		if (birth != NULL)
-			new->se_zep.zb_birth = *birth;
+		new->se_zep.zb_birth = birth;
 	}
 
 	avl_insert(tree, new, where);
 	mutex_exit(&spa->spa_errlist_lock);
 }
 
 int
 find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep,
     uint64_t *birth_txg)
 {
 	objset_t *os;
 	int error = dmu_objset_from_ds(ds, &os);
 	if (error != 0)
 		return (error);
 
 	dnode_t *dn;
 	blkptr_t bp;
 
 	error = dnode_hold(os, zep->zb_object, FTAG, &dn);
 	if (error != 0)
 		return (error);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL,
 	    NULL);
 	if (error == 0 && BP_IS_HOLE(&bp))
 		error = SET_ERROR(ENOENT);
 
-	*birth_txg = bp.blk_birth;
+	*birth_txg = BP_GET_LOGICAL_BIRTH(&bp);
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 	return (error);
 }
 
 /*
  * This function finds the oldest affected filesystem containing an error
  * block.
  */
 int
 find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
     uint64_t *top_affected_fs)
 {
 	uint64_t oldest_dsobj;
 	int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth,
 	    &oldest_dsobj);
 	if (error != 0)
 		return (error);
 
 	dsl_dataset_t *ds;
 	error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, oldest_dsobj,
 	    DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	*top_affected_fs =
 	    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
 	dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 	return (0);
 }
 
 
 #ifdef _KERNEL
 /*
  * Copy the bookmark to the end of the user-space buffer which starts at
  * uaddr and has *count unused entries, and decrement *count by 1.
  */
 static int
 copyout_entry(const zbookmark_phys_t *zb, void *uaddr, uint64_t *count)
 {
 	if (*count == 0)
 		return (SET_ERROR(ENOMEM));
 
 	*count -= 1;
 	if (copyout(zb, (char *)uaddr + (*count) * sizeof (zbookmark_phys_t),
 	    sizeof (zbookmark_phys_t)) != 0)
 		return (SET_ERROR(EFAULT));
 	return (0);
 }
 
 /*
  * Each time the error block is referenced by a snapshot or clone, add a
  * zbookmark_phys_t entry to the userspace array at uaddr. The array is
  * filled from the back and the in-out parameter *count is modified to be the
  * number of unused entries at the beginning of the array. The function
  * scrub_filesystem() is modelled after this one.
  */
 static int
 check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
     void *uaddr, uint64_t *count, list_t *clones_list)
 {
 	dsl_dataset_t *ds;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 
 	int error = dsl_dataset_hold_obj_flags(dp, head_ds,
 	    DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	uint64_t latest_txg;
 	uint64_t txg_to_consider = spa->spa_syncing_txg;
 	boolean_t check_snapshot = B_TRUE;
 	error = find_birth_txg(ds, zep, &latest_txg);
 
 	/*
 	 * If find_birth_txg() errors out otherwise, let txg_to_consider be
 	 * equal to the spa's syncing txg: if check_filesystem() errors out
 	 * then affected snapshots or clones will not be checked.
 	 */
 	if (error == 0 && zep->zb_birth == latest_txg) {
 		/* Block neither free nor rewritten. */
 		zbookmark_phys_t zb;
 		zep_to_zb(head_ds, zep, &zb);
 		error = copyout_entry(&zb, uaddr, count);
 		if (error != 0) {
 			dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 			return (error);
 		}
 		check_snapshot = B_FALSE;
 	} else if (error == 0) {
 		txg_to_consider = latest_txg;
 	}
 
 	/*
 	 * Retrieve the number of snapshots if the dataset is not a snapshot.
 	 */
 	uint64_t snap_count = 0;
 	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
 
 		error = zap_count(spa->spa_meta_objset,
 		    dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
 
 		if (error != 0) {
 			dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 			return (error);
 		}
 	}
 
 	if (snap_count == 0) {
 		/* Filesystem without snapshots. */
 		dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 		return (0);
 	}
 
 	uint64_t *snap_obj_array = kmem_zalloc(snap_count * sizeof (uint64_t),
 	    KM_SLEEP);
 
 	int aff_snap_count = 0;
 	uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 	uint64_t zap_clone = dsl_dir_phys(ds->ds_dir)->dd_clones;
 
 	dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 
 	/* Check only snapshots created from this file system. */
 	while (snap_obj != 0 && zep->zb_birth < snap_obj_txg &&
 	    snap_obj_txg <= txg_to_consider) {
 
 		error = dsl_dataset_hold_obj_flags(dp, snap_obj,
 		    DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
 		if (error != 0)
 			goto out;
 
 		if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds) {
 			snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 			snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 			dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 			continue;
 		}
 
 		boolean_t affected = B_TRUE;
 		if (check_snapshot) {
 			uint64_t blk_txg;
 			error = find_birth_txg(ds, zep, &blk_txg);
 			affected = (error == 0 && zep->zb_birth == blk_txg);
 		}
 
 		/* Report errors in snapshots. */
 		if (affected) {
 			snap_obj_array[aff_snap_count] = snap_obj;
 			aff_snap_count++;
 
 			zbookmark_phys_t zb;
 			zep_to_zb(snap_obj, zep, &zb);
 			error = copyout_entry(&zb, uaddr, count);
 			if (error != 0) {
 				dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT,
 				    FTAG);
 				goto out;
 			}
 		}
 		snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 		snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 		dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 	}
 
 	if (zap_clone == 0 || aff_snap_count == 0) {
 		error = 0;
 		goto out;
 	}
 
 	/* Check clones. */
 	zap_cursor_t *zc;
 	zap_attribute_t *za;
 
 	zc = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP);
 	za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP);
 
 	for (zap_cursor_init(zc, spa->spa_meta_objset, zap_clone);
 	    zap_cursor_retrieve(zc, za) == 0;
 	    zap_cursor_advance(zc)) {
 
 		dsl_dataset_t *clone;
 		error = dsl_dataset_hold_obj_flags(dp, za->za_first_integer,
 		    DS_HOLD_FLAG_DECRYPT, FTAG, &clone);
 
 		if (error != 0)
 			break;
 
 		/*
 		 * Only clones whose origins were affected could also
 		 * have affected snapshots.
 		 */
 		boolean_t found = B_FALSE;
 		for (int i = 0; i < snap_count; i++) {
 			if (dsl_dir_phys(clone->ds_dir)->dd_origin_obj
 			    == snap_obj_array[i])
 				found = B_TRUE;
 		}
 		dsl_dataset_rele_flags(clone, DS_HOLD_FLAG_DECRYPT, FTAG);
 
 		if (!found)
 			continue;
 
 		clones_t *ct = kmem_zalloc(sizeof (*ct), KM_SLEEP);
 		ct->clone_ds = za->za_first_integer;
 		list_insert_tail(clones_list, ct);
 	}
 
 	zap_cursor_fini(zc);
 	kmem_free(za, sizeof (*za));
 	kmem_free(zc, sizeof (*zc));
 
 out:
 	kmem_free(snap_obj_array, sizeof (*snap_obj_array));
 	return (error);
 }
 
 static int
 process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
     void *uaddr, uint64_t *count)
 {
 	/*
 	 * If zb_birth == 0 or head_ds == 0 it means we failed to retrieve the
 	 * birth txg or the head filesystem of the block pointer. This may
 	 * happen e.g. when an encrypted filesystem is not mounted or when
 	 * the key is not loaded. In this case do not proceed to
 	 * check_filesystem(), instead do the accounting here.
 	 */
 	if (zep->zb_birth == 0 || head_ds == 0) {
 		zbookmark_phys_t zb;
 		zep_to_zb(head_ds, zep, &zb);
 		int error = copyout_entry(&zb, uaddr, count);
 		if (error != 0) {
 			return (error);
 		}
 		return (0);
 	}
 
 	uint64_t top_affected_fs;
 	uint64_t init_count = *count;
 	int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs);
 	if (error == 0) {
 		clones_t *ct;
 		list_t clones_list;
 
 		list_create(&clones_list, sizeof (clones_t),
 		    offsetof(clones_t, node));
 
 		error = check_filesystem(spa, top_affected_fs, zep,
 		    uaddr, count, &clones_list);
 
 		while ((ct = list_remove_head(&clones_list)) != NULL) {
 			error = check_filesystem(spa, ct->clone_ds, zep,
 			    uaddr, count, &clones_list);
 			kmem_free(ct, sizeof (*ct));
 
 			if (error) {
 				while (!list_is_empty(&clones_list)) {
 					ct = list_remove_head(&clones_list);
 					kmem_free(ct, sizeof (*ct));
 				}
 				break;
 			}
 		}
 
 		list_destroy(&clones_list);
 	}
 	if (error == 0 && init_count == *count) {
 		/*
 		 * If we reach this point, no errors have been detected
 		 * in the checked filesystems/snapshots. Before returning mark
 		 * the error block to be removed from the error lists and logs.
 		 */
 		zbookmark_phys_t zb;
 		zep_to_zb(head_ds, zep, &zb);
-		spa_remove_error(spa, &zb, &zep->zb_birth);
+		spa_remove_error(spa, &zb, zep->zb_birth);
 	}
 
 	return (error);
 }
 #endif
 
 /* Return the number of errors in the error log */
 uint64_t
 spa_get_last_errlog_size(spa_t *spa)
 {
 	uint64_t total = 0, count;
 	mutex_enter(&spa->spa_errlog_lock);
 
 	if (spa->spa_errlog_last != 0 &&
 	    zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
 	    &count) == 0)
 		total += count;
 	mutex_exit(&spa->spa_errlog_lock);
 	return (total);
 }
 
 /*
  * If a healed bookmark matches an entry in the error log we stash it in a tree
  * so that we can later remove the related log entries in sync context.
  */
 static void
 spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb,
-    const uint64_t *birth)
+    const uint64_t birth)
 {
 	char name[NAME_MAX_LEN];
 
 	if (obj == 0)
 		return;
 
 	boolean_t held_list = B_FALSE;
 	boolean_t held_log = B_FALSE;
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 		bookmark_to_name(healed_zb, name, sizeof (name));
 
 		if (zap_contains(spa->spa_meta_objset, healed_zb->zb_objset,
 		    name) == 0) {
 			if (!MUTEX_HELD(&spa->spa_errlog_lock)) {
 				mutex_enter(&spa->spa_errlog_lock);
 				held_log = B_TRUE;
 			}
 
 			/*
 			 * Found an error matching healed zb, add zb to our
 			 * tree of healed errors
 			 */
 			avl_tree_t *tree = &spa->spa_errlist_healed;
 			spa_error_entry_t search;
 			spa_error_entry_t *new;
 			avl_index_t where;
 			search.se_bookmark = *healed_zb;
 			if (!MUTEX_HELD(&spa->spa_errlist_lock)) {
 				mutex_enter(&spa->spa_errlist_lock);
 				held_list = B_TRUE;
 			}
 			if (avl_find(tree, &search, &where) != NULL) {
 				if (held_list)
 					mutex_exit(&spa->spa_errlist_lock);
 				if (held_log)
 					mutex_exit(&spa->spa_errlog_lock);
 				return;
 			}
 			new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
 			new->se_bookmark = *healed_zb;
 			avl_insert(tree, new, where);
 			if (held_list)
 				mutex_exit(&spa->spa_errlist_lock);
 			if (held_log)
 				mutex_exit(&spa->spa_errlog_lock);
 		}
 		return;
 	}
 
 	zbookmark_err_phys_t healed_zep;
 	healed_zep.zb_object = healed_zb->zb_object;
 	healed_zep.zb_level = healed_zb->zb_level;
 	healed_zep.zb_blkid = healed_zb->zb_blkid;
-
-	if (birth != NULL)
-		healed_zep.zb_birth = *birth;
-	else
-		healed_zep.zb_birth = 0;
+	healed_zep.zb_birth = birth;
 
 	errphys_to_name(&healed_zep, name, sizeof (name));
 
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_errlog_last);
 	    zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
 		if (zap_contains(spa->spa_meta_objset, za.za_first_integer,
 		    name) == 0) {
 			if (!MUTEX_HELD(&spa->spa_errlog_lock)) {
 				mutex_enter(&spa->spa_errlog_lock);
 				held_log = B_TRUE;
 			}
 
 			avl_tree_t *tree = &spa->spa_errlist_healed;
 			spa_error_entry_t search;
 			spa_error_entry_t *new;
 			avl_index_t where;
 			search.se_bookmark = *healed_zb;
 
 			if (!MUTEX_HELD(&spa->spa_errlist_lock)) {
 				mutex_enter(&spa->spa_errlist_lock);
 				held_list = B_TRUE;
 			}
 
 			if (avl_find(tree, &search, &where) != NULL) {
 				if (held_list)
 					mutex_exit(&spa->spa_errlist_lock);
 				if (held_log)
 					mutex_exit(&spa->spa_errlog_lock);
 				continue;
 			}
 			new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
 			new->se_bookmark = *healed_zb;
 			new->se_zep = healed_zep;
 			avl_insert(tree, new, where);
 
 			if (held_list)
 				mutex_exit(&spa->spa_errlist_lock);
 			if (held_log)
 				mutex_exit(&spa->spa_errlog_lock);
 		}
 	}
 	zap_cursor_fini(&zc);
 }
 
 /*
  * If this error exists in the given tree remove it.
  */
 static void
 remove_error_from_list(spa_t *spa, avl_tree_t *t, const zbookmark_phys_t *zb)
 {
 	spa_error_entry_t search, *found;
 	avl_index_t where;
 
 	mutex_enter(&spa->spa_errlist_lock);
 	search.se_bookmark = *zb;
 	if ((found = avl_find(t, &search, &where)) != NULL) {
 		avl_remove(t, found);
 		kmem_free(found, sizeof (spa_error_entry_t));
 	}
 	mutex_exit(&spa->spa_errlist_lock);
 }
 
 
 /*
  * Removes all of the recv healed errors from both on-disk error logs
  */
 static void
 spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx)
 {
 	char name[NAME_MAX_LEN];
 	spa_error_entry_t *se;
 	void *cookie = NULL;
 
 	ASSERT(MUTEX_HELD(&spa->spa_errlog_lock));
 
 	while ((se = avl_destroy_nodes(&spa->spa_errlist_healed,
 	    &cookie)) != NULL) {
 		remove_error_from_list(spa, s, &se->se_bookmark);
 		remove_error_from_list(spa, l, &se->se_bookmark);
 
 		if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 			bookmark_to_name(&se->se_bookmark, name, sizeof (name));
 			(void) zap_remove(spa->spa_meta_objset,
 			    spa->spa_errlog_last, name, tx);
 			(void) zap_remove(spa->spa_meta_objset,
 			    spa->spa_errlog_scrub, name, tx);
 		} else {
 			errphys_to_name(&se->se_zep, name, sizeof (name));
 			zap_cursor_t zc;
 			zap_attribute_t za;
 			for (zap_cursor_init(&zc, spa->spa_meta_objset,
 			    spa->spa_errlog_last);
 			    zap_cursor_retrieve(&zc, &za) == 0;
 			    zap_cursor_advance(&zc)) {
 				zap_remove(spa->spa_meta_objset,
 				    za.za_first_integer, name, tx);
 			}
 			zap_cursor_fini(&zc);
 
 			for (zap_cursor_init(&zc, spa->spa_meta_objset,
 			    spa->spa_errlog_scrub);
 			    zap_cursor_retrieve(&zc, &za) == 0;
 			    zap_cursor_advance(&zc)) {
 				zap_remove(spa->spa_meta_objset,
 				    za.za_first_integer, name, tx);
 			}
 			zap_cursor_fini(&zc);
 		}
 		kmem_free(se, sizeof (spa_error_entry_t));
 	}
 }
 
 /*
  * Stash away healed bookmarks to remove them from the on-disk error logs
  * later in spa_remove_healed_errors().
  */
 void
-spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, const uint64_t *birth)
+spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, uint64_t birth)
 {
 	spa_add_healed_error(spa, spa->spa_errlog_last, zb, birth);
 	spa_add_healed_error(spa, spa->spa_errlog_scrub, zb, birth);
 }
 
 static uint64_t
 approx_errlog_size_impl(spa_t *spa, uint64_t spa_err_obj)
 {
 	if (spa_err_obj == 0)
 		return (0);
 	uint64_t total = 0;
 
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
 	    zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
 		uint64_t count;
 		if (zap_count(spa->spa_meta_objset, za.za_first_integer,
 		    &count) == 0)
 			total += count;
 	}
 	zap_cursor_fini(&zc);
 	return (total);
 }
 
 /*
  * Return the approximate number of errors currently in the error log.  This
  * will be nonzero if there are some errors, but otherwise it may be more
  * or less than the number of entries returned by spa_get_errlog().
  */
 uint64_t
 spa_approx_errlog_size(spa_t *spa)
 {
 	uint64_t total = 0;
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 		mutex_enter(&spa->spa_errlog_lock);
 		uint64_t count;
 		if (spa->spa_errlog_scrub != 0 &&
 		    zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
 		    &count) == 0)
 			total += count;
 
 		if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
 		    zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
 		    &count) == 0)
 			total += count;
 		mutex_exit(&spa->spa_errlog_lock);
 
 	} else {
 		mutex_enter(&spa->spa_errlog_lock);
 		total += approx_errlog_size_impl(spa, spa->spa_errlog_last);
 		total += approx_errlog_size_impl(spa, spa->spa_errlog_scrub);
 		mutex_exit(&spa->spa_errlog_lock);
 	}
 	mutex_enter(&spa->spa_errlist_lock);
 	total += avl_numnodes(&spa->spa_errlist_last);
 	total += avl_numnodes(&spa->spa_errlist_scrub);
 	mutex_exit(&spa->spa_errlist_lock);
 	return (total);
 }
 
 /*
  * This function sweeps through an on-disk error log and stores all bookmarks
  * as error bookmarks in a new ZAP object. At the end we discard the old one,
  * and spa_update_errlog() will set the spa's on-disk error log to new ZAP
  * object.
  */
 static void
 sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj,
     dmu_tx_t *tx)
 {
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	zbookmark_phys_t zb;
 	uint64_t count;
 
 	*newobj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG,
 	    DMU_OT_NONE, 0, tx);
 
 	/*
 	 * If we cannnot perform the upgrade we should clear the old on-disk
 	 * error logs.
 	 */
 	if (zap_count(spa->spa_meta_objset, spa_err_obj, &count) != 0) {
 		VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
 		return;
 	}
 
 	for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		if (spa_upgrade_errlog_limit != 0 &&
 		    zc.zc_cd == spa_upgrade_errlog_limit)
 			break;
 
 		name_to_bookmark(za.za_name, &zb);
 
 		zbookmark_err_phys_t zep;
 		zep.zb_object = zb.zb_object;
 		zep.zb_level = zb.zb_level;
 		zep.zb_blkid = zb.zb_blkid;
 		zep.zb_birth = 0;
 
 		/*
 		 * In case of an error we should simply continue instead of
 		 * returning prematurely. See the next comment.
 		 */
 		uint64_t head_ds;
 		dsl_pool_t *dp = spa->spa_dsl_pool;
 		dsl_dataset_t *ds;
 		objset_t *os;
 
 		int error = dsl_dataset_hold_obj_flags(dp, zb.zb_objset,
 		    DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
 		if (error != 0)
 			continue;
 
 		head_ds = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
 
 		/*
 		 * The objset and the dnode are required for getting the block
 		 * pointer, which is used to determine if BP_IS_HOLE(). If
 		 * getting the objset or the dnode fails, do not create a
 		 * zap entry (presuming we know the dataset) as this may create
 		 * spurious errors that we cannot ever resolve. If an error is
 		 * truly persistent, it should re-appear after a scan.
 		 */
 		if (dmu_objset_from_ds(ds, &os) != 0) {
 			dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 			continue;
 		}
 
 		dnode_t *dn;
 		blkptr_t bp;
 
 		if (dnode_hold(os, zep.zb_object, FTAG, &dn) != 0) {
 			dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 			continue;
 		}
 
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		error = dbuf_dnode_findbp(dn, zep.zb_level, zep.zb_blkid, &bp,
 		    NULL, NULL);
 		if (error == EACCES)
 			error = 0;
 		else if (!error)
-			zep.zb_birth = bp.blk_birth;
+			zep.zb_birth = BP_GET_LOGICAL_BIRTH(&bp);
 
 		rw_exit(&dn->dn_struct_rwlock);
 		dnode_rele(dn, FTAG);
 		dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 
 		if (error != 0 || BP_IS_HOLE(&bp))
 			continue;
 
 		uint64_t err_obj;
 		error = zap_lookup_int_key(spa->spa_meta_objset, *newobj,
 		    head_ds, &err_obj);
 
 		if (error == ENOENT) {
 			err_obj = zap_create(spa->spa_meta_objset,
 			    DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);
 
 			(void) zap_update_int_key(spa->spa_meta_objset,
 			    *newobj, head_ds, err_obj, tx);
 		}
 
 		char buf[64];
 		errphys_to_name(&zep, buf, sizeof (buf));
 
 		const char *name = "";
 		(void) zap_update(spa->spa_meta_objset, err_obj,
 		    buf, 1, strlen(name) + 1, name, tx);
 	}
 	zap_cursor_fini(&zc);
 
 	VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
 }
 
 void
 spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx)
 {
 	uint64_t newobj = 0;
 
 	mutex_enter(&spa->spa_errlog_lock);
 	if (spa->spa_errlog_last != 0) {
 		sync_upgrade_errlog(spa, spa->spa_errlog_last, &newobj, tx);
 		spa->spa_errlog_last = newobj;
 
 		(void) zap_update(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
 		    sizeof (uint64_t), 1, &spa->spa_errlog_last, tx);
 	}
 
 	if (spa->spa_errlog_scrub != 0) {
 		sync_upgrade_errlog(spa, spa->spa_errlog_scrub, &newobj, tx);
 		spa->spa_errlog_scrub = newobj;
 
 		(void) zap_update(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
 		    sizeof (uint64_t), 1, &spa->spa_errlog_scrub, tx);
 	}
 
 	mutex_exit(&spa->spa_errlog_lock);
 }
 
 #ifdef _KERNEL
 /*
  * If an error block is shared by two datasets it will be counted twice.
  */
 static int
 process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count)
 {
 	if (obj == 0)
 		return (0);
 
 	zap_cursor_t *zc;
 	zap_attribute_t *za;
 
 	zc = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP);
 	za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP);
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 		for (zap_cursor_init(zc, spa->spa_meta_objset, obj);
 		    zap_cursor_retrieve(zc, za) == 0;
 		    zap_cursor_advance(zc)) {
 			if (*count == 0) {
 				zap_cursor_fini(zc);
 				kmem_free(zc, sizeof (*zc));
 				kmem_free(za, sizeof (*za));
 				return (SET_ERROR(ENOMEM));
 			}
 
 			zbookmark_phys_t zb;
 			name_to_bookmark(za->za_name, &zb);
 
 			int error = copyout_entry(&zb, uaddr, count);
 			if (error != 0) {
 				zap_cursor_fini(zc);
 				kmem_free(zc, sizeof (*zc));
 				kmem_free(za, sizeof (*za));
 				return (error);
 			}
 		}
 		zap_cursor_fini(zc);
 		kmem_free(zc, sizeof (*zc));
 		kmem_free(za, sizeof (*za));
 		return (0);
 	}
 
 	for (zap_cursor_init(zc, spa->spa_meta_objset, obj);
 	    zap_cursor_retrieve(zc, za) == 0;
 	    zap_cursor_advance(zc)) {
 
 		zap_cursor_t *head_ds_cursor;
 		zap_attribute_t *head_ds_attr;
 
 		head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP);
 		head_ds_attr = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP);
 
 		uint64_t head_ds_err_obj = za->za_first_integer;
 		uint64_t head_ds;
 		name_to_object(za->za_name, &head_ds);
 		for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset,
 		    head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor,
 		    head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) {
 
 			zbookmark_err_phys_t head_ds_block;
 			name_to_errphys(head_ds_attr->za_name, &head_ds_block);
 			int error = process_error_block(spa, head_ds,
 			    &head_ds_block, uaddr, count);
 
 			if (error != 0) {
 				zap_cursor_fini(head_ds_cursor);
 				kmem_free(head_ds_cursor,
 				    sizeof (*head_ds_cursor));
 				kmem_free(head_ds_attr, sizeof (*head_ds_attr));
 
 				zap_cursor_fini(zc);
 				kmem_free(za, sizeof (*za));
 				kmem_free(zc, sizeof (*zc));
 				return (error);
 			}
 		}
 		zap_cursor_fini(head_ds_cursor);
 		kmem_free(head_ds_cursor, sizeof (*head_ds_cursor));
 		kmem_free(head_ds_attr, sizeof (*head_ds_attr));
 	}
 	zap_cursor_fini(zc);
 	kmem_free(za, sizeof (*za));
 	kmem_free(zc, sizeof (*zc));
 	return (0);
 }
 
 static int
 process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count)
 {
 	spa_error_entry_t *se;
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 		for (se = avl_first(list); se != NULL;
 		    se = AVL_NEXT(list, se)) {
 			int error =
 			    copyout_entry(&se->se_bookmark, uaddr, count);
 			if (error != 0) {
 				return (error);
 			}
 		}
 		return (0);
 	}
 
 	for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
 		uint64_t head_ds = 0;
 		int error = get_head_ds(spa, se->se_bookmark.zb_objset,
 		    &head_ds);
 
 		/*
 		 * If get_head_ds() errors out, set the head filesystem
 		 * to the filesystem stored in the bookmark of the
 		 * error block.
 		 */
 		if (error != 0)
 			head_ds = se->se_bookmark.zb_objset;
 
 		error = process_error_block(spa, head_ds,
 		    &se->se_zep, uaddr, count);
 		if (error != 0)
 			return (error);
 	}
 	return (0);
 }
 #endif
 
 /*
  * Copy all known errors to userland as an array of bookmarks.  This is
  * actually a union of the on-disk last log and current log, as well as any
  * pending error requests.
  *
  * Because the act of reading the on-disk log could cause errors to be
  * generated, we have two separate locks: one for the error log and one for the
  * in-core error lists.  We only need the error list lock to log and error, so
  * we grab the error log lock while we read the on-disk logs, and only pick up
  * the error list lock when we are finished.
  */
 int
 spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count)
 {
 	int ret = 0;
 
 #ifdef _KERNEL
 	/*
 	 * The pool config lock is needed to hold a dataset_t via (among other
 	 * places) process_error_list() -> process_error_block()->
 	 * find_top_affected_fs(), and lock ordering requires that we get it
 	 * before the spa_errlog_lock.
 	 */
 	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
 	mutex_enter(&spa->spa_errlog_lock);
 
 	ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
 
 	if (!ret && !spa->spa_scrub_finished)
 		ret = process_error_log(spa, spa->spa_errlog_last, uaddr,
 		    count);
 
 	mutex_enter(&spa->spa_errlist_lock);
 	if (!ret)
 		ret = process_error_list(spa, &spa->spa_errlist_scrub, uaddr,
 		    count);
 	if (!ret)
 		ret = process_error_list(spa, &spa->spa_errlist_last, uaddr,
 		    count);
 	mutex_exit(&spa->spa_errlist_lock);
 
 	mutex_exit(&spa->spa_errlog_lock);
 	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
 #else
 	(void) spa, (void) uaddr, (void) count;
 #endif
 
 	return (ret);
 }
 
 /*
  * Called when a scrub completes.  This simply set a bit which tells which AVL
  * tree to add new errors.  spa_errlog_sync() is responsible for actually
  * syncing the changes to the underlying objects.
  */
 void
 spa_errlog_rotate(spa_t *spa)
 {
 	mutex_enter(&spa->spa_errlist_lock);
 	spa->spa_scrub_finished = B_TRUE;
 	mutex_exit(&spa->spa_errlist_lock);
 }
 
 /*
  * Discard any pending errors from the spa_t.  Called when unloading a faulted
  * pool, as the errors encountered during the open cannot be synced to disk.
  */
 void
 spa_errlog_drain(spa_t *spa)
 {
 	spa_error_entry_t *se;
 	void *cookie;
 
 	mutex_enter(&spa->spa_errlist_lock);
 
 	cookie = NULL;
 	while ((se = avl_destroy_nodes(&spa->spa_errlist_last,
 	    &cookie)) != NULL)
 		kmem_free(se, sizeof (spa_error_entry_t));
 	cookie = NULL;
 	while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub,
 	    &cookie)) != NULL)
 		kmem_free(se, sizeof (spa_error_entry_t));
 
 	mutex_exit(&spa->spa_errlist_lock);
 }
 
 /*
  * Process a list of errors into the current on-disk log.
  */
 void
 sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
 {
 	spa_error_entry_t *se;
 	char buf[NAME_MAX_LEN];
 	void *cookie;
 
 	if (avl_numnodes(t) == 0)
 		return;
 
 	/* create log if necessary */
 	if (*obj == 0)
 		*obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG,
 		    DMU_OT_NONE, 0, tx);
 
 	/* add errors to the current log */
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 		for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
 			bookmark_to_name(&se->se_bookmark, buf, sizeof (buf));
 
 			const char *name = se->se_name ? se->se_name : "";
 			(void) zap_update(spa->spa_meta_objset, *obj, buf, 1,
 			    strlen(name) + 1, name, tx);
 		}
 	} else {
 		for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
 			zbookmark_err_phys_t zep;
 			zep.zb_object = se->se_zep.zb_object;
 			zep.zb_level = se->se_zep.zb_level;
 			zep.zb_blkid = se->se_zep.zb_blkid;
 			zep.zb_birth = se->se_zep.zb_birth;
 
 			uint64_t head_ds = 0;
 			int error = get_head_ds(spa, se->se_bookmark.zb_objset,
 			    &head_ds);
 
 			/*
 			 * If get_head_ds() errors out, set the head filesystem
 			 * to the filesystem stored in the bookmark of the
 			 * error block.
 			 */
 			if (error != 0)
 				head_ds = se->se_bookmark.zb_objset;
 
 			uint64_t err_obj;
 			error = zap_lookup_int_key(spa->spa_meta_objset,
 			    *obj, head_ds, &err_obj);
 
 			if (error == ENOENT) {
 				err_obj = zap_create(spa->spa_meta_objset,
 				    DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);
 
 				(void) zap_update_int_key(spa->spa_meta_objset,
 				    *obj, head_ds, err_obj, tx);
 			}
 			errphys_to_name(&zep, buf, sizeof (buf));
 
 			const char *name = se->se_name ? se->se_name : "";
 			(void) zap_update(spa->spa_meta_objset,
 			    err_obj, buf, 1, strlen(name) + 1, name, tx);
 		}
 	}
 	/* purge the error list */
 	cookie = NULL;
 	while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
 		kmem_free(se, sizeof (spa_error_entry_t));
 }
 
 static void
 delete_errlog(spa_t *spa, uint64_t spa_err_obj, dmu_tx_t *tx)
 {
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 		zap_cursor_t zc;
 		zap_attribute_t za;
 		for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
 		    zap_cursor_retrieve(&zc, &za) == 0;
 		    zap_cursor_advance(&zc)) {
 			VERIFY0(dmu_object_free(spa->spa_meta_objset,
 			    za.za_first_integer, tx));
 		}
 		zap_cursor_fini(&zc);
 	}
 	VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
 }
 
 /*
  * Sync the error log out to disk.  This is a little tricky because the act of
  * writing the error log requires the spa_errlist_lock.  So, we need to lock the
  * error lists, take a copy of the lists, and then reinitialize them.  Then, we
  * drop the error list lock and take the error log lock, at which point we
  * do the errlog processing.  Then, if we encounter an I/O error during this
  * process, we can successfully add the error to the list.  Note that this will
  * result in the perpetual recycling of errors, but it is an unlikely situation
  * and not a performance critical operation.
  */
 void
 spa_errlog_sync(spa_t *spa, uint64_t txg)
 {
 	dmu_tx_t *tx;
 	avl_tree_t scrub, last;
 	int scrub_finished;
 
 	mutex_enter(&spa->spa_errlist_lock);
 
 	/*
 	 * Bail out early under normal circumstances.
 	 */
 	if (avl_numnodes(&spa->spa_errlist_scrub) == 0 &&
 	    avl_numnodes(&spa->spa_errlist_last) == 0 &&
 	    avl_numnodes(&spa->spa_errlist_healed) == 0 &&
 	    !spa->spa_scrub_finished) {
 		mutex_exit(&spa->spa_errlist_lock);
 		return;
 	}
 
 	spa_get_errlists(spa, &last, &scrub);
 	scrub_finished = spa->spa_scrub_finished;
 	spa->spa_scrub_finished = B_FALSE;
 
 	mutex_exit(&spa->spa_errlist_lock);
 
 	/*
 	 * The pool config lock is needed to hold a dataset_t via
 	 * sync_error_list() -> get_head_ds(), and lock ordering
 	 * requires that we get it before the spa_errlog_lock.
 	 */
 	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
 	mutex_enter(&spa->spa_errlog_lock);
 
 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 
 	/*
 	 * Remove healed errors from errors.
 	 */
 	spa_remove_healed_errors(spa, &last, &scrub, tx);
 
 	/*
 	 * Sync out the current list of errors.
 	 */
 	sync_error_list(spa, &last, &spa->spa_errlog_last, tx);
 
 	/*
 	 * Rotate the log if necessary.
 	 */
 	if (scrub_finished) {
 		if (spa->spa_errlog_last != 0)
 			delete_errlog(spa, spa->spa_errlog_last, tx);
 		spa->spa_errlog_last = spa->spa_errlog_scrub;
 		spa->spa_errlog_scrub = 0;
 
 		sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx);
 	}
 
 	/*
 	 * Sync out any pending scrub errors.
 	 */
 	sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx);
 
 	/*
 	 * Update the MOS to reflect the new values.
 	 */
 	(void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1,
 	    &spa->spa_errlog_last, tx);
 	(void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1,
 	    &spa->spa_errlog_scrub, tx);
 
 	dmu_tx_commit(tx);
 
 	mutex_exit(&spa->spa_errlog_lock);
 	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
 }
 
 static void
 delete_dataset_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t ds,
     dmu_tx_t *tx)
 {
 	if (spa_err_obj == 0)
 		return;
 
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
 	    zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
 		uint64_t head_ds;
 		name_to_object(za.za_name, &head_ds);
 		if (head_ds == ds) {
 			(void) zap_remove(spa->spa_meta_objset, spa_err_obj,
 			    za.za_name, tx);
 			VERIFY0(dmu_object_free(spa->spa_meta_objset,
 			    za.za_first_integer, tx));
 			break;
 		}
 	}
 	zap_cursor_fini(&zc);
 }
 
 void
 spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx)
 {
 	mutex_enter(&spa->spa_errlog_lock);
 	delete_dataset_errlog(spa, spa->spa_errlog_scrub, ds, tx);
 	delete_dataset_errlog(spa, spa->spa_errlog_last, ds, tx);
 	mutex_exit(&spa->spa_errlog_lock);
 }
 
 static int
 find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head,
     uint64_t *txg)
 {
 	dsl_dataset_t *ds;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 
 	int error = dsl_dataset_hold_obj_flags(dp, old_head,
 	    DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 
 	while (prev_obj != 0) {
 		dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 		if ((error = dsl_dataset_hold_obj_flags(dp, prev_obj,
 		    DS_HOLD_FLAG_DECRYPT, FTAG, &ds)) == 0 &&
 		    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj == new_head)
 			break;
 
 		if (error != 0)
 			return (error);
 
 		prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 		prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	}
 	dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 	ASSERT(prev_obj != 0);
 	*txg = prev_obj_txg;
 	return (0);
 }
 
 static void
 swap_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t new_head, uint64_t
     old_head, dmu_tx_t *tx)
 {
 	if (spa_err_obj == 0)
 		return;
 
 	uint64_t old_head_errlog;
 	int error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj,
 	    old_head, &old_head_errlog);
 
 	/* If no error log, then there is nothing to do. */
 	if (error != 0)
 		return;
 
 	uint64_t txg;
 	error = find_txg_ancestor_snapshot(spa, new_head, old_head, &txg);
 	if (error != 0)
 		return;
 
 	/*
 	 * Create an error log if the file system being promoted does not
 	 * already have one.
 	 */
 	uint64_t new_head_errlog;
 	error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, new_head,
 	    &new_head_errlog);
 
 	if (error != 0) {
 		new_head_errlog = zap_create(spa->spa_meta_objset,
 		    DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);
 
 		(void) zap_update_int_key(spa->spa_meta_objset, spa_err_obj,
 		    new_head, new_head_errlog, tx);
 	}
 
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	zbookmark_err_phys_t err_block;
 	for (zap_cursor_init(&zc, spa->spa_meta_objset, old_head_errlog);
 	    zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
 
 		const char *name = "";
 		name_to_errphys(za.za_name, &err_block);
 		if (err_block.zb_birth < txg) {
 			(void) zap_update(spa->spa_meta_objset, new_head_errlog,
 			    za.za_name, 1, strlen(name) + 1, name, tx);
 
 			(void) zap_remove(spa->spa_meta_objset, old_head_errlog,
 			    za.za_name, tx);
 		}
 	}
 	zap_cursor_fini(&zc);
 }
 
 void
 spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds,
     dmu_tx_t *tx)
 {
 	mutex_enter(&spa->spa_errlog_lock);
 	swap_errlog(spa, spa->spa_errlog_scrub, new_head_ds, old_head_ds, tx);
 	swap_errlog(spa, spa->spa_errlog_last, new_head_ds, old_head_ds, tx);
 	mutex_exit(&spa->spa_errlog_lock);
 }
 
 #if defined(_KERNEL)
 /* error handling */
 EXPORT_SYMBOL(spa_log_error);
 EXPORT_SYMBOL(spa_approx_errlog_size);
 EXPORT_SYMBOL(spa_get_last_errlog_size);
 EXPORT_SYMBOL(spa_get_errlog);
 EXPORT_SYMBOL(spa_errlog_rotate);
 EXPORT_SYMBOL(spa_errlog_drain);
 EXPORT_SYMBOL(spa_errlog_sync);
 EXPORT_SYMBOL(spa_get_errlists);
 EXPORT_SYMBOL(spa_delete_dataset_errlog);
 EXPORT_SYMBOL(spa_swap_errlog);
 EXPORT_SYMBOL(sync_error_list);
 EXPORT_SYMBOL(spa_upgrade_errlog);
 EXPORT_SYMBOL(find_top_affected_fs);
 EXPORT_SYMBOL(find_birth_txg);
 EXPORT_SYMBOL(zep_to_zb);
 EXPORT_SYMBOL(name_to_errphys);
 #endif
 
 /* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs_spa, spa_, upgrade_errlog_limit, UINT, ZMOD_RW,
 	"Limit the number of errors which will be upgraded to the new "
 	"on-disk error log when enabling head_errlog");
 /* END CSTYLED */
diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c
index 873089a53e34..32158e8c592c 100644
--- a/module/zfs/spa_log_spacemap.c
+++ b/module/zfs/spa_log_spacemap.c
@@ -1,1407 +1,1407 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2018, 2019 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu_objset.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/spa_log_spacemap.h>
 #include <sys/vdev_impl.h>
 #include <sys/zap.h>
 
 /*
  * Log Space Maps
  *
  * Log space maps are an optimization in ZFS metadata allocations for pools
  * whose workloads are primarily random-writes. Random-write workloads are also
  * typically random-free, meaning that they are freeing from locations scattered
  * throughout the pool. This means that each TXG we will have to append some
  * FREE records to almost every metaslab. With log space maps, we hold their
  * changes in memory and log them altogether in one pool-wide space map on-disk
  * for persistence. As more blocks are accumulated in the log space maps and
  * more unflushed changes are accounted in memory, we flush a selected group
  * of metaslabs every TXG to relieve memory pressure and potential overheads
  * when loading the pool. Flushing a metaslab to disk relieves memory as we
  * flush any unflushed changes from memory to disk (i.e. the metaslab's space
  * map) and saves import time by making old log space maps obsolete and
  * eventually destroying them. [A log space map is said to be obsolete when all
  * its entries have made it to their corresponding metaslab space maps].
  *
  * == On disk data structures used ==
  *
  * - The pool has a new feature flag and a new entry in the MOS. The feature
  *   is activated when we create the first log space map and remains active
  *   for the lifetime of the pool. The new entry in the MOS Directory [refer
  *   to DMU_POOL_LOG_SPACEMAP_ZAP] is populated with a ZAP whose key-value
  *   pairs are of the form <key: txg, value: log space map object for that txg>.
  *   This entry is our on-disk reference of the log space maps that exist in
  *   the pool for each TXG and it is used during import to load all the
  *   metaslab unflushed changes in memory. To see how this structure is first
  *   created and later populated refer to spa_generate_syncing_log_sm(). To see
  *   how it is used during import time refer to spa_ld_log_sm_metadata().
  *
  * - Each vdev has a new entry in its vdev_top_zap (see field
  *   VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS) which holds the msp_unflushed_txg of
  *   each metaslab in this vdev. This field is the on-disk counterpart of the
  *   in-memory field ms_unflushed_txg which tells us from which TXG and onwards
  *   the metaslab haven't had its changes flushed. During import, we use this
  *   to ignore any entries in the space map log that are for this metaslab but
  *   from a TXG before msp_unflushed_txg. At that point, we also populate its
  *   in-memory counterpart and from there both fields are updated every time
  *   we flush that metaslab.
  *
  * - A space map is created every TXG and, during that TXG, it is used to log
  *   all incoming changes (the log space map). When created, the log space map
  *   is referenced in memory by spa_syncing_log_sm and its object ID is inserted
  *   to the space map ZAP mentioned above. The log space map is closed at the
  *   end of the TXG and will be destroyed when it becomes fully obsolete. We
  *   know when a log space map has become obsolete by looking at the oldest
  *   (and smallest) ms_unflushed_txg in the pool. If the value of that is bigger
  *   than the log space map's TXG, then it means that there is no metaslab who
  *   doesn't have the changes from that log and we can therefore destroy it.
  *   [see spa_cleanup_old_sm_logs()].
  *
  * == Important in-memory structures ==
  *
  * - The per-spa field spa_metaslabs_by_flushed sorts all the metaslabs in
  *   the pool by their ms_unflushed_txg field. It is primarily used for three
  *   reasons. First of all, it is used during flushing where we try to flush
  *   metaslabs in-order from the oldest-flushed to the most recently flushed
  *   every TXG. Secondly, it helps us to lookup the ms_unflushed_txg of the
  *   oldest flushed metaslab to distinguish which log space maps have become
  *   obsolete and which ones are still relevant. Finally it tells us which
  *   metaslabs have unflushed changes in a pool where this feature was just
  *   enabled, as we don't immediately add all of the pool's metaslabs but we
  *   add them over time as they go through metaslab_sync(). The reason that
  *   we do that is to ease these pools into the behavior of the flushing
  *   algorithm (described later on).
  *
  * - The per-spa field spa_sm_logs_by_txg can be thought as the in-memory
  *   counterpart of the space map ZAP mentioned above. It's an AVL tree whose
  *   nodes represent the log space maps in the pool. This in-memory
  *   representation of log space maps in the pool sorts the log space maps by
  *   the TXG that they were created (which is also the TXG of their unflushed
  *   changes). It also contains the following extra information for each
  *   space map:
  *   [1] The number of metaslabs that were last flushed on that TXG. This is
  *       important because if that counter is zero and this is the oldest
  *       log then it means that it is also obsolete.
  *   [2] The number of blocks of that space map. This field is used by the
  *       block heuristic of our flushing algorithm (described later on).
  *       It represents how many blocks of metadata changes ZFS had to write
  *       to disk for that TXG.
  *
  * - The per-spa field spa_log_summary is a list of entries that summarizes
  *   the metaslab and block counts of all the nodes of the spa_sm_logs_by_txg
  *   AVL tree mentioned above. The reason this exists is that our flushing
  *   algorithm (described later) tries to estimate how many metaslabs to flush
  *   in each TXG by iterating over all the log space maps and looking at their
  *   block counts. Summarizing that information means that don't have to
  *   iterate through each space map, minimizing the runtime overhead of the
  *   flushing algorithm which would be induced in syncing context. In terms of
  *   implementation the log summary is used as a queue:
  *   * we modify or pop entries from its head when we flush metaslabs
  *   * we modify or append entries to its tail when we sync changes.
  *
  * - Each metaslab has two new range trees that hold its unflushed changes,
  *   ms_unflushed_allocs and ms_unflushed_frees. These are always disjoint.
  *
  * == Flushing algorithm ==
  *
  * The decision of how many metaslabs to flush on a give TXG is guided by
  * two heuristics:
  *
  * [1] The memory heuristic -
  * We keep track of the memory used by the unflushed trees from all the
  * metaslabs [see sus_memused of spa_unflushed_stats] and we ensure that it
  * stays below a certain threshold which is determined by an arbitrary hard
  * limit and an arbitrary percentage of the system's memory [see
  * spa_log_exceeds_memlimit()]. When we see that the memory usage of the
  * unflushed changes are passing that threshold, we flush metaslabs, which
  * empties their unflushed range trees, reducing the memory used.
  *
  * [2] The block heuristic -
  * We try to keep the total number of blocks in the log space maps in check
  * so the log doesn't grow indefinitely and we don't induce a lot of overhead
  * when loading the pool. At the same time we don't want to flush a lot of
  * metaslabs too often as this would defeat the purpose of the log space map.
  * As a result we set a limit in the amount of blocks that we think it's
  * acceptable for the log space maps to have and try not to cross it.
  * [see sus_blocklimit from spa_unflushed_stats].
  *
  * In order to stay below the block limit every TXG we have to estimate how
  * many metaslabs we need to flush based on the current rate of incoming blocks
  * and our history of log space map blocks. The main idea here is to answer
  * the question of how many metaslabs do we need to flush in order to get rid
  * at least an X amount of log space map blocks. We can answer this question
  * by iterating backwards from the oldest log space map to the newest one
  * and looking at their metaslab and block counts. At this point the log summary
  * mentioned above comes handy as it reduces the amount of things that we have
  * to iterate (even though it may reduce the preciseness of our estimates due
  * to its aggregation of data). So with that in mind, we project the incoming
  * rate of the current TXG into the future and attempt to approximate how many
  * metaslabs would we need to flush from now in order to avoid exceeding our
  * block limit in different points in the future (granted that we would keep
  * flushing the same number of metaslabs for every TXG). Then we take the
  * maximum number from all these estimates to be on the safe side. For the
  * exact implementation details of algorithm refer to
  * spa_estimate_metaslabs_to_flush.
  */
 
 /*
  * This is used as the block size for the space maps used for the
  * log space map feature. These space maps benefit from a bigger
  * block size as we expect to be writing a lot of data to them at
  * once.
  */
 static const unsigned long zfs_log_sm_blksz = 1ULL << 17;
 
 /*
  * Percentage of the overall system's memory that ZFS allows to be
  * used for unflushed changes (e.g. the sum of size of all the nodes
  * in the unflushed trees).
  *
  * Note that this value is calculated over 1000000 for finer granularity
  * (thus the _ppm suffix; reads as "parts per million"). As an example,
  * the default of 1000 allows 0.1% of memory to be used.
  */
 static uint64_t zfs_unflushed_max_mem_ppm = 1000;
 
 /*
  * Specific hard-limit in memory that ZFS allows to be used for
  * unflushed changes.
  */
 static uint64_t zfs_unflushed_max_mem_amt = 1ULL << 30;
 
 /*
  * The following tunable determines the number of blocks that can be used for
  * the log space maps. It is expressed as a percentage of the total number of
  * metaslabs in the pool (i.e. the default of 400 means that the number of log
  * blocks is capped at 4 times the number of metaslabs).
  *
  * This value exists to tune our flushing algorithm, with higher values
  * flushing metaslabs less often (doing less I/Os) per TXG versus lower values
  * flushing metaslabs more aggressively with the upside of saving overheads
  * when loading the pool. Another factor in this tradeoff is that flushing
  * less often can potentially lead to better utilization of the metaslab space
  * map's block size as we accumulate more changes per flush.
  *
  * Given that this tunable indirectly controls the flush rate (metaslabs
  * flushed per txg) and that's why making it a percentage in terms of the
  * number of metaslabs in the pool makes sense here.
  *
  * As a rule of thumb we default this tunable to 400% based on the following:
  *
  * 1] Assuming a constant flush rate and a constant incoming rate of log blocks
  *    it is reasonable to expect that the amount of obsolete entries changes
  *    linearly from txg to txg (e.g. the oldest log should have the most
  *    obsolete entries, and the most recent one the least). With this we could
  *    say that, at any given time, about half of the entries in the whole space
  *    map log are obsolete. Thus for every two entries for a metaslab in the
  *    log space map, only one of them is valid and actually makes it to the
  *    metaslab's space map.
  *    [factor of 2]
  * 2] Each entry in the log space map is guaranteed to be two words while
  *    entries in metaslab space maps are generally single-word.
  *    [an extra factor of 2 - 400% overall]
  * 3] Even if [1] and [2] are slightly less than 2 each, we haven't taken into
  *    account any consolidation of segments from the log space map to the
  *    unflushed range trees nor their history (e.g. a segment being allocated,
  *    then freed, then allocated again means 3 log space map entries but 0
  *    metaslab space map entries). Depending on the workload, we've seen ~1.8
  *    non-obsolete log space map entries per metaslab entry, for a total of
  *    ~600%. Since most of these estimates though are workload dependent, we
  *    default on 400% to be conservative.
  *
  *    Thus we could say that even in the worst
  *    case of [1] and [2], the factor should end up being 4.
  *
  * That said, regardless of the number of metaslabs in the pool we need to
  * provide upper and lower bounds for the log block limit.
  * [see zfs_unflushed_log_block_{min,max}]
  */
 static uint_t zfs_unflushed_log_block_pct = 400;
 
 /*
  * If the number of metaslabs is small and our incoming rate is high, we could
  * get into a situation that we are flushing all our metaslabs every TXG. Thus
  * we always allow at least this many log blocks.
  */
 static uint64_t zfs_unflushed_log_block_min = 1000;
 
 /*
  * If the log becomes too big, the import time of the pool can take a hit in
  * terms of performance. Thus we have a hard limit in the size of the log in
  * terms of blocks.
  */
 static uint64_t zfs_unflushed_log_block_max = (1ULL << 17);
 
 /*
  * Also we have a hard limit in the size of the log in terms of dirty TXGs.
  */
 static uint64_t zfs_unflushed_log_txg_max = 1000;
 
 /*
  * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and
  * stability of the flushing algorithm (longer summary) vs its runtime overhead
  * (smaller summary is faster to traverse).
  */
 static uint64_t zfs_max_logsm_summary_length = 10;
 
 /*
  * Tunable that sets the lower bound on the metaslabs to flush every TXG.
  *
  * Setting this to 0 has no effect since if the pool is idle we won't even be
  * creating log space maps and therefore we won't be flushing. On the other
  * hand if the pool has any incoming workload our block heuristic will start
  * flushing metaslabs anyway.
  *
  * The point of this tunable is to be used in extreme cases where we really
  * want to flush more metaslabs than our adaptable heuristic plans to flush.
  */
 static uint64_t zfs_min_metaslabs_to_flush = 1;
 
 /*
  * Tunable that specifies how far in the past do we want to look when trying to
  * estimate the incoming log blocks for the current TXG.
  *
  * Setting this too high may not only increase runtime but also minimize the
  * effect of the incoming rates from the most recent TXGs as we take the
  * average over all the blocks that we walk
  * [see spa_estimate_incoming_log_blocks].
  */
 static uint64_t zfs_max_log_walking = 5;
 
 /*
  * This tunable exists solely for testing purposes. It ensures that the log
  * spacemaps are not flushed and destroyed during export in order for the
  * relevant log spacemap import code paths to be tested (effectively simulating
  * a crash).
  */
 int zfs_keep_log_spacemaps_at_export = 0;
 
 static uint64_t
 spa_estimate_incoming_log_blocks(spa_t *spa)
 {
 	ASSERT3U(spa_sync_pass(spa), ==, 1);
 	uint64_t steps = 0, sum = 0;
 	for (spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
 	    sls != NULL && steps < zfs_max_log_walking;
 	    sls = AVL_PREV(&spa->spa_sm_logs_by_txg, sls)) {
 		if (sls->sls_txg == spa_syncing_txg(spa)) {
 			/*
 			 * skip the log created in this TXG as this would
 			 * make our estimations inaccurate.
 			 */
 			continue;
 		}
 		sum += sls->sls_nblocks;
 		steps++;
 	}
 	return ((steps > 0) ? DIV_ROUND_UP(sum, steps) : 0);
 }
 
 uint64_t
 spa_log_sm_blocklimit(spa_t *spa)
 {
 	return (spa->spa_unflushed_stats.sus_blocklimit);
 }
 
 void
 spa_log_sm_set_blocklimit(spa_t *spa)
 {
 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
 		ASSERT0(spa_log_sm_blocklimit(spa));
 		return;
 	}
 
 	uint64_t msdcount = 0;
 	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
 	    e; e = list_next(&spa->spa_log_summary, e))
 		msdcount += e->lse_msdcount;
 
 	uint64_t limit = msdcount * zfs_unflushed_log_block_pct / 100;
 	spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(limit,
 	    zfs_unflushed_log_block_min), zfs_unflushed_log_block_max);
 }
 
 uint64_t
 spa_log_sm_nblocks(spa_t *spa)
 {
 	return (spa->spa_unflushed_stats.sus_nblocks);
 }
 
 /*
  * Ensure that the in-memory log space map structures and the summary
  * have the same block and metaslab counts.
  */
 static void
 spa_log_summary_verify_counts(spa_t *spa)
 {
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 	if ((zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) == 0)
 		return;
 
 	uint64_t ms_in_avl = avl_numnodes(&spa->spa_metaslabs_by_flushed);
 
 	uint64_t ms_in_summary = 0, blk_in_summary = 0;
 	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
 	    e; e = list_next(&spa->spa_log_summary, e)) {
 		ms_in_summary += e->lse_mscount;
 		blk_in_summary += e->lse_blkcount;
 	}
 
 	uint64_t ms_in_logs = 0, blk_in_logs = 0;
 	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
 	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
 		ms_in_logs += sls->sls_mscount;
 		blk_in_logs += sls->sls_nblocks;
 	}
 
 	VERIFY3U(ms_in_logs, ==, ms_in_summary);
 	VERIFY3U(ms_in_logs, ==, ms_in_avl);
 	VERIFY3U(blk_in_logs, ==, blk_in_summary);
 	VERIFY3U(blk_in_logs, ==, spa_log_sm_nblocks(spa));
 }
 
 static boolean_t
 summary_entry_is_full(spa_t *spa, log_summary_entry_t *e, uint64_t txg)
 {
 	if (e->lse_end == txg)
 		return (0);
 	if (e->lse_txgcount >= DIV_ROUND_UP(zfs_unflushed_log_txg_max,
 	    zfs_max_logsm_summary_length))
 		return (1);
 	uint64_t blocks_per_row = MAX(1,
 	    DIV_ROUND_UP(spa_log_sm_blocklimit(spa),
 	    zfs_max_logsm_summary_length));
 	return (blocks_per_row <= e->lse_blkcount);
 }
 
 /*
  * Update the log summary information to reflect the fact that a metaslab
  * was flushed or destroyed (e.g due to device removal or pool export/destroy).
  *
  * We typically flush the oldest flushed metaslab so the first (and oldest)
  * entry of the summary is updated. However if that metaslab is getting loaded
  * we may flush the second oldest one which may be part of an entry later in
  * the summary. Moreover, if we call into this function from metaslab_fini()
  * the metaslabs probably won't be ordered by ms_unflushed_txg. Thus we ask
  * for a txg as an argument so we can locate the appropriate summary entry for
  * the metaslab.
  */
 void
 spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg, boolean_t dirty)
 {
 	/*
 	 * We don't track summary data for read-only pools and this function
 	 * can be called from metaslab_fini(). In that case return immediately.
 	 */
 	if (!spa_writeable(spa))
 		return;
 
 	log_summary_entry_t *target = NULL;
 	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
 	    e != NULL; e = list_next(&spa->spa_log_summary, e)) {
 		if (e->lse_start > txg)
 			break;
 		target = e;
 	}
 
 	if (target == NULL || target->lse_mscount == 0) {
 		/*
 		 * We didn't find a summary entry for this metaslab. We must be
 		 * at the teardown of a spa_load() attempt that got an error
 		 * while reading the log space maps.
 		 */
 		VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
 		return;
 	}
 
 	target->lse_mscount--;
 	if (dirty)
 		target->lse_msdcount--;
 }
 
 /*
  * Update the log summary information to reflect the fact that we destroyed
  * old log space maps. Since we can only destroy the oldest log space maps,
  * we decrement the block count of the oldest summary entry and potentially
  * destroy it when that count hits 0.
  *
  * This function is called after a metaslab is flushed and typically that
  * metaslab is the oldest flushed, which means that this function will
  * typically decrement the block count of the first entry of the summary and
  * potentially free it if the block count gets to zero (its metaslab count
  * should be zero too at that point).
  *
  * There are certain scenarios though that don't work exactly like that so we
  * need to account for them:
  *
  * Scenario [1]: It is possible that after we flushed the oldest flushed
  * metaslab and we destroyed the oldest log space map, more recent logs had 0
  * metaslabs pointing to them so we got rid of them too. This can happen due
  * to metaslabs being destroyed through device removal, or because the oldest
  * flushed metaslab was loading but we kept flushing more recently flushed
  * metaslabs due to the memory pressure of unflushed changes. Because of that,
  * we always iterate from the beginning of the summary and if blocks_gone is
  * bigger than the block_count of the current entry we free that entry (we
  * expect its metaslab count to be zero), we decrement blocks_gone and on to
  * the next entry repeating this procedure until blocks_gone gets decremented
  * to 0. Doing this also works for the typical case mentioned above.
  *
  * Scenario [2]: The oldest flushed metaslab isn't necessarily accounted by
  * the first (and oldest) entry in the summary. If the first few entries of
  * the summary were only accounting metaslabs from a device that was just
  * removed, then the current oldest flushed metaslab could be accounted by an
  * entry somewhere in the middle of the summary. Moreover flushing that
  * metaslab will destroy all the log space maps older than its ms_unflushed_txg
  * because they became obsolete after the removal. Thus, iterating as we did
  * for scenario [1] works out for this case too.
  *
  * Scenario [3]: At times we decide to flush all the metaslabs in the pool
  * in one TXG (either because we are exporting the pool or because our flushing
  * heuristics decided to do so). When that happens all the log space maps get
  * destroyed except the one created for the current TXG which doesn't have
  * any log blocks yet. As log space maps get destroyed with every metaslab that
  * we flush, entries in the summary are also destroyed. This brings a weird
  * corner-case when we flush the last metaslab and the log space map of the
  * current TXG is in the same summary entry with other log space maps that
  * are older. When that happens we are eventually left with this one last
  * summary entry whose blocks are gone (blocks_gone equals the entry's block
  * count) but its metaslab count is non-zero (because it accounts all the
  * metaslabs in the pool as they all got flushed). Under this scenario we can't
  * free this last summary entry as it's referencing all the metaslabs in the
  * pool and its block count will get incremented at the end of this sync (when
  * we close the syncing log space map). Thus we just decrement its current
  * block count and leave it alone. In the case that the pool gets exported,
  * its metaslab count will be decremented over time as we call metaslab_fini()
  * for all the metaslabs in the pool and the entry will be freed at
  * spa_unload_log_sm_metadata().
  */
 void
 spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone)
 {
 	log_summary_entry_t *e = list_head(&spa->spa_log_summary);
 	ASSERT3P(e, !=, NULL);
 	if (e->lse_txgcount > 0)
 		e->lse_txgcount--;
 	for (; e != NULL; e = list_head(&spa->spa_log_summary)) {
 		if (e->lse_blkcount > blocks_gone) {
 			e->lse_blkcount -= blocks_gone;
 			blocks_gone = 0;
 			break;
 		} else if (e->lse_mscount == 0) {
 			/* remove obsolete entry */
 			blocks_gone -= e->lse_blkcount;
 			list_remove(&spa->spa_log_summary, e);
 			kmem_free(e, sizeof (log_summary_entry_t));
 		} else {
 			/* Verify that this is scenario [3] mentioned above. */
 			VERIFY3U(blocks_gone, ==, e->lse_blkcount);
 
 			/*
 			 * Assert that this is scenario [3] further by ensuring
 			 * that this is the only entry in the summary.
 			 */
 			VERIFY3P(e, ==, list_tail(&spa->spa_log_summary));
 			ASSERT3P(e, ==, list_head(&spa->spa_log_summary));
 
 			blocks_gone = e->lse_blkcount = 0;
 			break;
 		}
 	}
 
 	/*
 	 * Ensure that there is no way we are trying to remove more blocks
 	 * than the # of blocks in the summary.
 	 */
 	ASSERT0(blocks_gone);
 }
 
 void
 spa_log_sm_decrement_mscount(spa_t *spa, uint64_t txg)
 {
 	spa_log_sm_t target = { .sls_txg = txg };
 	spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
 	    &target, NULL);
 
 	if (sls == NULL) {
 		/*
 		 * We must be at the teardown of a spa_load() attempt that
 		 * got an error while reading the log space maps.
 		 */
 		VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
 		return;
 	}
 
 	ASSERT(sls->sls_mscount > 0);
 	sls->sls_mscount--;
 }
 
 void
 spa_log_sm_increment_current_mscount(spa_t *spa)
 {
 	spa_log_sm_t *last_sls = avl_last(&spa->spa_sm_logs_by_txg);
 	ASSERT3U(last_sls->sls_txg, ==, spa_syncing_txg(spa));
 	last_sls->sls_mscount++;
 }
 
 static void
 summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed,
     uint64_t metaslabs_dirty, uint64_t nblocks)
 {
 	log_summary_entry_t *e = list_tail(&spa->spa_log_summary);
 
 	if (e == NULL || summary_entry_is_full(spa, e, txg)) {
 		e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP);
 		e->lse_start = e->lse_end = txg;
 		e->lse_txgcount = 1;
 		list_insert_tail(&spa->spa_log_summary, e);
 	}
 
 	ASSERT3U(e->lse_start, <=, txg);
 	if (e->lse_end < txg) {
 		e->lse_end = txg;
 		e->lse_txgcount++;
 	}
 	e->lse_mscount += metaslabs_flushed;
 	e->lse_msdcount += metaslabs_dirty;
 	e->lse_blkcount += nblocks;
 }
 
 static void
 spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks)
 {
 	summary_add_data(spa, spa_syncing_txg(spa), 0, 0, nblocks);
 }
 
 void
 spa_log_summary_add_flushed_metaslab(spa_t *spa, boolean_t dirty)
 {
 	summary_add_data(spa, spa_syncing_txg(spa), 1, dirty ? 1 : 0, 0);
 }
 
 void
 spa_log_summary_dirty_flushed_metaslab(spa_t *spa, uint64_t txg)
 {
 	log_summary_entry_t *target = NULL;
 	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
 	    e != NULL; e = list_next(&spa->spa_log_summary, e)) {
 		if (e->lse_start > txg)
 			break;
 		target = e;
 	}
 	ASSERT3P(target, !=, NULL);
 	ASSERT3U(target->lse_mscount, !=, 0);
 	target->lse_msdcount++;
 }
 
 /*
  * This function attempts to estimate how many metaslabs should
  * we flush to satisfy our block heuristic for the log spacemap
  * for the upcoming TXGs.
  *
  * Specifically, it first tries to estimate the number of incoming
  * blocks in this TXG. Then by projecting that incoming rate to
  * future TXGs and using the log summary, it figures out how many
  * flushes we would need to do for future TXGs individually to
  * stay below our block limit and returns the maximum number of
  * flushes from those estimates.
  */
 static uint64_t
 spa_estimate_metaslabs_to_flush(spa_t *spa)
 {
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 	ASSERT3U(spa_sync_pass(spa), ==, 1);
 	ASSERT(spa_log_sm_blocklimit(spa) != 0);
 
 	/*
 	 * This variable contains the incoming rate that will be projected
 	 * and used for our flushing estimates in the future.
 	 */
 	uint64_t incoming = spa_estimate_incoming_log_blocks(spa);
 
 	/*
 	 * At any point in time this variable tells us how many
 	 * TXGs in the future we are so we can make our estimations.
 	 */
 	uint64_t txgs_in_future = 1;
 
 	/*
 	 * This variable tells us how much room do we have until we hit
 	 * our limit. When it goes negative, it means that we've exceeded
 	 * our limit and we need to flush.
 	 *
 	 * Note that since we start at the first TXG in the future (i.e.
 	 * txgs_in_future starts from 1) we already decrement this
 	 * variable by the incoming rate.
 	 */
 	int64_t available_blocks =
 	    spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming;
 
 	int64_t available_txgs = zfs_unflushed_log_txg_max;
 	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
 	    e; e = list_next(&spa->spa_log_summary, e))
 		available_txgs -= e->lse_txgcount;
 
 	/*
 	 * This variable tells us the total number of flushes needed to
 	 * keep the log size within the limit when we reach txgs_in_future.
 	 */
 	uint64_t total_flushes = 0;
 
 	/* Holds the current maximum of our estimates so far. */
 	uint64_t max_flushes_pertxg = zfs_min_metaslabs_to_flush;
 
 	/*
 	 * For our estimations we only look as far in the future
 	 * as the summary allows us.
 	 */
 	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
 	    e; e = list_next(&spa->spa_log_summary, e)) {
 
 		/*
 		 * If there is still room before we exceed our limit
 		 * then keep skipping TXGs accumulating more blocks
 		 * based on the incoming rate until we exceed it.
 		 */
 		if (available_blocks >= 0 && available_txgs >= 0) {
 			uint64_t skip_txgs = (incoming == 0) ?
 			    available_txgs + 1 : MIN(available_txgs + 1,
 			    (available_blocks / incoming) + 1);
 			available_blocks -= (skip_txgs * incoming);
 			available_txgs -= skip_txgs;
 			txgs_in_future += skip_txgs;
 			ASSERT3S(available_blocks, >=, -incoming);
 			ASSERT3S(available_txgs, >=, -1);
 		}
 
 		/*
 		 * At this point we're far enough into the future where
 		 * the limit was just exceeded and we flush metaslabs
 		 * based on the current entry in the summary, updating
 		 * our available_blocks.
 		 */
 		ASSERT(available_blocks < 0 || available_txgs < 0);
 		available_blocks += e->lse_blkcount;
 		available_txgs += e->lse_txgcount;
 		total_flushes += e->lse_msdcount;
 
 		/*
 		 * Keep the running maximum of the total_flushes that
 		 * we've done so far over the number of TXGs in the
 		 * future that we are. The idea here is to estimate
 		 * the average number of flushes that we should do
 		 * every TXG so that when we are that many TXGs in the
 		 * future we stay under the limit.
 		 */
 		max_flushes_pertxg = MAX(max_flushes_pertxg,
 		    DIV_ROUND_UP(total_flushes, txgs_in_future));
 	}
 	return (max_flushes_pertxg);
 }
 
 uint64_t
 spa_log_sm_memused(spa_t *spa)
 {
 	return (spa->spa_unflushed_stats.sus_memused);
 }
 
 static boolean_t
 spa_log_exceeds_memlimit(spa_t *spa)
 {
 	if (spa_log_sm_memused(spa) > zfs_unflushed_max_mem_amt)
 		return (B_TRUE);
 
 	uint64_t system_mem_allowed = ((physmem * PAGESIZE) *
 	    zfs_unflushed_max_mem_ppm) / 1000000;
 	if (spa_log_sm_memused(spa) > system_mem_allowed)
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 boolean_t
 spa_flush_all_logs_requested(spa_t *spa)
 {
 	return (spa->spa_log_flushall_txg != 0);
 }
 
 void
 spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
 {
 	uint64_t txg = dmu_tx_get_txg(tx);
 
 	if (spa_sync_pass(spa) != 1)
 		return;
 
 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
 		return;
 
 	/*
 	 * If we don't have any metaslabs with unflushed changes
 	 * return immediately.
 	 */
 	if (avl_numnodes(&spa->spa_metaslabs_by_flushed) == 0)
 		return;
 
 	/*
 	 * During SPA export we leave a few empty TXGs to go by [see
 	 * spa_final_dirty_txg() to understand why]. For this specific
 	 * case, it is important to not flush any metaslabs as that
 	 * would dirty this TXG.
 	 *
 	 * That said, during one of these dirty TXGs that is less or
 	 * equal to spa_final_dirty(), spa_unload() will request that
 	 * we try to flush all the metaslabs for that TXG before
 	 * exporting the pool, thus we ensure that we didn't get a
 	 * request of flushing everything before we attempt to return
 	 * immediately.
 	 */
-	if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+	if (BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg &&
 	    !dmu_objset_is_dirty(spa_meta_objset(spa), txg) &&
 	    !spa_flush_all_logs_requested(spa))
 		return;
 
 	/*
 	 * We need to generate a log space map before flushing because this
 	 * will set up the in-memory data (i.e. node in spa_sm_logs_by_txg)
 	 * for this TXG's flushed metaslab count (aka sls_mscount which is
 	 * manipulated in many ways down the metaslab_flush() codepath).
 	 *
 	 * That is not to say that we may generate a log space map when we
 	 * don't need it. If we are flushing metaslabs, that means that we
 	 * were going to write changes to disk anyway, so even if we were
 	 * not flushing, a log space map would have been created anyway in
 	 * metaslab_sync().
 	 */
 	spa_generate_syncing_log_sm(spa, tx);
 
 	/*
 	 * This variable tells us how many metaslabs we want to flush based
 	 * on the block-heuristic of our flushing algorithm (see block comment
 	 * of log space map feature). We also decrement this as we flush
 	 * metaslabs and attempt to destroy old log space maps.
 	 */
 	uint64_t want_to_flush;
 	if (spa_flush_all_logs_requested(spa)) {
 		ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
 		want_to_flush = UINT64_MAX;
 	} else {
 		want_to_flush = spa_estimate_metaslabs_to_flush(spa);
 	}
 
 	/* Used purely for verification purposes */
 	uint64_t visited = 0;
 
 	/*
 	 * Ideally we would only iterate through spa_metaslabs_by_flushed
 	 * using only one variable (curr). We can't do that because
 	 * metaslab_flush() mutates position of curr in the AVL when
 	 * it flushes that metaslab by moving it to the end of the tree.
 	 * Thus we always keep track of the original next node of the
 	 * current node (curr) in another variable (next).
 	 */
 	metaslab_t *next = NULL;
 	for (metaslab_t *curr = avl_first(&spa->spa_metaslabs_by_flushed);
 	    curr != NULL; curr = next) {
 		next = AVL_NEXT(&spa->spa_metaslabs_by_flushed, curr);
 
 		/*
 		 * If this metaslab has been flushed this txg then we've done
 		 * a full circle over the metaslabs.
 		 */
 		if (metaslab_unflushed_txg(curr) == txg)
 			break;
 
 		/*
 		 * If we are done flushing for the block heuristic and the
 		 * unflushed changes don't exceed the memory limit just stop.
 		 */
 		if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa))
 			break;
 
 		if (metaslab_unflushed_dirty(curr)) {
 			mutex_enter(&curr->ms_sync_lock);
 			mutex_enter(&curr->ms_lock);
 			metaslab_flush(curr, tx);
 			mutex_exit(&curr->ms_lock);
 			mutex_exit(&curr->ms_sync_lock);
 			if (want_to_flush > 0)
 				want_to_flush--;
 		} else
 			metaslab_unflushed_bump(curr, tx, B_FALSE);
 
 		visited++;
 	}
 	ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited);
 
 	spa_log_sm_set_blocklimit(spa);
 }
 
 /*
  * Close the log space map for this TXG and update the block counts
  * for the log's in-memory structure and the summary.
  */
 void
 spa_sync_close_syncing_log_sm(spa_t *spa)
 {
 	if (spa_syncing_log_sm(spa) == NULL)
 		return;
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 	spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
 	ASSERT3U(sls->sls_txg, ==, spa_syncing_txg(spa));
 
 	sls->sls_nblocks = space_map_nblocks(spa_syncing_log_sm(spa));
 	spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
 
 	/*
 	 * Note that we can't assert that sls_mscount is not 0,
 	 * because there is the case where the first metaslab
 	 * in spa_metaslabs_by_flushed is loading and we were
 	 * not able to flush any metaslabs the current TXG.
 	 */
 	ASSERT(sls->sls_nblocks != 0);
 
 	spa_log_summary_add_incoming_blocks(spa, sls->sls_nblocks);
 	spa_log_summary_verify_counts(spa);
 
 	space_map_close(spa->spa_syncing_log_sm);
 	spa->spa_syncing_log_sm = NULL;
 
 	/*
 	 * At this point we tried to flush as many metaslabs as we
 	 * can as the pool is getting exported. Reset the "flush all"
 	 * so the last few TXGs before closing the pool can be empty
 	 * (e.g. not dirty).
 	 */
 	if (spa_flush_all_logs_requested(spa)) {
 		ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
 		spa->spa_log_flushall_txg = 0;
 	}
 }
 
 void
 spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx)
 {
 	objset_t *mos = spa_meta_objset(spa);
 
 	uint64_t spacemap_zap;
 	int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
 	if (error == ENOENT) {
 		ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
 		return;
 	}
 	VERIFY0(error);
 
 	metaslab_t *oldest = avl_first(&spa->spa_metaslabs_by_flushed);
 	uint64_t oldest_flushed_txg = metaslab_unflushed_txg(oldest);
 
 	/* Free all log space maps older than the oldest_flushed_txg. */
 	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
 	    sls && sls->sls_txg < oldest_flushed_txg;
 	    sls = avl_first(&spa->spa_sm_logs_by_txg)) {
 		ASSERT0(sls->sls_mscount);
 		avl_remove(&spa->spa_sm_logs_by_txg, sls);
 		space_map_free_obj(mos, sls->sls_sm_obj, tx);
 		VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx));
 		spa_log_summary_decrement_blkcount(spa, sls->sls_nblocks);
 		spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks;
 		kmem_free(sls, sizeof (spa_log_sm_t));
 	}
 }
 
 static spa_log_sm_t *
 spa_log_sm_alloc(uint64_t sm_obj, uint64_t txg)
 {
 	spa_log_sm_t *sls = kmem_zalloc(sizeof (*sls), KM_SLEEP);
 	sls->sls_sm_obj = sm_obj;
 	sls->sls_txg = txg;
 	return (sls);
 }
 
 void
 spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx)
 {
 	uint64_t txg = dmu_tx_get_txg(tx);
 	objset_t *mos = spa_meta_objset(spa);
 
 	if (spa_syncing_log_sm(spa) != NULL)
 		return;
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP))
 		return;
 
 	uint64_t spacemap_zap;
 	int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
 	if (error == ENOENT) {
 		ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
 
 		error = 0;
 		spacemap_zap = zap_create(mos,
 		    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
 		VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1,
 		    &spacemap_zap, tx));
 		spa_feature_incr(spa, SPA_FEATURE_LOG_SPACEMAP, tx);
 	}
 	VERIFY0(error);
 
 	uint64_t sm_obj;
 	ASSERT3U(zap_lookup_int_key(mos, spacemap_zap, txg, &sm_obj),
 	    ==, ENOENT);
 	sm_obj = space_map_alloc(mos, zfs_log_sm_blksz, tx);
 	VERIFY0(zap_add_int_key(mos, spacemap_zap, txg, sm_obj, tx));
 	avl_add(&spa->spa_sm_logs_by_txg, spa_log_sm_alloc(sm_obj, txg));
 
 	/*
 	 * We pass UINT64_MAX as the space map's representation size
 	 * and SPA_MINBLOCKSHIFT as the shift, to make the space map
 	 * accept any sorts of segments since there's no real advantage
 	 * to being more restrictive (given that we're already going
 	 * to be using 2-word entries).
 	 */
 	VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj,
 	    0, UINT64_MAX, SPA_MINBLOCKSHIFT));
 
 	spa_log_sm_set_blocklimit(spa);
 }
 
 /*
  * Find all the log space maps stored in the space map ZAP and sort
  * them by their TXG in spa_sm_logs_by_txg.
  */
 static int
 spa_ld_log_sm_metadata(spa_t *spa)
 {
 	int error;
 	uint64_t spacemap_zap;
 
 	ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
 
 	error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
 	if (error == ENOENT) {
 		/* the space map ZAP doesn't exist yet */
 		return (0);
 	} else if (error != 0) {
 		spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
 		    "zap_lookup(DMU_POOL_DIRECTORY_OBJECT) [error %d]",
 		    error);
 		return (error);
 	}
 
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	for (zap_cursor_init(&zc, spa_meta_objset(spa), spacemap_zap);
 	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
 	    zap_cursor_advance(&zc)) {
 		uint64_t log_txg = zfs_strtonum(za.za_name, NULL);
 		spa_log_sm_t *sls =
 		    spa_log_sm_alloc(za.za_first_integer, log_txg);
 		avl_add(&spa->spa_sm_logs_by_txg, sls);
 	}
 	zap_cursor_fini(&zc);
 	if (error != ENOENT) {
 		spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
 		    "zap_cursor_retrieve(spacemap_zap) [error %d]",
 		    error);
 		return (error);
 	}
 
 	for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
 	    m; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
 		spa_log_sm_t target = { .sls_txg = metaslab_unflushed_txg(m) };
 		spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
 		    &target, NULL);
 
 		/*
 		 * At this point if sls is zero it means that a bug occurred
 		 * in ZFS the last time the pool was open or earlier in the
 		 * import code path. In general, we would have placed a
 		 * VERIFY() here or in this case just let the kernel panic
 		 * with NULL pointer dereference when incrementing sls_mscount,
 		 * but since this is the import code path we can be a bit more
 		 * lenient. Thus, for DEBUG bits we always cause a panic, while
 		 * in production we log the error and just fail the import.
 		 */
 		ASSERT(sls != NULL);
 		if (sls == NULL) {
 			spa_load_failed(spa, "spa_ld_log_sm_metadata(): bug "
 			    "encountered: could not find log spacemap for "
 			    "TXG %llu [error %d]",
 			    (u_longlong_t)metaslab_unflushed_txg(m), ENOENT);
 			return (ENOENT);
 		}
 		sls->sls_mscount++;
 	}
 
 	return (0);
 }
 
 typedef struct spa_ld_log_sm_arg {
 	spa_t *slls_spa;
 	uint64_t slls_txg;
 } spa_ld_log_sm_arg_t;
 
 static int
 spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg)
 {
 	uint64_t offset = sme->sme_offset;
 	uint64_t size = sme->sme_run;
 	uint32_t vdev_id = sme->sme_vdev;
 
 	spa_ld_log_sm_arg_t *slls = arg;
 	spa_t *spa = slls->slls_spa;
 
 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
 
 	/*
 	 * If the vdev has been removed (i.e. it is indirect or a hole)
 	 * skip this entry. The contents of this vdev have already moved
 	 * elsewhere.
 	 */
 	if (!vdev_is_concrete(vd))
 		return (0);
 
 	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 	ASSERT(!ms->ms_loaded);
 
 	/*
 	 * If we have already flushed entries for this TXG to this
 	 * metaslab's space map, then ignore it. Note that we flush
 	 * before processing any allocations/frees for that TXG, so
 	 * the metaslab's space map only has entries from *before*
 	 * the unflushed TXG.
 	 */
 	if (slls->slls_txg < metaslab_unflushed_txg(ms))
 		return (0);
 
 	switch (sme->sme_type) {
 	case SM_ALLOC:
 		range_tree_remove_xor_add_segment(offset, offset + size,
 		    ms->ms_unflushed_frees, ms->ms_unflushed_allocs);
 		break;
 	case SM_FREE:
 		range_tree_remove_xor_add_segment(offset, offset + size,
 		    ms->ms_unflushed_allocs, ms->ms_unflushed_frees);
 		break;
 	default:
 		panic("invalid maptype_t");
 		break;
 	}
 	if (!metaslab_unflushed_dirty(ms)) {
 		metaslab_set_unflushed_dirty(ms, B_TRUE);
 		spa_log_summary_dirty_flushed_metaslab(spa,
 		    metaslab_unflushed_txg(ms));
 	}
 	return (0);
 }
 
 static int
 spa_ld_log_sm_data(spa_t *spa)
 {
 	spa_log_sm_t *sls, *psls;
 	int error = 0;
 
 	/*
 	 * If we are not going to do any writes there is no need
 	 * to read the log space maps.
 	 */
 	if (!spa_writeable(spa))
 		return (0);
 
 	ASSERT0(spa->spa_unflushed_stats.sus_nblocks);
 	ASSERT0(spa->spa_unflushed_stats.sus_memused);
 
 	hrtime_t read_logs_starttime = gethrtime();
 
 	/* Prefetch log spacemaps dnodes. */
 	for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls;
 	    sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
 		dmu_prefetch_dnode(spa_meta_objset(spa), sls->sls_sm_obj,
 		    ZIO_PRIORITY_SYNC_READ);
 	}
 
 	uint_t pn = 0;
 	uint64_t ps = 0;
 	uint64_t nsm = 0;
 	psls = sls = avl_first(&spa->spa_sm_logs_by_txg);
 	while (sls != NULL) {
 		/* Prefetch log spacemaps up to 16 TXGs or MBs ahead. */
 		if (psls != NULL && pn < 16 &&
 		    (pn < 2 || ps < 2 * dmu_prefetch_max)) {
 			error = space_map_open(&psls->sls_sm,
 			    spa_meta_objset(spa), psls->sls_sm_obj, 0,
 			    UINT64_MAX, SPA_MINBLOCKSHIFT);
 			if (error != 0) {
 				spa_load_failed(spa, "spa_ld_log_sm_data(): "
 				    "failed at space_map_open(obj=%llu) "
 				    "[error %d]",
 				    (u_longlong_t)sls->sls_sm_obj, error);
 				goto out;
 			}
 			dmu_prefetch(spa_meta_objset(spa), psls->sls_sm_obj,
 			    0, 0, space_map_length(psls->sls_sm),
 			    ZIO_PRIORITY_ASYNC_READ);
 			pn++;
 			ps += space_map_length(psls->sls_sm);
 			psls = AVL_NEXT(&spa->spa_sm_logs_by_txg, psls);
 			continue;
 		}
 
 		/* Load TXG log spacemap into ms_unflushed_allocs/frees. */
 		kpreempt(KPREEMPT_SYNC);
 		ASSERT0(sls->sls_nblocks);
 		sls->sls_nblocks = space_map_nblocks(sls->sls_sm);
 		spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
 		summary_add_data(spa, sls->sls_txg,
 		    sls->sls_mscount, 0, sls->sls_nblocks);
 
 		spa_import_progress_set_notes_nolog(spa,
 		    "Read %llu of %lu log space maps", (u_longlong_t)nsm,
 		    avl_numnodes(&spa->spa_sm_logs_by_txg));
 
 		struct spa_ld_log_sm_arg vla = {
 			.slls_spa = spa,
 			.slls_txg = sls->sls_txg
 		};
 		error = space_map_iterate(sls->sls_sm,
 		    space_map_length(sls->sls_sm), spa_ld_log_sm_cb, &vla);
 		if (error != 0) {
 			spa_load_failed(spa, "spa_ld_log_sm_data(): failed "
 			    "at space_map_iterate(obj=%llu) [error %d]",
 			    (u_longlong_t)sls->sls_sm_obj, error);
 			goto out;
 		}
 
 		pn--;
 		ps -= space_map_length(sls->sls_sm);
 		nsm++;
 		space_map_close(sls->sls_sm);
 		sls->sls_sm = NULL;
 		sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls);
 
 		/* Update log block limits considering just loaded. */
 		spa_log_sm_set_blocklimit(spa);
 	}
 
 	hrtime_t read_logs_endtime = gethrtime();
 	spa_load_note(spa,
 	    "Read %lu log space maps (%llu total blocks - blksz = %llu bytes) "
 	    "in %lld ms", avl_numnodes(&spa->spa_sm_logs_by_txg),
 	    (u_longlong_t)spa_log_sm_nblocks(spa),
 	    (u_longlong_t)zfs_log_sm_blksz,
 	    (longlong_t)NSEC2MSEC(read_logs_endtime - read_logs_starttime));
 
 out:
 	if (error != 0) {
 		for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
 		    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
 			if (sls->sls_sm) {
 				space_map_close(sls->sls_sm);
 				sls->sls_sm = NULL;
 			}
 		}
 	} else {
 		ASSERT0(pn);
 		ASSERT0(ps);
 	}
 	/*
 	 * Now that the metaslabs contain their unflushed changes:
 	 * [1] recalculate their actual allocated space
 	 * [2] recalculate their weights
 	 * [3] sum up the memory usage of their unflushed range trees
 	 * [4] optionally load them, if debug_load is set
 	 *
 	 * Note that even in the case where we get here because of an
 	 * error (e.g. error != 0), we still want to update the fields
 	 * below in order to have a proper teardown in spa_unload().
 	 */
 	for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
 	    m != NULL; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
 		mutex_enter(&m->ms_lock);
 		m->ms_allocated_space = space_map_allocated(m->ms_sm) +
 		    range_tree_space(m->ms_unflushed_allocs) -
 		    range_tree_space(m->ms_unflushed_frees);
 
 		vdev_t *vd = m->ms_group->mg_vd;
 		metaslab_space_update(vd, m->ms_group->mg_class,
 		    range_tree_space(m->ms_unflushed_allocs), 0, 0);
 		metaslab_space_update(vd, m->ms_group->mg_class,
 		    -range_tree_space(m->ms_unflushed_frees), 0, 0);
 
 		ASSERT0(m->ms_weight & METASLAB_ACTIVE_MASK);
 		metaslab_recalculate_weight_and_sort(m);
 
 		spa->spa_unflushed_stats.sus_memused +=
 		    metaslab_unflushed_changes_memused(m);
 
 		if (metaslab_debug_load && m->ms_sm != NULL) {
 			VERIFY0(metaslab_load(m));
 			metaslab_set_selected_txg(m, 0);
 		}
 		mutex_exit(&m->ms_lock);
 	}
 
 	return (error);
 }
 
 static int
 spa_ld_unflushed_txgs(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa_meta_objset(spa);
 
 	if (vd->vdev_top_zap == 0)
 		return (0);
 
 	uint64_t object = 0;
 	int error = zap_lookup(mos, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
 	    sizeof (uint64_t), 1, &object);
 	if (error == ENOENT)
 		return (0);
 	else if (error != 0) {
 		spa_load_failed(spa, "spa_ld_unflushed_txgs(): failed at "
 		    "zap_lookup(vdev_top_zap=%llu) [error %d]",
 		    (u_longlong_t)vd->vdev_top_zap, error);
 		return (error);
 	}
 
 	for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 		metaslab_t *ms = vd->vdev_ms[m];
 		ASSERT(ms != NULL);
 
 		metaslab_unflushed_phys_t entry;
 		uint64_t entry_size = sizeof (entry);
 		uint64_t entry_offset = ms->ms_id * entry_size;
 
 		error = dmu_read(mos, object,
 		    entry_offset, entry_size, &entry, 0);
 		if (error != 0) {
 			spa_load_failed(spa, "spa_ld_unflushed_txgs(): "
 			    "failed at dmu_read(obj=%llu) [error %d]",
 			    (u_longlong_t)object, error);
 			return (error);
 		}
 
 		ms->ms_unflushed_txg = entry.msp_unflushed_txg;
 		ms->ms_unflushed_dirty = B_FALSE;
 		ASSERT(range_tree_is_empty(ms->ms_unflushed_allocs));
 		ASSERT(range_tree_is_empty(ms->ms_unflushed_frees));
 		if (ms->ms_unflushed_txg != 0) {
 			mutex_enter(&spa->spa_flushed_ms_lock);
 			avl_add(&spa->spa_metaslabs_by_flushed, ms);
 			mutex_exit(&spa->spa_flushed_ms_lock);
 		}
 	}
 	return (0);
 }
 
 /*
  * Read all the log space map entries into their respective
  * metaslab unflushed trees and keep them sorted by TXG in the
  * SPA's metadata. In addition, setup all the metadata for the
  * memory and the block heuristics.
  */
 int
 spa_ld_log_spacemaps(spa_t *spa)
 {
 	int error;
 
 	spa_log_sm_set_blocklimit(spa);
 
 	for (uint64_t c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
 		vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
 		error = spa_ld_unflushed_txgs(vd);
 		if (error != 0)
 			return (error);
 	}
 
 	error = spa_ld_log_sm_metadata(spa);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Note: we don't actually expect anything to change at this point
 	 * but we grab the config lock so we don't fail any assertions
 	 * when using vdev_lookup_top().
 	 */
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	error = spa_ld_log_sm_data(spa);
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	return (error);
 }
 
 /* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, U64, ZMOD_RW,
 	"Specific hard-limit in memory that ZFS allows to be used for "
 	"unflushed changes");
 
 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_ppm, U64, ZMOD_RW,
 	"Percentage of the overall system memory that ZFS allows to be "
 	"used for unflushed changes (value is calculated over 1000000 for "
 	"finer granularity)");
 
 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_max, U64, ZMOD_RW,
 	"Hard limit (upper-bound) in the size of the space map log "
 	"in terms of blocks.");
 
 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, U64, ZMOD_RW,
 	"Lower-bound limit for the maximum amount of blocks allowed in "
 	"log spacemap (see zfs_unflushed_log_block_max)");
 
 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_txg_max, U64, ZMOD_RW,
     "Hard limit (upper-bound) in the size of the space map log "
     "in terms of dirty TXGs.");
 
 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, UINT, ZMOD_RW,
 	"Tunable used to determine the number of blocks that can be used for "
 	"the spacemap log, expressed as a percentage of the total number of "
 	"metaslabs in the pool (e.g. 400 means the number of log blocks is "
 	"capped at 4 times the number of metaslabs)");
 
 ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, U64, ZMOD_RW,
 	"The number of past TXGs that the flushing algorithm of the log "
 	"spacemap feature uses to estimate incoming log blocks");
 
 ZFS_MODULE_PARAM(zfs, zfs_, keep_log_spacemaps_at_export, INT, ZMOD_RW,
 	"Prevent the log spacemaps from being flushed and destroyed "
 	"during pool export/destroy");
 /* END CSTYLED */
 
 ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, U64, ZMOD_RW,
 	"Maximum number of rows allowed in the summary of the spacemap log");
 
 ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush, U64, ZMOD_RW,
 	"Minimum number of metaslabs to flush per dirty TXG");
diff --git a/module/zfs/uberblock.c b/module/zfs/uberblock.c
index 1921be107660..22ee8036c473 100644
--- a/module/zfs/uberblock.c
+++ b/module/zfs/uberblock.c
@@ -1,74 +1,74 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/uberblock_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/mmp.h>
 
 int
 uberblock_verify(uberblock_t *ub)
 {
 	if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC))
 		byteswap_uint64_array(ub, sizeof (uberblock_t));
 
 	if (ub->ub_magic != UBERBLOCK_MAGIC)
 		return (SET_ERROR(EINVAL));
 
 	return (0);
 }
 
 /*
  * Update the uberblock and return TRUE if anything changed in this
  * transaction group.
  */
 boolean_t
 uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, uint64_t mmp_delay)
 {
 	ASSERT(ub->ub_txg < txg);
 
 	/*
 	 * We explicitly do not set ub_version here, so that older versions
 	 * continue to be written with the previous uberblock version.
 	 */
 	ub->ub_magic = UBERBLOCK_MAGIC;
 	ub->ub_txg = txg;
 	ub->ub_guid_sum = rvd->vdev_guid_sum;
 	ub->ub_timestamp = gethrestime_sec();
 	ub->ub_software_version = SPA_VERSION;
 	ub->ub_mmp_magic = MMP_MAGIC;
 	if (spa_multihost(rvd->vdev_spa)) {
 		ub->ub_mmp_delay = mmp_delay;
 		ub->ub_mmp_config = MMP_SEQ_SET(0) |
 		    MMP_INTERVAL_SET(zfs_multihost_interval) |
 		    MMP_FAIL_INT_SET(zfs_multihost_fail_intervals);
 	} else {
 		ub->ub_mmp_delay = 0;
 		ub->ub_mmp_config = 0;
 	}
 	ub->ub_checkpoint_txg = 0;
 
-	return (ub->ub_rootbp.blk_birth == txg);
+	return (BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp) == txg);
 }
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index f9a01c9f53f4..102eacb03349 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -1,1040 +1,1040 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_scan.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_draid.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/abd.h>
 #include <sys/fs/zfs.h>
 
 /*
  * Vdev mirror kstats
  */
 static kstat_t *mirror_ksp = NULL;
 
 typedef struct mirror_stats {
 	kstat_named_t vdev_mirror_stat_rotating_linear;
 	kstat_named_t vdev_mirror_stat_rotating_offset;
 	kstat_named_t vdev_mirror_stat_rotating_seek;
 	kstat_named_t vdev_mirror_stat_non_rotating_linear;
 	kstat_named_t vdev_mirror_stat_non_rotating_seek;
 
 	kstat_named_t vdev_mirror_stat_preferred_found;
 	kstat_named_t vdev_mirror_stat_preferred_not_found;
 } mirror_stats_t;
 
 static mirror_stats_t mirror_stats = {
 	/* New I/O follows directly the last I/O */
 	{ "rotating_linear",			KSTAT_DATA_UINT64 },
 	/* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */
 	{ "rotating_offset",			KSTAT_DATA_UINT64 },
 	/* New I/O requires random seek */
 	{ "rotating_seek",			KSTAT_DATA_UINT64 },
 	/* New I/O follows directly the last I/O  (nonrot) */
 	{ "non_rotating_linear",		KSTAT_DATA_UINT64 },
 	/* New I/O requires random seek (nonrot) */
 	{ "non_rotating_seek",			KSTAT_DATA_UINT64 },
 	/* Preferred child vdev found */
 	{ "preferred_found",			KSTAT_DATA_UINT64 },
 	/* Preferred child vdev not found or equal load  */
 	{ "preferred_not_found",		KSTAT_DATA_UINT64 },
 
 };
 
 #define	MIRROR_STAT(stat)		(mirror_stats.stat.value.ui64)
 #define	MIRROR_INCR(stat, val) 		atomic_add_64(&MIRROR_STAT(stat), val)
 #define	MIRROR_BUMP(stat)		MIRROR_INCR(stat, 1)
 
 void
 vdev_mirror_stat_init(void)
 {
 	mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats",
 	    "misc", KSTAT_TYPE_NAMED,
 	    sizeof (mirror_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 	if (mirror_ksp != NULL) {
 		mirror_ksp->ks_data = &mirror_stats;
 		kstat_install(mirror_ksp);
 	}
 }
 
 void
 vdev_mirror_stat_fini(void)
 {
 	if (mirror_ksp != NULL) {
 		kstat_delete(mirror_ksp);
 		mirror_ksp = NULL;
 	}
 }
 
 /*
  * Virtual device vector for mirroring.
  */
 typedef struct mirror_child {
 	vdev_t		*mc_vd;
 	abd_t		*mc_abd;
 	uint64_t	mc_offset;
 	int		mc_error;
 	int		mc_load;
 	uint8_t		mc_tried;
 	uint8_t		mc_skipped;
 	uint8_t		mc_speculative;
 	uint8_t		mc_rebuilding;
 } mirror_child_t;
 
 typedef struct mirror_map {
 	int		*mm_preferred;
 	int		mm_preferred_cnt;
 	int		mm_children;
 	boolean_t	mm_resilvering;
 	boolean_t	mm_rebuilding;
 	boolean_t	mm_root;
 	mirror_child_t	mm_child[];
 } mirror_map_t;
 
 static const int vdev_mirror_shift = 21;
 
 /*
  * The load configuration settings below are tuned by default for
  * the case where all devices are of the same rotational type.
  *
  * If there is a mixture of rotating and non-rotating media, setting
  * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results
  * as it will direct more reads to the non-rotating vdevs which are more likely
  * to have a higher performance.
  */
 
 /* Rotating media load calculation configuration. */
 static int zfs_vdev_mirror_rotating_inc = 0;
 static int zfs_vdev_mirror_rotating_seek_inc = 5;
 static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024;
 
 /* Non-rotating media load calculation configuration. */
 static int zfs_vdev_mirror_non_rotating_inc = 0;
 static int zfs_vdev_mirror_non_rotating_seek_inc = 1;
 
 static inline size_t
 vdev_mirror_map_size(int children)
 {
 	return (offsetof(mirror_map_t, mm_child[children]) +
 	    sizeof (int) * children);
 }
 
 static inline mirror_map_t *
 vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
 {
 	mirror_map_t *mm;
 
 	mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
 	mm->mm_children = children;
 	mm->mm_resilvering = resilvering;
 	mm->mm_root = root;
 	mm->mm_preferred = (int *)((uintptr_t)mm +
 	    offsetof(mirror_map_t, mm_child[children]));
 
 	return (mm);
 }
 
 static void
 vdev_mirror_map_free(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
 
 	kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
 }
 
 static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
 	.vsd_free = vdev_mirror_map_free,
 };
 
 static int
 vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
 {
 	uint64_t last_offset;
 	int64_t offset_diff;
 	int load;
 
 	/* All DVAs have equal weight at the root. */
 	if (mm->mm_root)
 		return (INT_MAX);
 
 	/*
 	 * We don't return INT_MAX if the device is resilvering i.e.
 	 * vdev_resilver_txg != 0 as when tested performance was slightly
 	 * worse overall when resilvering with compared to without.
 	 */
 
 	/* Fix zio_offset for leaf vdevs */
 	if (vd->vdev_ops->vdev_op_leaf)
 		zio_offset += VDEV_LABEL_START_SIZE;
 
 	/* Standard load based on pending queue length. */
 	load = vdev_queue_length(vd);
 	last_offset = vdev_queue_last_offset(vd);
 
 	if (vd->vdev_nonrot) {
 		/* Non-rotating media. */
 		if (last_offset == zio_offset) {
 			MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear);
 			return (load + zfs_vdev_mirror_non_rotating_inc);
 		}
 
 		/*
 		 * Apply a seek penalty even for non-rotating devices as
 		 * sequential I/O's can be aggregated into fewer operations on
 		 * the device, thus avoiding unnecessary per-command overhead
 		 * and boosting performance.
 		 */
 		MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek);
 		return (load + zfs_vdev_mirror_non_rotating_seek_inc);
 	}
 
 	/* Rotating media I/O's which directly follow the last I/O. */
 	if (last_offset == zio_offset) {
 		MIRROR_BUMP(vdev_mirror_stat_rotating_linear);
 		return (load + zfs_vdev_mirror_rotating_inc);
 	}
 
 	/*
 	 * Apply half the seek increment to I/O's within seek offset
 	 * of the last I/O issued to this vdev as they should incur less
 	 * of a seek increment.
 	 */
 	offset_diff = (int64_t)(last_offset - zio_offset);
 	if (ABS(offset_diff) < zfs_vdev_mirror_rotating_seek_offset) {
 		MIRROR_BUMP(vdev_mirror_stat_rotating_offset);
 		return (load + (zfs_vdev_mirror_rotating_seek_inc / 2));
 	}
 
 	/* Apply the full seek increment to all other I/O's. */
 	MIRROR_BUMP(vdev_mirror_stat_rotating_seek);
 	return (load + zfs_vdev_mirror_rotating_seek_inc);
 }
 
 static boolean_t
 vdev_mirror_rebuilding(vdev_t *vd)
 {
 	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg)
 		return (B_TRUE);
 
 	for (int i = 0; i < vd->vdev_children; i++) {
 		if (vdev_mirror_rebuilding(vd->vdev_child[i])) {
 			return (B_TRUE);
 		}
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Avoid inlining the function to keep vdev_mirror_io_start(), which
  * is this functions only caller, as small as possible on the stack.
  */
 noinline static mirror_map_t *
 vdev_mirror_map_init(zio_t *zio)
 {
 	mirror_map_t *mm = NULL;
 	mirror_child_t *mc;
 	vdev_t *vd = zio->io_vd;
 	int c;
 
 	if (vd == NULL) {
 		dva_t *dva = zio->io_bp->blk_dva;
 		spa_t *spa = zio->io_spa;
 		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 		dva_t dva_copy[SPA_DVAS_PER_BP];
 
 		/*
 		 * The sequential scrub code sorts and issues all DVAs
 		 * of a bp separately. Each of these IOs includes all
 		 * original DVA copies so that repairs can be performed
 		 * in the event of an error, but we only actually want
 		 * to check the first DVA since the others will be
 		 * checked by their respective sorted IOs. Only if we
 		 * hit an error will we try all DVAs upon retrying.
 		 *
 		 * Note: This check is safe even if the user switches
 		 * from a legacy scrub to a sequential one in the middle
 		 * of processing, since scn_is_sorted isn't updated until
 		 * all outstanding IOs from the previous scrub pass
 		 * complete.
 		 */
 		if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
 		    !(zio->io_flags & ZIO_FLAG_IO_RETRY) &&
 		    dsl_scan_scrubbing(spa->spa_dsl_pool) &&
 		    scn->scn_is_sorted) {
 			c = 1;
 		} else {
 			c = BP_GET_NDVAS(zio->io_bp);
 		}
 
 		/*
 		 * If the pool cannot be written to, then infer that some
 		 * DVAs might be invalid or point to vdevs that do not exist.
 		 * We skip them.
 		 */
 		if (!spa_writeable(spa)) {
 			ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 			int j = 0;
 			for (int i = 0; i < c; i++) {
 				if (zfs_dva_valid(spa, &dva[i], zio->io_bp))
 					dva_copy[j++] = dva[i];
 			}
 			if (j == 0) {
 				zio->io_vsd = NULL;
 				zio->io_error = ENXIO;
 				return (NULL);
 			}
 			if (j < c) {
 				dva = dva_copy;
 				c = j;
 			}
 		}
 
 		mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE);
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
 
 			mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
 			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
 			if (mc->mc_vd == NULL) {
 				kmem_free(mm, vdev_mirror_map_size(
 				    mm->mm_children));
 				zio->io_vsd = NULL;
 				zio->io_error = ENXIO;
 				return (NULL);
 			}
 		}
 	} else {
 		/*
 		 * If we are resilvering, then we should handle scrub reads
 		 * differently; we shouldn't issue them to the resilvering
 		 * device because it might not have those blocks.
 		 *
 		 * We are resilvering iff:
 		 * 1) We are a replacing vdev (ie our name is "replacing-1" or
 		 *    "spare-1" or something like that), and
 		 * 2) The pool is currently being resilvered.
 		 *
 		 * We cannot simply check vd->vdev_resilver_txg, because it's
 		 * not set in this path.
 		 *
 		 * Nor can we just check our vdev_ops; there are cases (such as
 		 * when a user types "zpool replace pool odev spare_dev" and
 		 * spare_dev is in the spare list, or when a spare device is
 		 * automatically used to replace a DEGRADED device) when
 		 * resilvering is complete but both the original vdev and the
 		 * spare vdev remain in the pool.  That behavior is intentional.
 		 * It helps implement the policy that a spare should be
 		 * automatically removed from the pool after the user replaces
 		 * the device that originally failed.
 		 *
 		 * If a spa load is in progress, then spa_dsl_pool may be
 		 * uninitialized.  But we shouldn't be resilvering during a spa
 		 * load anyway.
 		 */
 		boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
 		    vd->vdev_ops == &vdev_spare_ops) &&
 		    spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
 		    dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
 		mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
 		    B_FALSE);
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
 			mc->mc_vd = vd->vdev_child[c];
 			mc->mc_offset = zio->io_offset;
 
 			if (vdev_mirror_rebuilding(mc->mc_vd))
 				mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE;
 		}
 	}
 
 	return (mm);
 }
 
 static int
 vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	int numerrors = 0;
 	int lasterror = 0;
 
 	if (vd->vdev_children == 0) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	vdev_open_children(vd);
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (cvd->vdev_open_error) {
 			lasterror = cvd->vdev_open_error;
 			numerrors++;
 			continue;
 		}
 
 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
 		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
 	}
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (cvd->vdev_open_error)
 			continue;
 		*physical_ashift = vdev_best_ashift(*logical_ashift,
 		    *physical_ashift, cvd->vdev_physical_ashift);
 	}
 
 	if (numerrors == vd->vdev_children) {
 		if (vdev_children_are_offline(vd))
 			vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE;
 		else
 			vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
 		return (lasterror);
 	}
 
 	return (0);
 }
 
 static void
 vdev_mirror_close(vdev_t *vd)
 {
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_close(vd->vdev_child[c]);
 }
 
 static void
 vdev_mirror_child_done(zio_t *zio)
 {
 	mirror_child_t *mc = zio->io_private;
 
 	mc->mc_error = zio->io_error;
 	mc->mc_tried = 1;
 	mc->mc_skipped = 0;
 }
 
 /*
  * Check the other, lower-index DVAs to see if they're on the same
  * vdev as the child we picked.  If they are, use them since they
  * are likely to have been allocated from the primary metaslab in
  * use at the time, and hence are more likely to have locality with
  * single-copy data.
  */
 static int
 vdev_mirror_dva_select(zio_t *zio, int p)
 {
 	dva_t *dva = zio->io_bp->blk_dva;
 	mirror_map_t *mm = zio->io_vsd;
 	int preferred;
 	int c;
 
 	preferred = mm->mm_preferred[p];
 	for (p--; p >= 0; p--) {
 		c = mm->mm_preferred[p];
 		if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
 			preferred = c;
 	}
 	return (preferred);
 }
 
 static int
 vdev_mirror_preferred_child_randomize(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
 	int p;
 
 	if (mm->mm_root) {
 		p = random_in_range(mm->mm_preferred_cnt);
 		return (vdev_mirror_dva_select(zio, p));
 	}
 
 	/*
 	 * To ensure we don't always favour the first matching vdev,
 	 * which could lead to wear leveling issues on SSD's, we
 	 * use the I/O offset as a pseudo random seed into the vdevs
 	 * which have the lowest load.
 	 */
 	p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
 	return (mm->mm_preferred[p]);
 }
 
 static boolean_t
 vdev_mirror_child_readable(mirror_child_t *mc)
 {
 	vdev_t *vd = mc->mc_vd;
 
 	if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
 		return (vdev_draid_readable(vd, mc->mc_offset));
 	else
 		return (vdev_readable(vd));
 }
 
 static boolean_t
 vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size)
 {
 	vdev_t *vd = mc->mc_vd;
 
 	if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
 		return (vdev_draid_missing(vd, mc->mc_offset, txg, size));
 	else
 		return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
 }
 
 /*
  * Try to find a vdev whose DTL doesn't contain the block we want to read
  * preferring vdevs based on determined load. If we can't, try the read on
  * any vdev we haven't already tried.
  *
  * Distributed spares are an exception to the above load rule. They are
  * always preferred in order to detect gaps in the distributed spare which
  * are created when another disk in the dRAID fails. In order to restore
  * redundancy those gaps must be read to trigger the required repair IO.
  */
 static int
 vdev_mirror_child_select(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
 	uint64_t txg = zio->io_txg;
 	int c, lowest_load;
 
-	ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
+	ASSERT(zio->io_bp == NULL || BP_GET_BIRTH(zio->io_bp) == txg);
 
 	lowest_load = INT_MAX;
 	mm->mm_preferred_cnt = 0;
 	for (c = 0; c < mm->mm_children; c++) {
 		mirror_child_t *mc;
 
 		mc = &mm->mm_child[c];
 		if (mc->mc_tried || mc->mc_skipped)
 			continue;
 
 		if (mc->mc_vd == NULL ||
 		    !vdev_mirror_child_readable(mc)) {
 			mc->mc_error = SET_ERROR(ENXIO);
 			mc->mc_tried = 1;	/* don't even try */
 			mc->mc_skipped = 1;
 			continue;
 		}
 
 		if (vdev_mirror_child_missing(mc, txg, 1)) {
 			mc->mc_error = SET_ERROR(ESTALE);
 			mc->mc_skipped = 1;
 			mc->mc_speculative = 1;
 			continue;
 		}
 
 		if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) {
 			mm->mm_preferred[0] = c;
 			mm->mm_preferred_cnt = 1;
 			break;
 		}
 
 		mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
 		if (mc->mc_load > lowest_load)
 			continue;
 
 		if (mc->mc_load < lowest_load) {
 			lowest_load = mc->mc_load;
 			mm->mm_preferred_cnt = 0;
 		}
 		mm->mm_preferred[mm->mm_preferred_cnt] = c;
 		mm->mm_preferred_cnt++;
 	}
 
 	if (mm->mm_preferred_cnt == 1) {
 		MIRROR_BUMP(vdev_mirror_stat_preferred_found);
 		return (mm->mm_preferred[0]);
 	}
 
 	if (mm->mm_preferred_cnt > 1) {
 		MIRROR_BUMP(vdev_mirror_stat_preferred_not_found);
 		return (vdev_mirror_preferred_child_randomize(zio));
 	}
 
 	/*
 	 * Every device is either missing or has this txg in its DTL.
 	 * Look for any child we haven't already tried before giving up.
 	 */
 	for (c = 0; c < mm->mm_children; c++) {
 		if (!mm->mm_child[c].mc_tried)
 			return (c);
 	}
 
 	/*
 	 * Every child failed.  There's no place left to look.
 	 */
 	return (-1);
 }
 
 static void
 vdev_mirror_io_start(zio_t *zio)
 {
 	mirror_map_t *mm;
 	mirror_child_t *mc;
 	int c, children;
 
 	mm = vdev_mirror_map_init(zio);
 	zio->io_vsd = mm;
 	zio->io_vsd_ops = &vdev_mirror_vsd_ops;
 
 	if (mm == NULL) {
 		ASSERT(!spa_trust_config(zio->io_spa));
 		ASSERT(zio->io_type == ZIO_TYPE_READ);
 		zio_execute(zio);
 		return;
 	}
 
 	if (zio->io_type == ZIO_TYPE_READ) {
 		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
 			/*
 			 * For scrubbing reads we need to issue reads to all
 			 * children.  One child can reuse parent buffer, but
 			 * for others we have to allocate separate ones to
 			 * verify checksums if io_bp is non-NULL, or compare
 			 * them in vdev_mirror_io_done() otherwise.
 			 */
 			boolean_t first = B_TRUE;
 			for (c = 0; c < mm->mm_children; c++) {
 				mc = &mm->mm_child[c];
 
 				/* Don't issue ZIOs to offline children */
 				if (!vdev_mirror_child_readable(mc)) {
 					mc->mc_error = SET_ERROR(ENXIO);
 					mc->mc_tried = 1;
 					mc->mc_skipped = 1;
 					continue;
 				}
 
 				mc->mc_abd = first ? zio->io_abd :
 				    abd_alloc_sametype(zio->io_abd,
 				    zio->io_size);
 				zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 				    mc->mc_vd, mc->mc_offset, mc->mc_abd,
 				    zio->io_size, zio->io_type,
 				    zio->io_priority, 0,
 				    vdev_mirror_child_done, mc));
 				first = B_FALSE;
 			}
 			zio_execute(zio);
 			return;
 		}
 		/*
 		 * For normal reads just pick one child.
 		 */
 		c = vdev_mirror_child_select(zio);
 		children = (c >= 0);
 	} else {
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 
 		/*
 		 * Writes go to all children.
 		 */
 		c = 0;
 		children = mm->mm_children;
 	}
 
 	while (children--) {
 		mc = &mm->mm_child[c];
 		c++;
 
 		/*
 		 * When sequentially resilvering only issue write repair
 		 * IOs to the vdev which is being rebuilt since performance
 		 * is limited by the slowest child.  This is an issue for
 		 * faster replacement devices such as distributed spares.
 		 */
 		if ((zio->io_priority == ZIO_PRIORITY_REBUILD) &&
 		    (zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
 		    !(zio->io_flags & ZIO_FLAG_SCRUB) &&
 		    mm->mm_rebuilding && !mc->mc_rebuilding) {
 			continue;
 		}
 
 		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 		    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
 		    zio->io_type, zio->io_priority, 0,
 		    vdev_mirror_child_done, mc));
 	}
 
 	zio_execute(zio);
 }
 
 static int
 vdev_mirror_worst_error(mirror_map_t *mm)
 {
 	int error[2] = { 0, 0 };
 
 	for (int c = 0; c < mm->mm_children; c++) {
 		mirror_child_t *mc = &mm->mm_child[c];
 		int s = mc->mc_speculative;
 		error[s] = zio_worst_error(error[s], mc->mc_error);
 	}
 
 	return (error[0] ? error[0] : error[1]);
 }
 
 static void
 vdev_mirror_io_done(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
 	mirror_child_t *mc;
 	int c;
 	int good_copies = 0;
 	int unexpected_errors = 0;
 	int last_good_copy = -1;
 
 	if (mm == NULL)
 		return;
 
 	for (c = 0; c < mm->mm_children; c++) {
 		mc = &mm->mm_child[c];
 
 		if (mc->mc_error) {
 			if (!mc->mc_skipped)
 				unexpected_errors++;
 		} else if (mc->mc_tried) {
 			last_good_copy = c;
 			good_copies++;
 		}
 	}
 
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		/*
 		 * XXX -- for now, treat partial writes as success.
 		 *
 		 * Now that we support write reallocation, it would be better
 		 * to treat partial failure as real failure unless there are
 		 * no non-degraded top-level vdevs left, and not update DTLs
 		 * if we intend to reallocate.
 		 */
 		if (good_copies != mm->mm_children) {
 			/*
 			 * Always require at least one good copy.
 			 *
 			 * For ditto blocks (io_vd == NULL), require
 			 * all copies to be good.
 			 *
 			 * XXX -- for replacing vdevs, there's no great answer.
 			 * If the old device is really dead, we may not even
 			 * be able to access it -- so we only want to
 			 * require good writes to the new device.  But if
 			 * the new device turns out to be flaky, we want
 			 * to be able to detach it -- which requires all
 			 * writes to the old device to have succeeded.
 			 */
 			if (good_copies == 0 || zio->io_vd == NULL)
 				zio->io_error = vdev_mirror_worst_error(mm);
 		}
 		return;
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ);
 
 	/*
 	 * If we don't have a good copy yet, keep trying other children.
 	 */
 	if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
 		ASSERT(c >= 0 && c < mm->mm_children);
 		mc = &mm->mm_child[c];
 		zio_vdev_io_redone(zio);
 		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 		    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
 		    ZIO_TYPE_READ, zio->io_priority, 0,
 		    vdev_mirror_child_done, mc));
 		return;
 	}
 
 	if (zio->io_flags & ZIO_FLAG_SCRUB && !mm->mm_resilvering) {
 		abd_t *best_abd = NULL;
 		if (last_good_copy >= 0)
 			best_abd = mm->mm_child[last_good_copy].mc_abd;
 
 		/*
 		 * If we're scrubbing but don't have a BP available (because
 		 * this vdev is under a raidz or draid vdev) then the best we
 		 * can do is compare all of the copies read.  If they're not
 		 * identical then return a checksum error and the most likely
 		 * correct data.  The raidz code will issue a repair I/O if
 		 * possible.
 		 */
 		if (zio->io_bp == NULL) {
 			ASSERT(zio->io_vd->vdev_ops == &vdev_replacing_ops ||
 			    zio->io_vd->vdev_ops == &vdev_spare_ops);
 
 			abd_t *pref_abd = NULL;
 			for (c = 0; c < last_good_copy; c++) {
 				mc = &mm->mm_child[c];
 				if (mc->mc_error || !mc->mc_tried)
 					continue;
 
 				if (abd_cmp(mc->mc_abd, best_abd) != 0)
 					zio->io_error = SET_ERROR(ECKSUM);
 
 				/*
 				 * The distributed spare is always prefered
 				 * by vdev_mirror_child_select() so it's
 				 * considered to be the best candidate.
 				 */
 				if (pref_abd == NULL &&
 				    mc->mc_vd->vdev_ops ==
 				    &vdev_draid_spare_ops)
 					pref_abd = mc->mc_abd;
 
 				/*
 				 * In the absence of a preferred copy, use
 				 * the parent pointer to avoid a memory copy.
 				 */
 				if (mc->mc_abd == zio->io_abd)
 					best_abd = mc->mc_abd;
 			}
 			if (pref_abd)
 				best_abd = pref_abd;
 		} else {
 
 			/*
 			 * If we have a BP available, then checksums are
 			 * already verified and we just need a buffer
 			 * with valid data, preferring parent one to
 			 * avoid a memory copy.
 			 */
 			for (c = 0; c < last_good_copy; c++) {
 				mc = &mm->mm_child[c];
 				if (mc->mc_error || !mc->mc_tried)
 					continue;
 				if (mc->mc_abd == zio->io_abd) {
 					best_abd = mc->mc_abd;
 					break;
 				}
 			}
 		}
 
 		if (best_abd && best_abd != zio->io_abd)
 			abd_copy(zio->io_abd, best_abd, zio->io_size);
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
 			if (mc->mc_abd != zio->io_abd)
 				abd_free(mc->mc_abd);
 			mc->mc_abd = NULL;
 		}
 	}
 
 	if (good_copies == 0) {
 		zio->io_error = vdev_mirror_worst_error(mm);
 		ASSERT(zio->io_error != 0);
 	}
 
 	if (good_copies && spa_writeable(zio->io_spa) &&
 	    (unexpected_errors ||
 	    (zio->io_flags & ZIO_FLAG_RESILVER) ||
 	    ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) {
 		/*
 		 * Use the good data we have in hand to repair damaged children.
 		 */
 		for (c = 0; c < mm->mm_children; c++) {
 			/*
 			 * Don't rewrite known good children.
 			 * Not only is it unnecessary, it could
 			 * actually be harmful: if the system lost
 			 * power while rewriting the only good copy,
 			 * there would be no good copies left!
 			 */
 			mc = &mm->mm_child[c];
 
 			if (mc->mc_error == 0) {
 				vdev_ops_t *ops = mc->mc_vd->vdev_ops;
 
 				if (mc->mc_tried)
 					continue;
 				/*
 				 * We didn't try this child.  We need to
 				 * repair it if:
 				 * 1. it's a scrub (in which case we have
 				 * tried everything that was healthy)
 				 *  - or -
 				 * 2. it's an indirect or distributed spare
 				 * vdev (in which case it could point to any
 				 * other vdev, which might have a bad DTL)
 				 *  - or -
 				 * 3. the DTL indicates that this data is
 				 * missing from this vdev
 				 */
 				if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
 				    ops != &vdev_indirect_ops &&
 				    ops != &vdev_draid_spare_ops &&
 				    !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
 				    zio->io_txg, 1))
 					continue;
 				mc->mc_error = SET_ERROR(ESTALE);
 			}
 
 			zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 			    mc->mc_vd, mc->mc_offset,
 			    zio->io_abd, zio->io_size, ZIO_TYPE_WRITE,
 			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
 			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
 		}
 	}
 }
 
 static void
 vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
 {
 	if (faulted == vd->vdev_children) {
 		if (vdev_children_are_offline(vd)) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE,
 			    VDEV_AUX_CHILDREN_OFFLINE);
 		} else {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_NO_REPLICAS);
 		}
 	} else if (degraded + faulted != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
 	} else {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
 	}
 }
 
 /*
  * Return the maximum asize for a rebuild zio in the provided range.
  */
 static uint64_t
 vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
     uint64_t max_segment)
 {
 	(void) start;
 
 	uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift),
 	    SPA_MAXBLOCKSIZE);
 
 	return (MIN(asize, vdev_psize_to_asize(vd, psize)));
 }
 
 vdev_ops_t vdev_mirror_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_mirror_open,
 	.vdev_op_close = vdev_mirror_close,
 	.vdev_op_asize = vdev_default_asize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_mirror_io_start,
 	.vdev_op_io_done = vdev_mirror_io_done,
 	.vdev_op_state_change = vdev_mirror_state_change,
 	.vdev_op_need_resilver = vdev_default_need_resilver,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_MIRROR,	/* name of this vdev type */
 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
 };
 
 vdev_ops_t vdev_replacing_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_mirror_open,
 	.vdev_op_close = vdev_mirror_close,
 	.vdev_op_asize = vdev_default_asize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_mirror_io_start,
 	.vdev_op_io_done = vdev_mirror_io_done,
 	.vdev_op_state_change = vdev_mirror_state_change,
 	.vdev_op_need_resilver = vdev_default_need_resilver,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_REPLACING,	/* name of this vdev type */
 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
 };
 
 vdev_ops_t vdev_spare_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_mirror_open,
 	.vdev_op_close = vdev_mirror_close,
 	.vdev_op_asize = vdev_default_asize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_mirror_io_start,
 	.vdev_op_io_done = vdev_mirror_io_done,
 	.vdev_op_state_change = vdev_mirror_state_change,
 	.vdev_op_need_resilver = vdev_default_need_resilver,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_SPARE,	/* name of this vdev type */
 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
 };
 
 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_inc, INT, ZMOD_RW,
 	"Rotating media load increment for non-seeking I/Os");
 
 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT,
 	ZMOD_RW, "Rotating media load increment for seeking I/Os");
 
 /* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT,
 	ZMOD_RW,
 	"Offset in bytes from the last I/O which triggers "
 	"a reduced rotating media seek increment");
 /* END CSTYLED */
 
 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT,
 	ZMOD_RW, "Non-rotating media load increment for non-seeking I/Os");
 
 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_seek_inc, INT,
 	ZMOD_RW, "Non-rotating media load increment for seeking I/Os");
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 9d0b8763f16f..b03331ec69c6 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -1,5021 +1,5020 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/zap.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/dmu_tx.h>
 #include <sys/abd.h>
 #include <sys/zfs_rlock.h>
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/vdev_raidz.h>
 #include <sys/vdev_raidz_impl.h>
 #include <sys/vdev_draid.h>
 #include <sys/uberblock_impl.h>
 #include <sys/dsl_scan.h>
 
 #ifdef ZFS_DEBUG
 #include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
 #endif
 
 /*
  * Virtual device vector for RAID-Z.
  *
  * This vdev supports single, double, and triple parity. For single parity,
  * we use a simple XOR of all the data columns. For double or triple parity,
  * we use a special case of Reed-Solomon coding. This extends the
  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
  * former is also based. The latter is designed to provide higher performance
  * for writes.
  *
  * Note that the Plank paper claimed to support arbitrary N+M, but was then
  * amended six years later identifying a critical flaw that invalidates its
  * claims. Nevertheless, the technique can be adapted to work for up to
  * triple parity. For additional parity, the amendment "Note: Correction to
  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
  * is viable, but the additional complexity means that write performance will
  * suffer.
  *
  * All of the methods above operate on a Galois field, defined over the
  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
  * can be expressed with a single byte. Briefly, the operations on the
  * field are defined as follows:
  *
  *   o addition (+) is represented by a bitwise XOR
  *   o subtraction (-) is therefore identical to addition: A + B = A - B
  *   o multiplication of A by 2 is defined by the following bitwise expression:
  *
  *	(A * 2)_7 = A_6
  *	(A * 2)_6 = A_5
  *	(A * 2)_5 = A_4
  *	(A * 2)_4 = A_3 + A_7
  *	(A * 2)_3 = A_2 + A_7
  *	(A * 2)_2 = A_1 + A_7
  *	(A * 2)_1 = A_0
  *	(A * 2)_0 = A_7
  *
  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
  * As an aside, this multiplication is derived from the error correcting
  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
  *
  * Observe that any number in the field (except for 0) can be expressed as a
  * power of 2 -- a generator for the field. We store a table of the powers of
  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
  * than field addition). The inverse of a field element A (A^-1) is therefore
  * A ^ (255 - 1) = A^254.
  *
  * The up-to-three parity columns, P, Q, R over several data columns,
  * D_0, ... D_n-1, can be expressed by field operations:
  *
  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
  *
  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
  * independent coefficients. (There are no additional coefficients that have
  * this property which is why the uncorrected Plank method breaks down.)
  *
  * See the reconstruction code below for how P, Q and R can used individually
  * or in concert to recover missing data columns.
  */
 
 #define	VDEV_RAIDZ_P		0
 #define	VDEV_RAIDZ_Q		1
 #define	VDEV_RAIDZ_R		2
 
 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
 
 /*
  * We provide a mechanism to perform the field multiplication operation on a
  * 64-bit value all at once rather than a byte at a time. This works by
  * creating a mask from the top bit in each byte and using that to
  * conditionally apply the XOR of 0x1d.
  */
 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
 { \
 	(mask) = (x) & 0x8080808080808080ULL; \
 	(mask) = ((mask) << 1) - ((mask) >> 7); \
 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
 	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
 }
 
 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
 { \
 	VDEV_RAIDZ_64MUL_2((x), mask); \
 	VDEV_RAIDZ_64MUL_2((x), mask); \
 }
 
 
 /*
  * Big Theory Statement for how a RAIDZ VDEV is expanded
  *
  * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
  * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs
  * that have been previously expanded can be expanded again.
  *
  * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
  * the VDEV) when an expansion starts.  And the expansion will pause if any
  * disk in the VDEV fails, and resume once the VDEV is healthy again. All other
  * operations on the pool can continue while an expansion is in progress (e.g.
  * read/write, snapshot, zpool add, etc). Except zpool checkpoint, zpool trim,
  * and zpool initialize which can't be run during an expansion.  Following a
  * reboot or export/import, the expansion resumes where it left off.
  *
  * == Reflowing the Data ==
  *
  * The expansion involves reflowing (copying) the data from the current set
  * of disks to spread it across the new set which now has one more disk. This
  * reflow operation is similar to reflowing text when the column width of a
  * text editor window is expanded. The text doesn’t change but the location of
  * the text changes to accommodate the new width. An example reflow result for
  * a 4-wide RAIDZ1 to a 5-wide is shown below.
  *
  *                            Reflow End State
  *            Each letter indicates a parity group (logical stripe)
  *
  *         Before expansion                         After Expansion
  *     D1     D2     D3     D4               D1     D2     D3     D4     D5
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  A   |  A   |  A   |  A   |         |  A   |  A   |  A   |  A   |  B   |
  *  |     1|     2|     3|     4|         |     1|     2|     3|     4|     5|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  B   |  B   |  C   |  C   |         |  B   |  C   |  C   |  C   |  C   |
  *  |     5|     6|     7|     8|         |     6|     7|     8|     9|    10|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  C   |  C   |  D   |  D   |         |  D   |  D   |  E   |  E   |  E   |
  *  |     9|    10|    11|    12|         |    11|    12|    13|    14|    15|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  E   |  E   |  E   |  E   |   -->   |  E   |  F   |  F   |  G   |  G   |
  *  |    13|    14|    15|    16|         |    16|    17|    18|p   19|    20|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  F   |  F   |  G   |  G   |         |  G   |  G   |  H   |  H   |  H   |
  *  |    17|    18|    19|    20|         |    21|    22|    23|    24|    25|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  G   |  G   |  H   |  H   |         |  H   |  I   |  I   |  J   |  J   |
  *  |    21|    22|    23|    24|         |    26|    27|    28|    29|    30|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  H   |  H   |  I   |  I   |         |  J   |  J   |      |      |  K   |
  *  |    25|    26|    27|    28|         |    31|    32|    33|    34|    35|
  *  +------+------+------+------+         +------+------+------+------+------+
  *
  * This reflow approach has several advantages. There is no need to read or
  * modify the block pointers or recompute any block checksums.  The reflow
  * doesn’t need to know where the parity sectors reside. We can read and write
  * data sequentially and the copy can occur in a background thread in open
  * context. The design also allows for fast discovery of what data to copy.
  *
  * The VDEV metaslabs are processed, one at a time, to copy the block data to
  * have it flow across all the disks. The metaslab is disabled for allocations
  * during the copy. As an optimization, we only copy the allocated data which
  * can be determined by looking at the metaslab range tree. During the copy we
  * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
  * need to be able to survive losing parity count disks).  This means we
  * cannot overwrite data during the reflow that would be needed if a disk is
  * lost.
  *
  * After the reflow completes, all newly-written blocks will have the new
  * layout, i.e., they will have the parity to data ratio implied by the new
  * number of disks in the RAIDZ group.  Even though the reflow copies all of
  * the allocated space (data and parity), it is only rearranged, not changed.
  *
  * This act of reflowing the data has a few implications about blocks
  * that were written before the reflow completes:
  *
  *  - Old blocks will still use the same amount of space (i.e., they will have
  *    the parity to data ratio implied by the old number of disks in the RAIDZ
  *    group).
  *  - Reading old blocks will be slightly slower than before the reflow, for
  *    two reasons. First, we will have to read from all disks in the RAIDZ
  *    VDEV, rather than being able to skip the children that contain only
  *    parity of this block (because the data of a single block is now spread
  *    out across all the disks).  Second, in most cases there will be an extra
  *    bcopy, needed to rearrange the data back to its original layout in memory.
  *
  * == Scratch Area ==
  *
  * As we copy the block data, we can only progress to the point that writes
  * will not overlap with blocks whose progress has not yet been recorded on
  * disk.  Since partially-copied rows are always read from the old location,
  * we need to stop one row before the sector-wise overlap, to prevent any
  * row-wise overlap. For example, in the diagram above, when we reflow sector
  * B6 it will overwite the original location for B5.
  *
  * To get around this, a scratch space is used so that we can start copying
  * without risking data loss by overlapping the row. As an added benefit, it
  * improves performance at the beginning of the reflow, but that small perf
  * boost wouldn't be worth the complexity on its own.
  *
  * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
  * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
  * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
  * the widths will likely be single digits so we can get a substantial chuck
  * size using only a few MB of scratch per disk.
  *
  * The scratch area is persisted to disk which holds a large amount of reflowed
  * state. We can always read the partially written stripes when a disk fails or
  * the copy is interrupted (crash) during the initial copying phase and also
  * get past a small chunk size restriction.  At a minimum, the scratch space
  * must be large enough to get us to the point that one row does not overlap
  * itself when moved (i.e new_width^2).  But going larger is even better. We
  * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
  * as our scratch space to handle overwriting the initial part of the VDEV.
  *
  *	0     256K   512K                    4M
  *	+------+------+-----------------------+-----------------------------
  *	| VDEV | VDEV |   Boot Block (3.5M)   |  Allocatable space ...
  *	|  L0  |  L1  |       Reserved        |     (Metaslabs)
  *	+------+------+-----------------------+-------------------------------
  *                        Scratch Area
  *
  * == Reflow Progress Updates ==
  * After the initial scratch-based reflow, the expansion process works
  * similarly to device removal. We create a new open context thread which
  * reflows the data, and periodically kicks off sync tasks to update logical
  * state. In this case, state is the committed progress (offset of next data
  * to copy). We need to persist the completed offset on disk, so that if we
  * crash we know which format each VDEV offset is in.
  *
  * == Time Dependent Geometry ==
  *
  * In non-expanded RAIDZ, blocks are read from disk in a column by column
  * fashion. For a multi-row block, the second sector is in the first column
  * not in the second column. This allows us to issue full reads for each
  * column directly into the request buffer. The block data is thus laid out
  * sequentially in a column-by-column fashion.
  *
  * For example, in the before expansion diagram above, one logical block might
  * be sectors G19-H26. The parity is in G19,H23; and the data is in
  * G20,H24,G21,H25,G22,H26.
  *
  * After a block is reflowed, the sectors that were all in the original column
  * data can now reside in different columns. When reading from an expanded
  * VDEV, we need to know the logical stripe width for each block so we can
  * reconstitute the block’s data after the reads are completed. Likewise,
  * when we perform the combinatorial reconstruction we need to know the
  * original width so we can retry combinations from the past layouts.
  *
  * Time dependent geometry is what we call having blocks with different layouts
  * (stripe widths) in the same VDEV. This time-dependent geometry uses the
  * block’s birth time (+ the time expansion ended) to establish the correct
  * width for a given block. After an expansion completes, we record the time
  * for blocks written with a particular width (geometry).
  *
  * == On Disk Format Changes ==
  *
  * New pool feature flag, 'raidz_expansion' whose reference count is the number
  * of RAIDZ VDEVs that have been expanded.
  *
  * The blocks on expanded RAIDZ VDEV can have different logical stripe widths.
  *
  * Since the uberblock can point to arbitrary blocks, which might be on the
  * expanding RAIDZ, and might or might not have been expanded. We need to know
  * which way a block is laid out before reading it. This info is the next
  * offset that needs to be reflowed and we persist that in the uberblock, in
  * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
  * After the expansion is complete, we then use the raidz_expand_txgs array
  * (see below) to determine how to read a block and the ub_raidz_reflow_info
  * field no longer required.
  *
  * The uberblock's ub_raidz_reflow_info field also holds the scratch space
  * state (i.e., active or not) which is also required before reading a block
  * during the initial phase of reflowing the data.
  *
  * The top-level RAIDZ VDEV has two new entries in the nvlist:
  *
  * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
  *                            and used after the expansion is complete to
  *                            determine how to read a raidz block
  * 'raidz_expanding' boolean: present during reflow and removed after completion
  *                            used during a spa import to resume an unfinished
  *                            expansion
  *
  * And finally the VDEVs top zap adds the following informational entries:
  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
  */
 
 /*
  * For testing only: pause the raidz expansion after reflowing this amount.
  * (accessed by ZTS and ztest)
  */
 #ifdef	_KERNEL
 static
 #endif	/* _KERNEL */
 unsigned long raidz_expand_max_reflow_bytes = 0;
 
 /*
  * For testing only: pause the raidz expansion at a certain point.
  */
 uint_t raidz_expand_pause_point = 0;
 
 /*
  * Maximum amount of copy io's outstanding at once.
  */
 static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
 
 /*
  * Apply raidz map abds aggregation if the number of rows in the map is equal
  * or greater than the value below.
  */
 static unsigned long raidz_io_aggregate_rows = 4;
 
 /*
  * Automatically start a pool scrub when a RAIDZ expansion completes in
  * order to verify the checksums of all blocks which have been copied
  * during the expansion.  Automatic scrubbing is enabled by default and
  * is strongly recommended.
  */
 static int zfs_scrub_after_expand = 1;
 
 static void
 vdev_raidz_row_free(raidz_row_t *rr)
 {
 	for (int c = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (rc->rc_size != 0)
 			abd_free(rc->rc_abd);
 		if (rc->rc_orig_data != NULL)
 			abd_free(rc->rc_orig_data);
 	}
 
 	if (rr->rr_abd_empty != NULL)
 		abd_free(rr->rr_abd_empty);
 
 	kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
 }
 
 void
 vdev_raidz_map_free(raidz_map_t *rm)
 {
 	for (int i = 0; i < rm->rm_nrows; i++)
 		vdev_raidz_row_free(rm->rm_row[i]);
 
 	if (rm->rm_nphys_cols) {
 		for (int i = 0; i < rm->rm_nphys_cols; i++) {
 			if (rm->rm_phys_col[i].rc_abd != NULL)
 				abd_free(rm->rm_phys_col[i].rc_abd);
 		}
 
 		kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
 		    rm->rm_nphys_cols);
 	}
 
 	ASSERT3P(rm->rm_lr, ==, NULL);
 	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
 }
 
 static void
 vdev_raidz_map_free_vsd(zio_t *zio)
 {
 	raidz_map_t *rm = zio->io_vsd;
 
 	vdev_raidz_map_free(rm);
 }
 
 static int
 vdev_raidz_reflow_compare(const void *x1, const void *x2)
 {
 	const reflow_node_t *l = x1;
 	const reflow_node_t *r = x2;
 
 	return (TREE_CMP(l->re_txg, r->re_txg));
 }
 
 const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 	.vsd_free = vdev_raidz_map_free_vsd,
 };
 
 raidz_row_t *
 vdev_raidz_row_alloc(int cols)
 {
 	raidz_row_t *rr =
 	    kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);
 
 	rr->rr_cols = cols;
 	rr->rr_scols = cols;
 
 	for (int c = 0; c < cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		rc->rc_shadow_devidx = INT_MAX;
 		rc->rc_shadow_offset = UINT64_MAX;
 		rc->rc_allow_repair = 1;
 	}
 	return (rr);
 }
 
 static void
 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
 {
 	int c;
 	int nwrapped = 0;
 	uint64_t off = 0;
 	raidz_row_t *rr = rm->rm_row[0];
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 	ASSERT3U(rm->rm_nrows, ==, 1);
 
 	/*
 	 * Pad any parity columns with additional space to account for skip
 	 * sectors.
 	 */
 	if (rm->rm_skipstart < rr->rr_firstdatacol) {
 		ASSERT0(rm->rm_skipstart);
 		nwrapped = rm->rm_nskip;
 	} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
 		nwrapped =
 		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
 	}
 
 	/*
 	 * Optional single skip sectors (rc_size == 0) will be handled in
 	 * vdev_raidz_io_start_write().
 	 */
 	int skipped = rr->rr_scols - rr->rr_cols;
 
 	/* Allocate buffers for the parity columns */
 	for (c = 0; c < rr->rr_firstdatacol; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		/*
 		 * Parity columns will pad out a linear ABD to account for
 		 * the skip sector. A linear ABD is used here because
 		 * parity calculations use the ABD buffer directly to calculate
 		 * parity. This avoids doing a memcpy back to the ABD after the
 		 * parity has been calculated. By issuing the parity column
 		 * with the skip sector we can reduce contention on the child
 		 * VDEV queue locks (vq_lock).
 		 */
 		if (c < nwrapped) {
 			rc->rc_abd = abd_alloc_linear(
 			    rc->rc_size + (1ULL << ashift), B_FALSE);
 			abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
 			skipped++;
 		} else {
 			rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
 		}
 	}
 
 	for (off = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
 		    zio->io_abd, off, rc->rc_size);
 
 		/*
 		 * Generate I/O for skip sectors to improve aggregation
 		 * continuity. We will use gang ABD's to reduce contention
 		 * on the child VDEV queue locks (vq_lock) by issuing
 		 * a single I/O that contains the data and skip sector.
 		 *
 		 * It is important to make sure that rc_size is not updated
 		 * even though we are adding a skip sector to the ABD. When
 		 * calculating the parity in vdev_raidz_generate_parity_row()
 		 * the rc_size is used to iterate through the ABD's. We can
 		 * not have zero'd out skip sectors used for calculating
 		 * parity for raidz, because those same sectors are not used
 		 * during reconstruction.
 		 */
 		if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
 			rc->rc_abd = abd_alloc_gang();
 			abd_gang_add(rc->rc_abd, abd, B_TRUE);
 			abd_gang_add(rc->rc_abd,
 			    abd_get_zeros(1ULL << ashift), B_TRUE);
 			skipped++;
 		} else {
 			rc->rc_abd = abd;
 		}
 		off += rc->rc_size;
 	}
 
 	ASSERT3U(off, ==, zio->io_size);
 	ASSERT3S(skipped, ==, rm->rm_nskip);
 }
 
 static void
 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
 {
 	int c;
 	raidz_row_t *rr = rm->rm_row[0];
 
 	ASSERT3U(rm->rm_nrows, ==, 1);
 
 	/* Allocate buffers for the parity columns */
 	for (c = 0; c < rr->rr_firstdatacol; c++)
 		rr->rr_col[c].rc_abd =
 		    abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
 
 	for (uint64_t off = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
 		    zio->io_abd, off, rc->rc_size);
 		off += rc->rc_size;
 	}
 }
 
 /*
  * Divides the IO evenly across all child vdevs; usually, dcols is
  * the number of children in the target vdev.
  *
  * Avoid inlining the function to keep vdev_raidz_io_start(), which
  * is this functions only caller, as small as possible on the stack.
  */
 noinline raidz_map_t *
 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
     uint64_t nparity)
 {
 	raidz_row_t *rr;
 	/* The starting RAIDZ (parent) vdev sector of the block. */
 	uint64_t b = zio->io_offset >> ashift;
 	/* The zio's size in units of the vdev's minimum sector size. */
 	uint64_t s = zio->io_size >> ashift;
 	/* The first column for this stripe. */
 	uint64_t f = b % dcols;
 	/* The starting byte offset on each child vdev. */
 	uint64_t o = (b / dcols) << ashift;
 	uint64_t acols, scols;
 
 	raidz_map_t *rm =
 	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
 	rm->rm_nrows = 1;
 
 	/*
 	 * "Quotient": The number of data sectors for this stripe on all but
 	 * the "big column" child vdevs that also contain "remainder" data.
 	 */
 	uint64_t q = s / (dcols - nparity);
 
 	/*
 	 * "Remainder": The number of partial stripe data sectors in this I/O.
 	 * This will add a sector to some, but not all, child vdevs.
 	 */
 	uint64_t r = s - q * (dcols - nparity);
 
 	/* The number of "big columns" - those which contain remainder data. */
 	uint64_t bc = (r == 0 ? 0 : r + nparity);
 
 	/*
 	 * The total number of data and parity sectors associated with
 	 * this I/O.
 	 */
 	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
 
 	/*
 	 * acols: The columns that will be accessed.
 	 * scols: The columns that will be accessed or skipped.
 	 */
 	if (q == 0) {
 		/* Our I/O request doesn't span all child vdevs. */
 		acols = bc;
 		scols = MIN(dcols, roundup(bc, nparity + 1));
 	} else {
 		acols = dcols;
 		scols = dcols;
 	}
 
 	ASSERT3U(acols, <=, scols);
 	rr = vdev_raidz_row_alloc(scols);
 	rm->rm_row[0] = rr;
 	rr->rr_cols = acols;
 	rr->rr_bigcols = bc;
 	rr->rr_firstdatacol = nparity;
 #ifdef ZFS_DEBUG
 	rr->rr_offset = zio->io_offset;
 	rr->rr_size = zio->io_size;
 #endif
 
 	uint64_t asize = 0;
 
 	for (uint64_t c = 0; c < scols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		uint64_t col = f + c;
 		uint64_t coff = o;
 		if (col >= dcols) {
 			col -= dcols;
 			coff += 1ULL << ashift;
 		}
 		rc->rc_devidx = col;
 		rc->rc_offset = coff;
 
 		if (c >= acols)
 			rc->rc_size = 0;
 		else if (c < bc)
 			rc->rc_size = (q + 1) << ashift;
 		else
 			rc->rc_size = q << ashift;
 
 		asize += rc->rc_size;
 	}
 
 	ASSERT3U(asize, ==, tot << ashift);
 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 	rm->rm_skipstart = bc;
 
 	/*
 	 * If all data stored spans all columns, there's a danger that parity
 	 * will always be on the same device and, since parity isn't read
 	 * during normal operation, that device's I/O bandwidth won't be
 	 * used effectively. We therefore switch the parity every 1MB.
 	 *
 	 * ... at least that was, ostensibly, the theory. As a practical
 	 * matter unless we juggle the parity between all devices evenly, we
 	 * won't see any benefit. Further, occasional writes that aren't a
 	 * multiple of the LCM of the number of children and the minimum
 	 * stripe width are sufficient to avoid pessimal behavior.
 	 * Unfortunately, this decision created an implicit on-disk format
 	 * requirement that we need to support for all eternity, but only
 	 * for single-parity RAID-Z.
 	 *
 	 * If we intend to skip a sector in the zeroth column for padding
 	 * we must make sure to note this swap. We will never intend to
 	 * skip the first column since at least one data and one parity
 	 * column must appear in each row.
 	 */
 	ASSERT(rr->rr_cols >= 2);
 	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
 
 	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
 		uint64_t devidx = rr->rr_col[0].rc_devidx;
 		o = rr->rr_col[0].rc_offset;
 		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
 		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
 		rr->rr_col[1].rc_devidx = devidx;
 		rr->rr_col[1].rc_offset = o;
 		if (rm->rm_skipstart == 0)
 			rm->rm_skipstart = 1;
 	}
 
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		vdev_raidz_map_alloc_write(zio, rm, ashift);
 	} else {
 		vdev_raidz_map_alloc_read(zio, rm);
 	}
 	/* init RAIDZ parity ops */
 	rm->rm_ops = vdev_raidz_math_get_ops();
 
 	return (rm);
 }
 
 /*
  * Everything before reflow_offset_synced should have been moved to the new
  * location (read and write completed).  However, this may not yet be reflected
  * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
  * uberblock has not yet been written). If reflow is not in progress,
  * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
  * entirely before reflow_offset_synced, it will come from the new location.
  * Otherwise this row will come from the old location.  Therefore, rows that
  * straddle the reflow_offset_synced will come from the old location.
  *
  * For writes, reflow_offset_next is the next offset to copy.  If a sector has
  * been copied, but not yet reflected in the on-disk progress
  * (reflow_offset_synced), it will also be written to the new (already copied)
  * offset.
  */
 noinline raidz_map_t *
 vdev_raidz_map_alloc_expanded(zio_t *zio,
     uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
     uint64_t nparity, uint64_t reflow_offset_synced,
     uint64_t reflow_offset_next, boolean_t use_scratch)
 {
 	abd_t *abd = zio->io_abd;
 	uint64_t offset = zio->io_offset;
 	uint64_t size = zio->io_size;
 
 	/* The zio's size in units of the vdev's minimum sector size. */
 	uint64_t s = size >> ashift;
 
 	/*
 	 * "Quotient": The number of data sectors for this stripe on all but
 	 * the "big column" child vdevs that also contain "remainder" data.
 	 * AKA "full rows"
 	 */
 	uint64_t q = s / (logical_cols - nparity);
 
 	/*
 	 * "Remainder": The number of partial stripe data sectors in this I/O.
 	 * This will add a sector to some, but not all, child vdevs.
 	 */
 	uint64_t r = s - q * (logical_cols - nparity);
 
 	/* The number of "big columns" - those which contain remainder data. */
 	uint64_t bc = (r == 0 ? 0 : r + nparity);
 
 	/*
 	 * The total number of data and parity sectors associated with
 	 * this I/O.
 	 */
 	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
 
 	/* How many rows contain data (not skip) */
 	uint64_t rows = howmany(tot, logical_cols);
 	int cols = MIN(tot, logical_cols);
 
 	raidz_map_t *rm =
 	    kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
 	    KM_SLEEP);
 	rm->rm_nrows = rows;
 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 	rm->rm_skipstart = bc;
 	uint64_t asize = 0;
 
 	for (uint64_t row = 0; row < rows; row++) {
 		boolean_t row_use_scratch = B_FALSE;
 		raidz_row_t *rr = vdev_raidz_row_alloc(cols);
 		rm->rm_row[row] = rr;
 
 		/* The starting RAIDZ (parent) vdev sector of the row. */
 		uint64_t b = (offset >> ashift) + row * logical_cols;
 
 		/*
 		 * If we are in the middle of a reflow, and the copying has
 		 * not yet completed for any part of this row, then use the
 		 * old location of this row.  Note that reflow_offset_synced
 		 * reflects the i/o that's been completed, because it's
 		 * updated by a synctask, after zio_wait(spa_txg_zio[]).
 		 * This is sufficient for our check, even if that progress
 		 * has not yet been recorded to disk (reflected in
 		 * spa_ubsync).  Also note that we consider the last row to
 		 * be "full width" (`cols`-wide rather than `bc`-wide) for
 		 * this calculation. This causes a tiny bit of unnecessary
 		 * double-writes but is safe and simpler to calculate.
 		 */
 		int row_phys_cols = physical_cols;
 		if (b + cols > reflow_offset_synced >> ashift)
 			row_phys_cols--;
 		else if (use_scratch)
 			row_use_scratch = B_TRUE;
 
 		/* starting child of this row */
 		uint64_t child_id = b % row_phys_cols;
 		/* The starting byte offset on each child vdev. */
 		uint64_t child_offset = (b / row_phys_cols) << ashift;
 
 		/*
 		 * Note, rr_cols is the entire width of the block, even
 		 * if this row is shorter.  This is needed because parity
 		 * generation (for Q and R) needs to know the entire width,
 		 * because it treats the short row as though it was
 		 * full-width (and the "phantom" sectors were zero-filled).
 		 *
 		 * Another approach to this would be to set cols shorter
 		 * (to just the number of columns that we might do i/o to)
 		 * and have another mechanism to tell the parity generation
 		 * about the "entire width".  Reconstruction (at least
 		 * vdev_raidz_reconstruct_general()) would also need to
 		 * know about the "entire width".
 		 */
 		rr->rr_firstdatacol = nparity;
 #ifdef ZFS_DEBUG
 		/*
 		 * note: rr_size is PSIZE, not ASIZE
 		 */
 		rr->rr_offset = b << ashift;
 		rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
 #endif
 
 		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
 			if (child_id >= row_phys_cols) {
 				child_id -= row_phys_cols;
 				child_offset += 1ULL << ashift;
 			}
 			raidz_col_t *rc = &rr->rr_col[c];
 			rc->rc_devidx = child_id;
 			rc->rc_offset = child_offset;
 
 			/*
 			 * Get this from the scratch space if appropriate.
 			 * This only happens if we crashed in the middle of
 			 * raidz_reflow_scratch_sync() (while it's running,
 			 * the rangelock prevents us from doing concurrent
 			 * io), and even then only during zpool import or
 			 * when the pool is imported readonly.
 			 */
 			if (row_use_scratch)
 				rc->rc_offset -= VDEV_BOOT_SIZE;
 
 			uint64_t dc = c - rr->rr_firstdatacol;
 			if (c < rr->rr_firstdatacol) {
 				rc->rc_size = 1ULL << ashift;
 
 				/*
 				 * Parity sectors' rc_abd's are set below
 				 * after determining if this is an aggregation.
 				 */
 			} else if (row == rows - 1 && bc != 0 && c >= bc) {
 				/*
 				 * Past the end of the block (even including
 				 * skip sectors).  This sector is part of the
 				 * map so that we have full rows for p/q parity
 				 * generation.
 				 */
 				rc->rc_size = 0;
 				rc->rc_abd = NULL;
 			} else {
 				/* "data column" (col excluding parity) */
 				uint64_t off;
 
 				if (c < bc || r == 0) {
 					off = dc * rows + row;
 				} else {
 					off = r * rows +
 					    (dc - r) * (rows - 1) + row;
 				}
 				rc->rc_size = 1ULL << ashift;
 				rc->rc_abd = abd_get_offset_struct(
 				    &rc->rc_abdstruct, abd, off << ashift,
 				    rc->rc_size);
 			}
 
 			if (rc->rc_size == 0)
 				continue;
 
 			/*
 			 * If any part of this row is in both old and new
 			 * locations, the primary location is the old
 			 * location. If this sector was already copied to the
 			 * new location, we need to also write to the new,
 			 * "shadow" location.
 			 *
 			 * Note, `row_phys_cols != physical_cols` indicates
 			 * that the primary location is the old location.
 			 * `b+c < reflow_offset_next` indicates that the copy
 			 * to the new location has been initiated. We know
 			 * that the copy has completed because we have the
 			 * rangelock, which is held exclusively while the
 			 * copy is in progress.
 			 */
 			if (row_use_scratch ||
 			    (row_phys_cols != physical_cols &&
 			    b + c < reflow_offset_next >> ashift)) {
 				rc->rc_shadow_devidx = (b + c) % physical_cols;
 				rc->rc_shadow_offset =
 				    ((b + c) / physical_cols) << ashift;
 				if (row_use_scratch)
 					rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
 			}
 
 			asize += rc->rc_size;
 		}
 
 		/*
 		 * See comment in vdev_raidz_map_alloc()
 		 */
 		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
 		    (offset & (1ULL << 20))) {
 			ASSERT(rr->rr_cols >= 2);
 			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
 
 			int devidx0 = rr->rr_col[0].rc_devidx;
 			uint64_t offset0 = rr->rr_col[0].rc_offset;
 			int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
 			uint64_t shadow_offset0 =
 			    rr->rr_col[0].rc_shadow_offset;
 
 			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
 			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
 			rr->rr_col[0].rc_shadow_devidx =
 			    rr->rr_col[1].rc_shadow_devidx;
 			rr->rr_col[0].rc_shadow_offset =
 			    rr->rr_col[1].rc_shadow_offset;
 
 			rr->rr_col[1].rc_devidx = devidx0;
 			rr->rr_col[1].rc_offset = offset0;
 			rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
 			rr->rr_col[1].rc_shadow_offset = shadow_offset0;
 		}
 	}
 	ASSERT3U(asize, ==, tot << ashift);
 
 	/*
 	 * Determine if the block is contiguous, in which case we can use
 	 * an aggregation.
 	 */
 	if (rows >= raidz_io_aggregate_rows) {
 		rm->rm_nphys_cols = physical_cols;
 		rm->rm_phys_col =
 		    kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
 		    KM_SLEEP);
 
 		/*
 		 * Determine the aggregate io's offset and size, and check
 		 * that the io is contiguous.
 		 */
 		for (int i = 0;
 		    i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 			for (int c = 0; c < rr->rr_cols; c++) {
 				raidz_col_t *rc = &rr->rr_col[c];
 				raidz_col_t *prc =
 				    &rm->rm_phys_col[rc->rc_devidx];
 
 				if (rc->rc_size == 0)
 					continue;
 
 				if (prc->rc_size == 0) {
 					ASSERT0(prc->rc_offset);
 					prc->rc_offset = rc->rc_offset;
 				} else if (prc->rc_offset + prc->rc_size !=
 				    rc->rc_offset) {
 					/*
 					 * This block is not contiguous and
 					 * therefore can't be aggregated.
 					 * This is expected to be rare, so
 					 * the cost of allocating and then
 					 * freeing rm_phys_col is not
 					 * significant.
 					 */
 					kmem_free(rm->rm_phys_col,
 					    sizeof (raidz_col_t) *
 					    rm->rm_nphys_cols);
 					rm->rm_phys_col = NULL;
 					rm->rm_nphys_cols = 0;
 					break;
 				}
 				prc->rc_size += rc->rc_size;
 			}
 		}
 	}
 	if (rm->rm_phys_col != NULL) {
 		/*
 		 * Allocate aggregate ABD's.
 		 */
 		for (int i = 0; i < rm->rm_nphys_cols; i++) {
 			raidz_col_t *prc = &rm->rm_phys_col[i];
 
 			prc->rc_devidx = i;
 
 			if (prc->rc_size == 0)
 				continue;
 
 			prc->rc_abd =
 			    abd_alloc_linear(rm->rm_phys_col[i].rc_size,
 			    B_FALSE);
 		}
 
 		/*
 		 * Point the parity abd's into the aggregate abd's.
 		 */
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 			for (int c = 0; c < rr->rr_firstdatacol; c++) {
 				raidz_col_t *rc = &rr->rr_col[c];
 				raidz_col_t *prc =
 				    &rm->rm_phys_col[rc->rc_devidx];
 				rc->rc_abd =
 				    abd_get_offset_struct(&rc->rc_abdstruct,
 				    prc->rc_abd,
 				    rc->rc_offset - prc->rc_offset,
 				    rc->rc_size);
 			}
 		}
 	} else {
 		/*
 		 * Allocate new abd's for the parity sectors.
 		 */
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 			for (int c = 0; c < rr->rr_firstdatacol; c++) {
 				raidz_col_t *rc = &rr->rr_col[c];
 				rc->rc_abd =
 				    abd_alloc_linear(rc->rc_size,
 				    B_TRUE);
 			}
 		}
 	}
 	/* init RAIDZ parity ops */
 	rm->rm_ops = vdev_raidz_math_get_ops();
 
 	return (rm);
 }
 
 struct pqr_struct {
 	uint64_t *p;
 	uint64_t *q;
 	uint64_t *r;
 };
 
 static int
 vdev_raidz_p_func(void *buf, size_t size, void *private)
 {
 	struct pqr_struct *pqr = private;
 	const uint64_t *src = buf;
 	int cnt = size / sizeof (src[0]);
 
 	ASSERT(pqr->p && !pqr->q && !pqr->r);
 
 	for (int i = 0; i < cnt; i++, src++, pqr->p++)
 		*pqr->p ^= *src;
 
 	return (0);
 }
 
 static int
 vdev_raidz_pq_func(void *buf, size_t size, void *private)
 {
 	struct pqr_struct *pqr = private;
 	const uint64_t *src = buf;
 	uint64_t mask;
 	int cnt = size / sizeof (src[0]);
 
 	ASSERT(pqr->p && pqr->q && !pqr->r);
 
 	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
 		*pqr->p ^= *src;
 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
 		*pqr->q ^= *src;
 	}
 
 	return (0);
 }
 
 static int
 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
 {
 	struct pqr_struct *pqr = private;
 	const uint64_t *src = buf;
 	uint64_t mask;
 	int cnt = size / sizeof (src[0]);
 
 	ASSERT(pqr->p && pqr->q && pqr->r);
 
 	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
 		*pqr->p ^= *src;
 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
 		*pqr->q ^= *src;
 		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
 		*pqr->r ^= *src;
 	}
 
 	return (0);
 }
 
 static void
 vdev_raidz_generate_parity_p(raidz_row_t *rr)
 {
 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 
 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		abd_t *src = rr->rr_col[c].rc_abd;
 
 		if (c == rr->rr_firstdatacol) {
 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
 		} else {
 			struct pqr_struct pqr = { p, NULL, NULL };
 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
 			    vdev_raidz_p_func, &pqr);
 		}
 	}
 }
 
 static void
 vdev_raidz_generate_parity_pq(raidz_row_t *rr)
 {
 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
 	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
 
 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		abd_t *src = rr->rr_col[c].rc_abd;
 
 		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
 
 		if (c == rr->rr_firstdatacol) {
 			ASSERT(ccnt == pcnt || ccnt == 0);
 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
 			(void) memcpy(q, p, rr->rr_col[c].rc_size);
 
 			for (uint64_t i = ccnt; i < pcnt; i++) {
 				p[i] = 0;
 				q[i] = 0;
 			}
 		} else {
 			struct pqr_struct pqr = { p, q, NULL };
 
 			ASSERT(ccnt <= pcnt);
 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
 			    vdev_raidz_pq_func, &pqr);
 
 			/*
 			 * Treat short columns as though they are full of 0s.
 			 * Note that there's therefore nothing needed for P.
 			 */
 			uint64_t mask;
 			for (uint64_t i = ccnt; i < pcnt; i++) {
 				VDEV_RAIDZ_64MUL_2(q[i], mask);
 			}
 		}
 	}
 }
 
 static void
 vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
 {
 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
 	uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
 	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
 	    rr->rr_col[VDEV_RAIDZ_R].rc_size);
 
 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		abd_t *src = rr->rr_col[c].rc_abd;
 
 		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
 
 		if (c == rr->rr_firstdatacol) {
 			ASSERT(ccnt == pcnt || ccnt == 0);
 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
 			(void) memcpy(q, p, rr->rr_col[c].rc_size);
 			(void) memcpy(r, p, rr->rr_col[c].rc_size);
 
 			for (uint64_t i = ccnt; i < pcnt; i++) {
 				p[i] = 0;
 				q[i] = 0;
 				r[i] = 0;
 			}
 		} else {
 			struct pqr_struct pqr = { p, q, r };
 
 			ASSERT(ccnt <= pcnt);
 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
 			    vdev_raidz_pqr_func, &pqr);
 
 			/*
 			 * Treat short columns as though they are full of 0s.
 			 * Note that there's therefore nothing needed for P.
 			 */
 			uint64_t mask;
 			for (uint64_t i = ccnt; i < pcnt; i++) {
 				VDEV_RAIDZ_64MUL_2(q[i], mask);
 				VDEV_RAIDZ_64MUL_4(r[i], mask);
 			}
 		}
 	}
 }
 
 /*
  * Generate RAID parity in the first virtual columns according to the number of
  * parity columns available.
  */
 void
 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
 {
 	if (rr->rr_cols == 0) {
 		/*
 		 * We are handling this block one row at a time (because
 		 * this block has a different logical vs physical width,
 		 * due to RAIDZ expansion), and this is a pad-only row,
 		 * which has no parity.
 		 */
 		return;
 	}
 
 	/* Generate using the new math implementation */
 	if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
 		return;
 
 	switch (rr->rr_firstdatacol) {
 	case 1:
 		vdev_raidz_generate_parity_p(rr);
 		break;
 	case 2:
 		vdev_raidz_generate_parity_pq(rr);
 		break;
 	case 3:
 		vdev_raidz_generate_parity_pqr(rr);
 		break;
 	default:
 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
 	}
 }
 
 void
 vdev_raidz_generate_parity(raidz_map_t *rm)
 {
 	for (int i = 0; i < rm->rm_nrows; i++) {
 		raidz_row_t *rr = rm->rm_row[i];
 		vdev_raidz_generate_parity_row(rm, rr);
 	}
 }
 
 static int
 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
 {
 	(void) private;
 	uint64_t *dst = dbuf;
 	uint64_t *src = sbuf;
 	int cnt = size / sizeof (src[0]);
 
 	for (int i = 0; i < cnt; i++) {
 		dst[i] ^= src[i];
 	}
 
 	return (0);
 }
 
 static int
 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
     void *private)
 {
 	(void) private;
 	uint64_t *dst = dbuf;
 	uint64_t *src = sbuf;
 	uint64_t mask;
 	int cnt = size / sizeof (dst[0]);
 
 	for (int i = 0; i < cnt; i++, dst++, src++) {
 		VDEV_RAIDZ_64MUL_2(*dst, mask);
 		*dst ^= *src;
 	}
 
 	return (0);
 }
 
 static int
 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
 {
 	(void) private;
 	uint64_t *dst = buf;
 	uint64_t mask;
 	int cnt = size / sizeof (dst[0]);
 
 	for (int i = 0; i < cnt; i++, dst++) {
 		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
 		VDEV_RAIDZ_64MUL_2(*dst, mask);
 	}
 
 	return (0);
 }
 
 struct reconst_q_struct {
 	uint64_t *q;
 	int exp;
 };
 
 static int
 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
 {
 	struct reconst_q_struct *rq = private;
 	uint64_t *dst = buf;
 	int cnt = size / sizeof (dst[0]);
 
 	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
 		int j;
 		uint8_t *b;
 
 		*dst ^= *rq->q;
 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
 			*b = vdev_raidz_exp2(*b, rq->exp);
 		}
 	}
 
 	return (0);
 }
 
 struct reconst_pq_struct {
 	uint8_t *p;
 	uint8_t *q;
 	uint8_t *pxy;
 	uint8_t *qxy;
 	int aexp;
 	int bexp;
 };
 
 static int
 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
 {
 	struct reconst_pq_struct *rpq = private;
 	uint8_t *xd = xbuf;
 	uint8_t *yd = ybuf;
 
 	for (int i = 0; i < size;
 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
 		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
 	}
 
 	return (0);
 }
 
 static int
 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
 {
 	struct reconst_pq_struct *rpq = private;
 	uint8_t *xd = xbuf;
 
 	for (int i = 0; i < size;
 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
 		/* same operation as vdev_raidz_reconst_pq_func() on xd */
 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
 	}
 
 	return (0);
 }
 
 static void
 vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
 {
 	int x = tgts[0];
 	abd_t *dst, *src;
 
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
 		zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
 
 	ASSERT3U(ntgts, ==, 1);
 	ASSERT3U(x, >=, rr->rr_firstdatacol);
 	ASSERT3U(x, <, rr->rr_cols);
 
 	ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
 
 	src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
 	dst = rr->rr_col[x].rc_abd;
 
 	abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
 
 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		uint64_t size = MIN(rr->rr_col[x].rc_size,
 		    rr->rr_col[c].rc_size);
 
 		src = rr->rr_col[c].rc_abd;
 
 		if (c == x)
 			continue;
 
 		(void) abd_iterate_func2(dst, src, 0, 0, size,
 		    vdev_raidz_reconst_p_func, NULL);
 	}
 }
 
 static void
 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
 {
 	int x = tgts[0];
 	int c, exp;
 	abd_t *dst, *src;
 
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
 		zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
 
 	ASSERT(ntgts == 1);
 
 	ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
 
 	for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
 		    rr->rr_col[c].rc_size);
 
 		src = rr->rr_col[c].rc_abd;
 		dst = rr->rr_col[x].rc_abd;
 
 		if (c == rr->rr_firstdatacol) {
 			abd_copy(dst, src, size);
 			if (rr->rr_col[x].rc_size > size) {
 				abd_zero_off(dst, size,
 				    rr->rr_col[x].rc_size - size);
 			}
 		} else {
 			ASSERT3U(size, <=, rr->rr_col[x].rc_size);
 			(void) abd_iterate_func2(dst, src, 0, 0, size,
 			    vdev_raidz_reconst_q_pre_func, NULL);
 			(void) abd_iterate_func(dst,
 			    size, rr->rr_col[x].rc_size - size,
 			    vdev_raidz_reconst_q_pre_tail_func, NULL);
 		}
 	}
 
 	src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
 	dst = rr->rr_col[x].rc_abd;
 	exp = 255 - (rr->rr_cols - 1 - x);
 
 	struct reconst_q_struct rq = { abd_to_buf(src), exp };
 	(void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
 	    vdev_raidz_reconst_q_post_func, &rq);
 }
 
 static void
 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
 {
 	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
 	abd_t *pdata, *qdata;
 	uint64_t xsize, ysize;
 	int x = tgts[0];
 	int y = tgts[1];
 	abd_t *xd, *yd;
 
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
 		zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);
 
 	ASSERT(ntgts == 2);
 	ASSERT(x < y);
 	ASSERT(x >= rr->rr_firstdatacol);
 	ASSERT(y < rr->rr_cols);
 
 	ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
 
 	/*
 	 * Move the parity data aside -- we're going to compute parity as
 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
 	 * reuse the parity generation mechanism without trashing the actual
 	 * parity so we make those columns appear to be full of zeros by
 	 * setting their lengths to zero.
 	 */
 	pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
 	qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
 	xsize = rr->rr_col[x].rc_size;
 	ysize = rr->rr_col[y].rc_size;
 
 	rr->rr_col[VDEV_RAIDZ_P].rc_abd =
 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
 	rr->rr_col[x].rc_size = 0;
 	rr->rr_col[y].rc_size = 0;
 
 	vdev_raidz_generate_parity_pq(rr);
 
 	rr->rr_col[x].rc_size = xsize;
 	rr->rr_col[y].rc_size = ysize;
 
 	p = abd_to_buf(pdata);
 	q = abd_to_buf(qdata);
 	pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 	qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
 	xd = rr->rr_col[x].rc_abd;
 	yd = rr->rr_col[y].rc_abd;
 
 	/*
 	 * We now have:
 	 *	Pxy = P + D_x + D_y
 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
 	 *
 	 * We can then solve for D_x:
 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
 	 * where
 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
 	 *
 	 * With D_x in hand, we can easily solve for D_y:
 	 *	D_y = P + Pxy + D_x
 	 */
 
 	a = vdev_raidz_pow2[255 + x - y];
 	b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
 	tmp = 255 - vdev_raidz_log2[a ^ 1];
 
 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
 
 	ASSERT3U(xsize, >=, ysize);
 	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
 
 	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
 	    vdev_raidz_reconst_pq_func, &rpq);
 	(void) abd_iterate_func(xd, ysize, xsize - ysize,
 	    vdev_raidz_reconst_pq_tail_func, &rpq);
 
 	abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 	abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
 
 	/*
 	 * Restore the saved parity data.
 	 */
 	rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
 }
 
 /*
  * In the general case of reconstruction, we must solve the system of linear
  * equations defined by the coefficients used to generate parity as well as
  * the contents of the data and parity disks. This can be expressed with
  * vectors for the original data (D) and the actual data (d) and parity (p)
  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
  *
  *            __   __                     __     __
  *            |     |         __     __   |  p_0  |
  *            |  V  |         |  D_0  |   | p_m-1 |
  *            |     |    x    |   :   | = |  d_0  |
  *            |  I  |         | D_n-1 |   |   :   |
  *            |     |         ~~     ~~   | d_n-1 |
  *            ~~   ~~                     ~~     ~~
  *
  * I is simply a square identity matrix of size n, and V is a vandermonde
  * matrix defined by the coefficients we chose for the various parity columns
  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
  * computation as well as linear separability.
  *
  *      __               __               __     __
  *      |   1   ..  1 1 1 |               |  p_0  |
  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
  *      |   :       : : : |   |   :   |   |  d_2  |
  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
  *      |   0   ..  0 0 1 |               | d_n-1 |
  *      ~~               ~~               ~~     ~~
  *
  * Note that I, V, d, and p are known. To compute D, we must invert the
  * matrix and use the known data and parity values to reconstruct the unknown
  * data values. We begin by removing the rows in V|I and d|p that correspond
  * to failed or missing columns; we then make V|I square (n x n) and d|p
  * sized n by removing rows corresponding to unused parity from the bottom up
  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
  * using Gauss-Jordan elimination. In the example below we use m=3 parity
  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
  *           __                               __
  *           |  1   1   1   1   1   1   1   1  |
  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
  *           |  19 205 116  29  64  16  4   1  |      / /
  *           |  1   0   0   0   0   0   0   0  |     / /
  *           |  0   1   0   0   0   0   0   0  | <--' /
  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
  *           |  0   0   0   1   0   0   0   0  |
  *           |  0   0   0   0   1   0   0   0  |
  *           |  0   0   0   0   0   1   0   0  |
  *           |  0   0   0   0   0   0   1   0  |
  *           |  0   0   0   0   0   0   0   1  |
  *           ~~                               ~~
  *           __                               __
  *           |  1   1   1   1   1   1   1   1  |
  *           | 128  64  32  16  8   4   2   1  |
  *           |  19 205 116  29  64  16  4   1  |
  *           |  1   0   0   0   0   0   0   0  |
  *           |  0   1   0   0   0   0   0   0  |
  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
  *           |  0   0   0   1   0   0   0   0  |
  *           |  0   0   0   0   1   0   0   0  |
  *           |  0   0   0   0   0   1   0   0  |
  *           |  0   0   0   0   0   0   1   0  |
  *           |  0   0   0   0   0   0   0   1  |
  *           ~~                               ~~
  *
  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
  * matrix is not singular.
  * __                                                                 __
  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  *                   __                               __
  *                   |  0   0   1   0   0   0   0   0  |
  *                   | 167 100  5   41 159 169 217 208 |
  *                   | 166 100  4   40 158 168 216 209 |
  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
  *                   |  0   0   0   0   1   0   0   0  |
  *                   |  0   0   0   0   0   1   0   0  |
  *                   |  0   0   0   0   0   0   1   0  |
  *                   |  0   0   0   0   0   0   0   1  |
  *                   ~~                               ~~
  *
  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
  * of the missing data.
  *
  * As is apparent from the example above, the only non-trivial rows in the
  * inverse matrix correspond to the data disks that we're trying to
  * reconstruct. Indeed, those are the only rows we need as the others would
  * only be useful for reconstructing data known or assumed to be valid. For
  * that reason, we only build the coefficients in the rows that correspond to
  * targeted columns.
  */
 
 static void
 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
     uint8_t **rows)
 {
 	int i, j;
 	int pow;
 
 	ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
 
 	/*
 	 * Fill in the missing rows of interest.
 	 */
 	for (i = 0; i < nmap; i++) {
 		ASSERT3S(0, <=, map[i]);
 		ASSERT3S(map[i], <=, 2);
 
 		pow = map[i] * n;
 		if (pow > 255)
 			pow -= 255;
 		ASSERT(pow <= 255);
 
 		for (j = 0; j < n; j++) {
 			pow -= map[i];
 			if (pow < 0)
 				pow += 255;
 			rows[i][j] = vdev_raidz_pow2[pow];
 		}
 	}
 }
 
 static void
 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
 {
 	int i, j, ii, jj;
 	uint8_t log;
 
 	/*
 	 * Assert that the first nmissing entries from the array of used
 	 * columns correspond to parity columns and that subsequent entries
 	 * correspond to data columns.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		ASSERT3S(used[i], <, rr->rr_firstdatacol);
 	}
 	for (; i < n; i++) {
 		ASSERT3S(used[i], >=, rr->rr_firstdatacol);
 	}
 
 	/*
 	 * First initialize the storage where we'll compute the inverse rows.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < n; j++) {
 			invrows[i][j] = (i == j) ? 1 : 0;
 		}
 	}
 
 	/*
 	 * Subtract all trivial rows from the rows of consequence.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = nmissing; j < n; j++) {
 			ASSERT3U(used[j], >=, rr->rr_firstdatacol);
 			jj = used[j] - rr->rr_firstdatacol;
 			ASSERT3S(jj, <, n);
 			invrows[i][j] = rows[i][jj];
 			rows[i][jj] = 0;
 		}
 	}
 
 	/*
 	 * For each of the rows of interest, we must normalize it and subtract
 	 * a multiple of it from the other rows.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < missing[i]; j++) {
 			ASSERT0(rows[i][j]);
 		}
 		ASSERT3U(rows[i][missing[i]], !=, 0);
 
 		/*
 		 * Compute the inverse of the first element and multiply each
 		 * element in the row by that value.
 		 */
 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
 
 		for (j = 0; j < n; j++) {
 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
 		}
 
 		for (ii = 0; ii < nmissing; ii++) {
 			if (i == ii)
 				continue;
 
 			ASSERT3U(rows[ii][missing[i]], !=, 0);
 
 			log = vdev_raidz_log2[rows[ii][missing[i]]];
 
 			for (j = 0; j < n; j++) {
 				rows[ii][j] ^=
 				    vdev_raidz_exp2(rows[i][j], log);
 				invrows[ii][j] ^=
 				    vdev_raidz_exp2(invrows[i][j], log);
 			}
 		}
 	}
 
 	/*
 	 * Verify that the data that is left in the rows are properly part of
 	 * an identity matrix.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < n; j++) {
 			if (j == missing[i]) {
 				ASSERT3U(rows[i][j], ==, 1);
 			} else {
 				ASSERT0(rows[i][j]);
 			}
 		}
 	}
 }
 
 static void
 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
     int *missing, uint8_t **invrows, const uint8_t *used)
 {
 	int i, j, x, cc, c;
 	uint8_t *src;
 	uint64_t ccount;
 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
 	uint8_t log = 0;
 	uint8_t val;
 	int ll;
 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *p, *pp;
 	size_t psize;
 
 	psize = sizeof (invlog[0][0]) * n * nmissing;
 	p = kmem_alloc(psize, KM_SLEEP);
 
 	for (pp = p, i = 0; i < nmissing; i++) {
 		invlog[i] = pp;
 		pp += n;
 	}
 
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < n; j++) {
 			ASSERT3U(invrows[i][j], !=, 0);
 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
 		}
 	}
 
 	for (i = 0; i < n; i++) {
 		c = used[i];
 		ASSERT3U(c, <, rr->rr_cols);
 
 		ccount = rr->rr_col[c].rc_size;
 		ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
 		if (ccount == 0)
 			continue;
 		src = abd_to_buf(rr->rr_col[c].rc_abd);
 		for (j = 0; j < nmissing; j++) {
 			cc = missing[j] + rr->rr_firstdatacol;
 			ASSERT3U(cc, >=, rr->rr_firstdatacol);
 			ASSERT3U(cc, <, rr->rr_cols);
 			ASSERT3U(cc, !=, c);
 
 			dcount[j] = rr->rr_col[cc].rc_size;
 			if (dcount[j] != 0)
 				dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
 		}
 
 		for (x = 0; x < ccount; x++, src++) {
 			if (*src != 0)
 				log = vdev_raidz_log2[*src];
 
 			for (cc = 0; cc < nmissing; cc++) {
 				if (x >= dcount[cc])
 					continue;
 
 				if (*src == 0) {
 					val = 0;
 				} else {
 					if ((ll = log + invlog[cc][i]) >= 255)
 						ll -= 255;
 					val = vdev_raidz_pow2[ll];
 				}
 
 				if (i == 0)
 					dst[cc][x] = val;
 				else
 					dst[cc][x] ^= val;
 			}
 		}
 	}
 
 	kmem_free(p, psize);
 }
 
 static void
 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
 {
 	int n, i, c, t, tt;
 	int nmissing_rows;
 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
 	int parity_map[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *p, *pp;
 	size_t psize;
 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *used;
 
 	abd_t **bufs = NULL;
 
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
 		zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
 	/*
 	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
 	 * temporary linear ABDs if any non-linear ABDs are found.
 	 */
 	for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
 		ASSERT(rr->rr_col[i].rc_abd != NULL);
 		if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
 			bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
 			    KM_PUSHPAGE);
 
 			for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 				raidz_col_t *col = &rr->rr_col[c];
 
 				bufs[c] = col->rc_abd;
 				if (bufs[c] != NULL) {
 					col->rc_abd = abd_alloc_linear(
 					    col->rc_size, B_TRUE);
 					abd_copy(col->rc_abd, bufs[c],
 					    col->rc_size);
 				}
 			}
 
 			break;
 		}
 	}
 
 	n = rr->rr_cols - rr->rr_firstdatacol;
 
 	/*
 	 * Figure out which data columns are missing.
 	 */
 	nmissing_rows = 0;
 	for (t = 0; t < ntgts; t++) {
 		if (tgts[t] >= rr->rr_firstdatacol) {
 			missing_rows[nmissing_rows++] =
 			    tgts[t] - rr->rr_firstdatacol;
 		}
 	}
 
 	/*
 	 * Figure out which parity columns to use to help generate the missing
 	 * data columns.
 	 */
 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
 		ASSERT(tt < ntgts);
 		ASSERT(c < rr->rr_firstdatacol);
 
 		/*
 		 * Skip any targeted parity columns.
 		 */
 		if (c == tgts[tt]) {
 			tt++;
 			continue;
 		}
 
 		parity_map[i] = c;
 		i++;
 	}
 
 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
 	    nmissing_rows * n + sizeof (used[0]) * n;
 	p = kmem_alloc(psize, KM_SLEEP);
 
 	for (pp = p, i = 0; i < nmissing_rows; i++) {
 		rows[i] = pp;
 		pp += n;
 		invrows[i] = pp;
 		pp += n;
 	}
 	used = pp;
 
 	for (i = 0; i < nmissing_rows; i++) {
 		used[i] = parity_map[i];
 	}
 
 	for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		if (tt < nmissing_rows &&
 		    c == missing_rows[tt] + rr->rr_firstdatacol) {
 			tt++;
 			continue;
 		}
 
 		ASSERT3S(i, <, n);
 		used[i] = c;
 		i++;
 	}
 
 	/*
 	 * Initialize the interesting rows of the matrix.
 	 */
 	vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
 
 	/*
 	 * Invert the matrix.
 	 */
 	vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
 	    invrows, used);
 
 	/*
 	 * Reconstruct the missing data using the generated matrix.
 	 */
 	vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
 	    invrows, used);
 
 	kmem_free(p, psize);
 
 	/*
 	 * copy back from temporary linear abds and free them
 	 */
 	if (bufs) {
 		for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 			raidz_col_t *col = &rr->rr_col[c];
 
 			if (bufs[c] != NULL) {
 				abd_copy(bufs[c], col->rc_abd, col->rc_size);
 				abd_free(col->rc_abd);
 			}
 			col->rc_abd = bufs[c];
 		}
 		kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
 	}
 }
 
 static void
 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
     const int *t, int nt)
 {
 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
 	int ntgts;
 	int i, c, ret;
 	int nbadparity, nbaddata;
 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
 
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
 		zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
 		    rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
 		    (int)rr->rr_missingparity);
 	}
 
 	nbadparity = rr->rr_firstdatacol;
 	nbaddata = rr->rr_cols - nbadparity;
 	ntgts = 0;
 	for (i = 0, c = 0; c < rr->rr_cols; c++) {
 		if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
 			zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
 			    "offset=%llx error=%u)",
 			    rr, c, (int)rr->rr_col[c].rc_devidx,
 			    (long long)rr->rr_col[c].rc_offset,
 			    (int)rr->rr_col[c].rc_error);
 		}
 		if (c < rr->rr_firstdatacol)
 			parity_valid[c] = B_FALSE;
 
 		if (i < nt && c == t[i]) {
 			tgts[ntgts++] = c;
 			i++;
 		} else if (rr->rr_col[c].rc_error != 0) {
 			tgts[ntgts++] = c;
 		} else if (c >= rr->rr_firstdatacol) {
 			nbaddata--;
 		} else {
 			parity_valid[c] = B_TRUE;
 			nbadparity--;
 		}
 	}
 
 	ASSERT(ntgts >= nt);
 	ASSERT(nbaddata >= 0);
 	ASSERT(nbaddata + nbadparity == ntgts);
 
 	dt = &tgts[nbadparity];
 
 	/* Reconstruct using the new math implementation */
 	ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
 	if (ret != RAIDZ_ORIGINAL_IMPL)
 		return;
 
 	/*
 	 * See if we can use any of our optimized reconstruction routines.
 	 */
 	switch (nbaddata) {
 	case 1:
 		if (parity_valid[VDEV_RAIDZ_P]) {
 			vdev_raidz_reconstruct_p(rr, dt, 1);
 			return;
 		}
 
 		ASSERT(rr->rr_firstdatacol > 1);
 
 		if (parity_valid[VDEV_RAIDZ_Q]) {
 			vdev_raidz_reconstruct_q(rr, dt, 1);
 			return;
 		}
 
 		ASSERT(rr->rr_firstdatacol > 2);
 		break;
 
 	case 2:
 		ASSERT(rr->rr_firstdatacol > 1);
 
 		if (parity_valid[VDEV_RAIDZ_P] &&
 		    parity_valid[VDEV_RAIDZ_Q]) {
 			vdev_raidz_reconstruct_pq(rr, dt, 2);
 			return;
 		}
 
 		ASSERT(rr->rr_firstdatacol > 2);
 
 		break;
 	}
 
 	vdev_raidz_reconstruct_general(rr, tgts, ntgts);
 }
 
 static int
 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	uint64_t nparity = vdrz->vd_nparity;
 	int c;
 	int lasterror = 0;
 	int numerrors = 0;
 
 	ASSERT(nparity > 0);
 
 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
 	    vd->vdev_children < nparity + 1) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	vdev_open_children(vd);
 
 	for (c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (cvd->vdev_open_error != 0) {
 			lasterror = cvd->vdev_open_error;
 			numerrors++;
 			continue;
 		}
 
 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
 		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
 	}
 	for (c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (cvd->vdev_open_error != 0)
 			continue;
 		*physical_ashift = vdev_best_ashift(*logical_ashift,
 		    *physical_ashift, cvd->vdev_physical_ashift);
 	}
 
 	if (vd->vdev_rz_expanding) {
 		*asize *= vd->vdev_children - 1;
 		*max_asize *= vd->vdev_children - 1;
 
 		vd->vdev_min_asize = *asize;
 	} else {
 		*asize *= vd->vdev_children;
 		*max_asize *= vd->vdev_children;
 	}
 
 	if (numerrors > nparity) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
 		return (lasterror);
 	}
 
 	return (0);
 }
 
 static void
 vdev_raidz_close(vdev_t *vd)
 {
 	for (int c = 0; c < vd->vdev_children; c++) {
 		if (vd->vdev_child[c] != NULL)
 			vdev_close(vd->vdev_child[c]);
 	}
 }
 
 /*
  * Return the logical width to use, given the txg in which the allocation
- * happened.  Note that BP_PHYSICAL_BIRTH() is usually the txg in which the
+ * happened.  Note that BP_GET_BIRTH() is usually the txg in which the
  * BP was allocated.  Remapped BP's (that were relocated due to device
- * removal, see remap_blkptr_cb()), will have a more recent
- * BP_PHYSICAL_BIRTH() which reflects when the BP was relocated, but we can
- * ignore these because they can't be on RAIDZ (device removal doesn't
- * support RAIDZ).
+ * removal, see remap_blkptr_cb()), will have a more recent physical birth
+ * which reflects when the BP was relocated, but we can ignore these because
+ * they can't be on RAIDZ (device removal doesn't support RAIDZ).
  */
 static uint64_t
 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
 {
 	reflow_node_t lookup = {
 		.re_txg = txg,
 	};
 	avl_index_t where;
 
 	uint64_t width;
 	mutex_enter(&vdrz->vd_expand_lock);
 	reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
 	if (re != NULL) {
 		width = re->re_logical_width;
 	} else {
 		re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
 		if (re != NULL)
 			width = re->re_logical_width;
 		else
 			width = vdrz->vd_original_width;
 	}
 	mutex_exit(&vdrz->vd_expand_lock);
 	return (width);
 }
 
 /*
  * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
  * more space due to the lower data-to-parity ratio.  In this case it's
  * important to pass in the correct txg.  Note that vdev_gang_header_asize()
  * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
  * regardless of txg.  This is assured because for a single data sector, we
  * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
  */
 static uint64_t
 vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	uint64_t asize;
 	uint64_t ashift = vd->vdev_top->vdev_ashift;
 	uint64_t cols = vdrz->vd_original_width;
 	uint64_t nparity = vdrz->vd_nparity;
 
 	cols = vdev_raidz_get_logical_width(vdrz, txg);
 
 	asize = ((psize - 1) >> ashift) + 1;
 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
 	asize = roundup(asize, nparity + 1) << ashift;
 
 #ifdef ZFS_DEBUG
 	uint64_t asize_new = ((psize - 1) >> ashift) + 1;
 	uint64_t ncols_new = vdrz->vd_physical_width;
 	asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
 	    (ncols_new - nparity));
 	asize_new = roundup(asize_new, nparity + 1) << ashift;
 	VERIFY3U(asize_new, <=, asize);
 #endif
 
 	return (asize);
 }
 
 /*
  * The allocatable space for a raidz vdev is N * sizeof(smallest child)
  * so each child must provide at least 1/Nth of its asize.
  */
 static uint64_t
 vdev_raidz_min_asize(vdev_t *vd)
 {
 	return ((vd->vdev_min_asize + vd->vdev_children - 1) /
 	    vd->vdev_children);
 }
 
 void
 vdev_raidz_child_done(zio_t *zio)
 {
 	raidz_col_t *rc = zio->io_private;
 
 	ASSERT3P(rc->rc_abd, !=, NULL);
 	rc->rc_error = zio->io_error;
 	rc->rc_tried = 1;
 	rc->rc_skipped = 0;
 }
 
 static void
 vdev_raidz_shadow_child_done(zio_t *zio)
 {
 	raidz_col_t *rc = zio->io_private;
 
 	rc->rc_shadow_error = zio->io_error;
 }
 
 static void
 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
 {
 	(void) rm;
 #ifdef ZFS_DEBUG
 	range_seg64_t logical_rs, physical_rs, remain_rs;
 	logical_rs.rs_start = rr->rr_offset;
 	logical_rs.rs_end = logical_rs.rs_start +
 	    vdev_raidz_asize(zio->io_vd, rr->rr_size,
-	    BP_PHYSICAL_BIRTH(zio->io_bp));
+	    BP_GET_BIRTH(zio->io_bp));
 
 	raidz_col_t *rc = &rr->rr_col[col];
 	vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
 
 	vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
 	ASSERT(vdev_xlate_is_empty(&remain_rs));
 	if (vdev_xlate_is_empty(&physical_rs)) {
 		/*
 		 * If we are in the middle of expansion, the
 		 * physical->logical mapping is changing so vdev_xlate()
 		 * can't give us a reliable answer.
 		 */
 		return;
 	}
 	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
 	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
 	/*
 	 * It would be nice to assert that rs_end is equal
 	 * to rc_offset + rc_size but there might be an
 	 * optional I/O at the end that is not accounted in
 	 * rc_size.
 	 */
 	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
 		    rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
 	} else {
 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
 	}
 #endif
 }
 
 static void
 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
 {
 	vdev_t *vd = zio->io_vd;
 	raidz_map_t *rm = zio->io_vsd;
 
 	vdev_raidz_generate_parity_row(rm, rr);
 
 	for (int c = 0; c < rr->rr_scols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 
 		/* Verify physical to logical translation */
 		vdev_raidz_io_verify(zio, rm, rr, c);
 
 		if (rc->rc_size == 0)
 			continue;
 
 		ASSERT3U(rc->rc_offset + rc->rc_size, <,
 		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
 
 		ASSERT3P(rc->rc_abd, !=, NULL);
 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 		    rc->rc_offset, rc->rc_abd,
 		    abd_get_size(rc->rc_abd), zio->io_type,
 		    zio->io_priority, 0, vdev_raidz_child_done, rc));
 
 		if (rc->rc_shadow_devidx != INT_MAX) {
 			vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];
 
 			ASSERT3U(
 			    rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
 			    cvd2->vdev_psize - VDEV_LABEL_END_SIZE);
 
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
 			    rc->rc_shadow_offset, rc->rc_abd,
 			    abd_get_size(rc->rc_abd),
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_shadow_child_done, rc));
 		}
 	}
 }
 
 /*
  * Generate optional I/Os for skip sectors to improve aggregation contiguity.
  * This only works for vdev_raidz_map_alloc() (not _expanded()).
  */
 static void
 raidz_start_skip_writes(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	uint64_t ashift = vd->vdev_top->vdev_ashift;
 	raidz_map_t *rm = zio->io_vsd;
 	ASSERT3U(rm->rm_nrows, ==, 1);
 	raidz_row_t *rr = rm->rm_row[0];
 	for (int c = 0; c < rr->rr_scols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 		if (rc->rc_size != 0)
 			continue;
 		ASSERT3P(rc->rc_abd, ==, NULL);
 
 		ASSERT3U(rc->rc_offset, <,
 		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
 
 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
 		    NULL, 1ULL << ashift, zio->io_type, zio->io_priority,
 		    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
 	}
 }
 
 static void
 vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
 {
 	vdev_t *vd = zio->io_vd;
 
 	/*
 	 * Iterate over the columns in reverse order so that we hit the parity
 	 * last -- any errors along the way will force us to read the parity.
 	 */
 	for (int c = rr->rr_cols - 1; c >= 0; c--) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		if (rc->rc_size == 0)
 			continue;
 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 		if (!vdev_readable(cvd)) {
 			if (c >= rr->rr_firstdatacol)
 				rr->rr_missingdata++;
 			else
 				rr->rr_missingparity++;
 			rc->rc_error = SET_ERROR(ENXIO);
 			rc->rc_tried = 1;	/* don't even try */
 			rc->rc_skipped = 1;
 			continue;
 		}
 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
 			if (c >= rr->rr_firstdatacol)
 				rr->rr_missingdata++;
 			else
 				rr->rr_missingparity++;
 			rc->rc_error = SET_ERROR(ESTALE);
 			rc->rc_skipped = 1;
 			continue;
 		}
 		if (forceparity ||
 		    c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, rc));
 		}
 	}
 }
 
 static void
 vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
 {
 	vdev_t *vd = zio->io_vd;
 
 	for (int i = 0; i < rm->rm_nphys_cols; i++) {
 		raidz_col_t *prc = &rm->rm_phys_col[i];
 		if (prc->rc_size == 0)
 			continue;
 
 		ASSERT3U(prc->rc_devidx, ==, i);
 		vdev_t *cvd = vd->vdev_child[i];
 		if (!vdev_readable(cvd)) {
 			prc->rc_error = SET_ERROR(ENXIO);
 			prc->rc_tried = 1;	/* don't even try */
 			prc->rc_skipped = 1;
 			continue;
 		}
 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
 			prc->rc_error = SET_ERROR(ESTALE);
 			prc->rc_skipped = 1;
 			continue;
 		}
 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 		    prc->rc_offset, prc->rc_abd, prc->rc_size,
 		    zio->io_type, zio->io_priority, 0,
 		    vdev_raidz_child_done, prc));
 	}
 }
 
 static void
 vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
 {
 	/*
 	 * If there are multiple rows, we will be hitting
 	 * all disks, so go ahead and read the parity so
 	 * that we are reading in decent size chunks.
 	 */
 	boolean_t forceparity = rm->rm_nrows > 1;
 
 	if (rm->rm_phys_col) {
 		vdev_raidz_io_start_read_phys_cols(zio, rm);
 	} else {
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 			vdev_raidz_io_start_read_row(zio, rr, forceparity);
 		}
 	}
 }
 
 /*
  * Start an IO operation on a RAIDZ VDev
  *
  * Outline:
  * - For write operations:
  *   1. Generate the parity data
  *   2. Create child zio write operations to each column's vdev, for both
  *      data and parity.
  *   3. If the column skips any sectors for padding, create optional dummy
  *      write zio children for those areas to improve aggregation continuity.
  * - For read operations:
  *   1. Create child zio read operations to each data column's vdev to read
  *      the range of data required for zio.
  *   2. If this is a scrub or resilver operation, or if any of the data
  *      vdevs have had errors, then create zio read operations to the parity
  *      columns' VDevs as well.
  */
 static void
 vdev_raidz_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_t *tvd = vd->vdev_top;
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	raidz_map_t *rm;
 
 	uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
-	    BP_PHYSICAL_BIRTH(zio->io_bp));
+	    BP_GET_BIRTH(zio->io_bp));
 	if (logical_width != vdrz->vd_physical_width) {
 		zfs_locked_range_t *lr = NULL;
 		uint64_t synced_offset = UINT64_MAX;
 		uint64_t next_offset = UINT64_MAX;
 		boolean_t use_scratch = B_FALSE;
 		/*
 		 * Note: when the expansion is completing, we set
 		 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
 		 * in a later txg than when we last update spa_ubsync's state
 		 * (see the end of spa_raidz_expand_thread()).  Therefore we
 		 * may see vre_state!=SCANNING before
 		 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
 		 * on disk, but the copying progress has been synced to disk
 		 * (and reflected in spa_ubsync).  In this case it's fine to
 		 * treat the expansion as completed, since if we crash there's
 		 * no additional copying to do.
 		 */
 		if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
 			ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
 			    &vdrz->vn_vre);
 			lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
 			    zio->io_offset, zio->io_size, RL_READER);
 			use_scratch =
 			    (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
 			    RRSS_SCRATCH_VALID);
 			synced_offset =
 			    RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
 			next_offset = vdrz->vn_vre.vre_offset;
 			/*
 			 * If we haven't resumed expanding since importing the
 			 * pool, vre_offset won't have been set yet.  In
 			 * this case the next offset to be copied is the same
 			 * as what was synced.
 			 */
 			if (next_offset == UINT64_MAX) {
 				next_offset = synced_offset;
 			}
 		}
 		if (use_scratch) {
 			zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
 			    "%lld next_offset=%lld use_scratch=%u",
 			    zio,
 			    zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
 			    (long long)zio->io_offset,
 			    (long long)synced_offset,
 			    (long long)next_offset,
 			    use_scratch);
 		}
 
 		rm = vdev_raidz_map_alloc_expanded(zio,
 		    tvd->vdev_ashift, vdrz->vd_physical_width,
 		    logical_width, vdrz->vd_nparity,
 		    synced_offset, next_offset, use_scratch);
 		rm->rm_lr = lr;
 	} else {
 		rm = vdev_raidz_map_alloc(zio,
 		    tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
 	}
 	rm->rm_original_width = vdrz->vd_original_width;
 
 	zio->io_vsd = rm;
 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			vdev_raidz_io_start_write(zio, rm->rm_row[i]);
 		}
 
 		if (logical_width == vdrz->vd_physical_width) {
 			raidz_start_skip_writes(zio);
 		}
 	} else {
 		ASSERT(zio->io_type == ZIO_TYPE_READ);
 		vdev_raidz_io_start_read(zio, rm);
 	}
 
 	zio_execute(zio);
 }
 
 /*
  * Report a checksum error for a child of a RAID-Z device.
  */
 void
 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
 {
 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
 
 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
 	    zio->io_priority != ZIO_PRIORITY_REBUILD) {
 		zio_bad_cksum_t zbc;
 		raidz_map_t *rm = zio->io_vsd;
 
 		zbc.zbc_has_cksum = 0;
 		zbc.zbc_injected = rm->rm_ecksuminjected;
 
 		mutex_enter(&vd->vdev_stat_lock);
 		vd->vdev_stat.vs_checksum_errors++;
 		mutex_exit(&vd->vdev_stat_lock);
 		(void) zfs_ereport_post_checksum(zio->io_spa, vd,
 		    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
 		    rc->rc_abd, bad_data, &zbc);
 	}
 }
 
 /*
  * We keep track of whether or not there were any injected errors, so that
  * any ereports we generate can note it.
  */
 static int
 raidz_checksum_verify(zio_t *zio)
 {
 	zio_bad_cksum_t zbc = {0};
 	raidz_map_t *rm = zio->io_vsd;
 
 	int ret = zio_checksum_error(zio, &zbc);
 	if (ret != 0 && zbc.zbc_injected != 0)
 		rm->rm_ecksuminjected = 1;
 
 	return (ret);
 }
 
 /*
  * Generate the parity from the data columns. If we tried and were able to
  * read the parity without error, verify that the generated parity matches the
  * data we read. If it doesn't, we fire off a checksum error. Return the
  * number of such failures.
  */
 static int
 raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
 {
 	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
 	int c, ret = 0;
 	raidz_map_t *rm = zio->io_vsd;
 	raidz_col_t *rc;
 
 	blkptr_t *bp = zio->io_bp;
 	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
 
 	if (checksum == ZIO_CHECKSUM_NOPARITY)
 		return (ret);
 
 	for (c = 0; c < rr->rr_firstdatacol; c++) {
 		rc = &rr->rr_col[c];
 		if (!rc->rc_tried || rc->rc_error != 0)
 			continue;
 
 		orig[c] = rc->rc_abd;
 		ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
 		rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
 	}
 
 	/*
 	 * Verify any empty sectors are zero filled to ensure the parity
 	 * is calculated correctly even if these non-data sectors are damaged.
 	 */
 	if (rr->rr_nempty && rr->rr_abd_empty != NULL)
 		ret += vdev_draid_map_verify_empty(zio, rr);
 
 	/*
 	 * Regenerates parity even for !tried||rc_error!=0 columns.  This
 	 * isn't harmful but it does have the side effect of fixing stuff
 	 * we didn't realize was necessary (i.e. even if we return 0).
 	 */
 	vdev_raidz_generate_parity_row(rm, rr);
 
 	for (c = 0; c < rr->rr_firstdatacol; c++) {
 		rc = &rr->rr_col[c];
 
 		if (!rc->rc_tried || rc->rc_error != 0)
 			continue;
 
 		if (abd_cmp(orig[c], rc->rc_abd) != 0) {
 			zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
 			    c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
 			vdev_raidz_checksum_error(zio, rc, orig[c]);
 			rc->rc_error = SET_ERROR(ECKSUM);
 			ret++;
 		}
 		abd_free(orig[c]);
 	}
 
 	return (ret);
 }
 
 static int
 vdev_raidz_worst_error(raidz_row_t *rr)
 {
 	int error = 0;
 
 	for (int c = 0; c < rr->rr_cols; c++) {
 		error = zio_worst_error(error, rr->rr_col[c].rc_error);
 		error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
 	}
 
 	return (error);
 }
 
 static void
 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
 {
 	int unexpected_errors = 0;
 	int parity_errors = 0;
 	int parity_untried = 0;
 	int data_errors = 0;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 
 	for (int c = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (rc->rc_error) {
 			if (c < rr->rr_firstdatacol)
 				parity_errors++;
 			else
 				data_errors++;
 
 			if (!rc->rc_skipped)
 				unexpected_errors++;
 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
 			parity_untried++;
 		}
 
 		if (rc->rc_force_repair)
 			unexpected_errors++;
 	}
 
 	/*
 	 * If we read more parity disks than were used for
 	 * reconstruction, confirm that the other parity disks produced
 	 * correct data.
 	 *
 	 * Note that we also regenerate parity when resilvering so we
 	 * can write it out to failed devices later.
 	 */
 	if (parity_errors + parity_untried <
 	    rr->rr_firstdatacol - data_errors ||
 	    (zio->io_flags & ZIO_FLAG_RESILVER)) {
 		int n = raidz_parity_verify(zio, rr);
 		unexpected_errors += n;
 	}
 
 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
 	    (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
 		/*
 		 * Use the good data we have in hand to repair damaged children.
 		 */
 		for (int c = 0; c < rr->rr_cols; c++) {
 			raidz_col_t *rc = &rr->rr_col[c];
 			vdev_t *vd = zio->io_vd;
 			vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 
 			if (!rc->rc_allow_repair) {
 				continue;
 			} else if (!rc->rc_force_repair &&
 			    (rc->rc_error == 0 || rc->rc_size == 0)) {
 				continue;
 			}
 
 			zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
 			    "offset=%llx",
 			    zio, c, rc->rc_devidx, (long long)rc->rc_offset);
 
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
 			    ZIO_TYPE_WRITE,
 			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
 			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
 		}
 	}
 
 	/*
 	 * Scrub or resilver i/o's: overwrite any shadow locations with the
 	 * good data.  This ensures that if we've already copied this sector,
 	 * it will be corrected if it was damaged.  This writes more than is
 	 * necessary, but since expansion is paused during scrub/resilver, at
 	 * most a single row will have a shadow location.
 	 */
 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
 	    (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
 		for (int c = 0; c < rr->rr_cols; c++) {
 			raidz_col_t *rc = &rr->rr_col[c];
 			vdev_t *vd = zio->io_vd;
 
 			if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
 				continue;
 			vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
 
 			/*
 			 * Note: We don't want to update the repair stats
 			 * because that would incorrectly indicate that there
 			 * was bad data to repair, which we aren't sure about.
 			 * By clearing the SCAN_THREAD flag, we prevent this
 			 * from happening, despite having the REPAIR flag set.
 			 * We need to set SELF_HEAL so that this i/o can't be
 			 * bypassed by zio_vdev_io_start().
 			 */
 			zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
 			    NULL, NULL);
 			cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
 			zio_nowait(cio);
 		}
 	}
 }
 
 static void
 raidz_restore_orig_data(raidz_map_t *rm)
 {
 	for (int i = 0; i < rm->rm_nrows; i++) {
 		raidz_row_t *rr = rm->rm_row[i];
 		for (int c = 0; c < rr->rr_cols; c++) {
 			raidz_col_t *rc = &rr->rr_col[c];
 			if (rc->rc_need_orig_restore) {
 				abd_copy(rc->rc_abd,
 				    rc->rc_orig_data, rc->rc_size);
 				rc->rc_need_orig_restore = B_FALSE;
 			}
 		}
 	}
 }
 
 /*
  * During raidz_reconstruct() for expanded VDEV, we need special consideration
  * failure simulations.  See note in raidz_reconstruct() on simulating failure
  * of a pre-expansion device.
  *
  * Treating logical child i as failed, return TRUE if the given column should
  * be treated as failed.  The idea of logical children allows us to imagine
  * that a disk silently failed before a RAIDZ expansion (reads from this disk
  * succeed but return the wrong data).  Since the expansion doesn't verify
  * checksums, the incorrect data will be moved to new locations spread among
  * the children (going diagonally across them).
  *
  * Higher "logical child failures" (values of `i`) indicate these
  * "pre-expansion failures".  The first physical_width values imagine that a
  * current child failed; the next physical_width-1 values imagine that a
  * child failed before the most recent expansion; the next physical_width-2
  * values imagine a child failed in the expansion before that, etc.
  */
 static boolean_t
 raidz_simulate_failure(int physical_width, int original_width, int ashift,
     int i, raidz_col_t *rc)
 {
 	uint64_t sector_id =
 	    physical_width * (rc->rc_offset >> ashift) +
 	    rc->rc_devidx;
 
 	for (int w = physical_width; w >= original_width; w--) {
 		if (i < w) {
 			return (sector_id % w == i);
 		} else {
 			i -= w;
 		}
 	}
 	ASSERT(!"invalid logical child id");
 	return (B_FALSE);
 }
 
 /*
  * returns EINVAL if reconstruction of the block will not be possible
  * returns ECKSUM if this specific reconstruction failed
  * returns 0 on successful reconstruction
  */
 static int
 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
 {
 	raidz_map_t *rm = zio->io_vsd;
 	int physical_width = zio->io_vd->vdev_children;
 	int original_width = (rm->rm_original_width != 0) ?
 	    rm->rm_original_width : physical_width;
 	int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
 
 	if (dbgmsg) {
 		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
 		    "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
 	}
 
 	/* Reconstruct each row */
 	for (int r = 0; r < rm->rm_nrows; r++) {
 		raidz_row_t *rr = rm->rm_row[r];
 		int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
 		int t = 0;
 		int dead = 0;
 		int dead_data = 0;
 
 		if (dbgmsg)
 			zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
 
 		for (int c = 0; c < rr->rr_cols; c++) {
 			raidz_col_t *rc = &rr->rr_col[c];
 			ASSERT0(rc->rc_need_orig_restore);
 			if (rc->rc_error != 0) {
 				dead++;
 				if (c >= nparity)
 					dead_data++;
 				continue;
 			}
 			if (rc->rc_size == 0)
 				continue;
 			for (int lt = 0; lt < ntgts; lt++) {
 				if (raidz_simulate_failure(physical_width,
 				    original_width,
 				    zio->io_vd->vdev_top->vdev_ashift,
 				    ltgts[lt], rc)) {
 					if (rc->rc_orig_data == NULL) {
 						rc->rc_orig_data =
 						    abd_alloc_linear(
 						    rc->rc_size, B_TRUE);
 						abd_copy(rc->rc_orig_data,
 						    rc->rc_abd, rc->rc_size);
 					}
 					rc->rc_need_orig_restore = B_TRUE;
 
 					dead++;
 					if (c >= nparity)
 						dead_data++;
 					/*
 					 * Note: simulating failure of a
 					 * pre-expansion device can hit more
 					 * than one column, in which case we
 					 * might try to simulate more failures
 					 * than can be reconstructed, which is
 					 * also more than the size of my_tgts.
 					 * This check prevents accessing past
 					 * the end of my_tgts.  The "dead >
 					 * nparity" check below will fail this
 					 * reconstruction attempt.
 					 */
 					if (t < VDEV_RAIDZ_MAXPARITY) {
 						my_tgts[t++] = c;
 						if (dbgmsg) {
 							zfs_dbgmsg("simulating "
 							    "failure of col %u "
 							    "devidx %u", c,
 							    (int)rc->rc_devidx);
 						}
 					}
 					break;
 				}
 			}
 		}
 		if (dead > nparity) {
 			/* reconstruction not possible */
 			if (dbgmsg) {
 				zfs_dbgmsg("reconstruction not possible; "
 				    "too many failures");
 			}
 			raidz_restore_orig_data(rm);
 			return (EINVAL);
 		}
 		if (dead_data > 0)
 			vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
 	}
 
 	/* Check for success */
 	if (raidz_checksum_verify(zio) == 0) {
 
 		/* Reconstruction succeeded - report errors */
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 
 			for (int c = 0; c < rr->rr_cols; c++) {
 				raidz_col_t *rc = &rr->rr_col[c];
 				if (rc->rc_need_orig_restore) {
 					/*
 					 * Note: if this is a parity column,
 					 * we don't really know if it's wrong.
 					 * We need to let
 					 * vdev_raidz_io_done_verified() check
 					 * it, and if we set rc_error, it will
 					 * think that it is a "known" error
 					 * that doesn't need to be checked
 					 * or corrected.
 					 */
 					if (rc->rc_error == 0 &&
 					    c >= rr->rr_firstdatacol) {
 						vdev_raidz_checksum_error(zio,
 						    rc, rc->rc_orig_data);
 						rc->rc_error =
 						    SET_ERROR(ECKSUM);
 					}
 					rc->rc_need_orig_restore = B_FALSE;
 				}
 			}
 
 			vdev_raidz_io_done_verified(zio, rr);
 		}
 
 		zio_checksum_verified(zio);
 
 		if (dbgmsg) {
 			zfs_dbgmsg("reconstruction successful "
 			    "(checksum verified)");
 		}
 		return (0);
 	}
 
 	/* Reconstruction failed - restore original data */
 	raidz_restore_orig_data(rm);
 	if (dbgmsg) {
 		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
 		    "failed", zio);
 	}
 	return (ECKSUM);
 }
 
 /*
  * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
  * Note that the algorithm below is non-optimal because it doesn't take into
  * account how reconstruction is actually performed. For example, with
  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
  * is targeted as invalid as if columns 1 and 4 are targeted since in both
  * cases we'd only use parity information in column 0.
  *
  * The order that we find the various possible combinations of failed
  * disks is dictated by these rules:
  * - Examine each "slot" (the "i" in tgts[i])
  *   - Try to increment this slot (tgts[i] += 1)
  *   - if we can't increment because it runs into the next slot,
  *     reset our slot to the minimum, and examine the next slot
  *
  *  For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
  *  3 columns to reconstruct), we will generate the following sequence:
  *
  *  STATE        ACTION
  *  0 1 2        special case: skip since these are all parity
  *  0 1   3      first slot: reset to 0; middle slot: increment to 2
  *  0   2 3      first slot: increment to 1
  *    1 2 3      first: reset to 0; middle: reset to 1; last: increment to 4
  *  0 1     4    first: reset to 0; middle: increment to 2
  *  0   2   4    first: increment to 1
  *    1 2   4    first: reset to 0; middle: increment to 3
  *  0     3 4    first: increment to 1
  *    1   3 4    first: increment to 2
  *      2 3 4    first: reset to 0; middle: reset to 1; last: increment to 5
  *  0 1       5  first: reset to 0; middle: increment to 2
  *  0   2     5  first: increment to 1
  *    1 2     5  first: reset to 0; middle: increment to 3
  *  0     3   5  first: increment to 1
  *    1   3   5  first: increment to 2
  *      2 3   5  first: reset to 0; middle: increment to 4
  *  0       4 5  first: increment to 1
  *    1     4 5  first: increment to 2
  *      2   4 5  first: increment to 3
  *        3 4 5  done
  *
  * This strategy works for dRAID but is less efficient when there are a large
  * number of child vdevs and therefore permutations to check. Furthermore,
  * since the raidz_map_t rows likely do not overlap, reconstruction would be
  * possible as long as there are no more than nparity data errors per row.
  * These additional permutations are not currently checked but could be as
  * a future improvement.
  *
  * Returns 0 on success, ECKSUM on failure.
  */
 static int
 vdev_raidz_combrec(zio_t *zio)
 {
 	int nparity = vdev_get_nparity(zio->io_vd);
 	raidz_map_t *rm = zio->io_vsd;
 	int physical_width = zio->io_vd->vdev_children;
 	int original_width = (rm->rm_original_width != 0) ?
 	    rm->rm_original_width : physical_width;
 
 	for (int i = 0; i < rm->rm_nrows; i++) {
 		raidz_row_t *rr = rm->rm_row[i];
 		int total_errors = 0;
 
 		for (int c = 0; c < rr->rr_cols; c++) {
 			if (rr->rr_col[c].rc_error)
 				total_errors++;
 		}
 
 		if (total_errors > nparity)
 			return (vdev_raidz_worst_error(rr));
 	}
 
 	for (int num_failures = 1; num_failures <= nparity; num_failures++) {
 		int tstore[VDEV_RAIDZ_MAXPARITY + 2];
 		int *ltgts = &tstore[1]; /* value is logical child ID */
 
 
 		/*
 		 * Determine number of logical children, n.  See comment
 		 * above raidz_simulate_failure().
 		 */
 		int n = 0;
 		for (int w = physical_width;
 		    w >= original_width; w--) {
 			n += w;
 		}
 
 		ASSERT3U(num_failures, <=, nparity);
 		ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
 
 		/* Handle corner cases in combrec logic */
 		ltgts[-1] = -1;
 		for (int i = 0; i < num_failures; i++) {
 			ltgts[i] = i;
 		}
 		ltgts[num_failures] = n;
 
 		for (;;) {
 			int err = raidz_reconstruct(zio, ltgts, num_failures,
 			    nparity);
 			if (err == EINVAL) {
 				/*
 				 * Reconstruction not possible with this #
 				 * failures; try more failures.
 				 */
 				break;
 			} else if (err == 0)
 				return (0);
 
 			/* Compute next targets to try */
 			for (int t = 0; ; t++) {
 				ASSERT3U(t, <, num_failures);
 				ltgts[t]++;
 				if (ltgts[t] == n) {
 					/* try more failures */
 					ASSERT3U(t, ==, num_failures - 1);
 					if (zfs_flags &
 					    ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
 						zfs_dbgmsg("reconstruction "
 						    "failed for num_failures="
 						    "%u; tried all "
 						    "combinations",
 						    num_failures);
 					}
 					break;
 				}
 
 				ASSERT3U(ltgts[t], <, n);
 				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
 
 				/*
 				 * If that spot is available, we're done here.
 				 * Try the next combination.
 				 */
 				if (ltgts[t] != ltgts[t + 1])
 					break; // found next combination
 
 				/*
 				 * Otherwise, reset this tgt to the minimum,
 				 * and move on to the next tgt.
 				 */
 				ltgts[t] = ltgts[t - 1] + 1;
 				ASSERT3U(ltgts[t], ==, t);
 			}
 
 			/* Increase the number of failures and keep trying. */
 			if (ltgts[num_failures - 1] == n)
 				break;
 		}
 	}
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
 		zfs_dbgmsg("reconstruction failed for all num_failures");
 	return (ECKSUM);
 }
 
 void
 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
 {
 	for (uint64_t row = 0; row < rm->rm_nrows; row++) {
 		raidz_row_t *rr = rm->rm_row[row];
 		vdev_raidz_reconstruct_row(rm, rr, t, nt);
 	}
 }
 
 /*
  * Complete a write IO operation on a RAIDZ VDev
  *
  * Outline:
  *   1. Check for errors on the child IOs.
  *   2. Return, setting an error code if too few child VDevs were written
  *      to reconstruct the data later.  Note that partial writes are
  *      considered successful if they can be reconstructed at all.
  */
 static void
 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
 {
 	int normal_errors = 0;
 	int shadow_errors = 0;
 
 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 
 	for (int c = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (rc->rc_error != 0) {
 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
 			normal_errors++;
 		}
 		if (rc->rc_shadow_error != 0) {
 			ASSERT(rc->rc_shadow_error != ECKSUM);
 			shadow_errors++;
 		}
 	}
 
 	/*
 	 * Treat partial writes as a success. If we couldn't write enough
 	 * columns to reconstruct the data, the I/O failed.  Otherwise, good
 	 * enough.  Note that in the case of a shadow write (during raidz
 	 * expansion), depending on if we crash, either the normal (old) or
 	 * shadow (new) location may become the "real" version of the block,
 	 * so both locations must have sufficient redundancy.
 	 *
 	 * Now that we support write reallocation, it would be better
 	 * to treat partial failure as real failure unless there are
 	 * no non-degraded top-level vdevs left, and not update DTLs
 	 * if we intend to reallocate.
 	 */
 	if (normal_errors > rr->rr_firstdatacol ||
 	    shadow_errors > rr->rr_firstdatacol) {
 		zio->io_error = zio_worst_error(zio->io_error,
 		    vdev_raidz_worst_error(rr));
 	}
 }
 
 static void
 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
     raidz_row_t *rr)
 {
 	int parity_errors = 0;
 	int parity_untried = 0;
 	int data_errors = 0;
 	int total_errors = 0;
 
 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
 
 	for (int c = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		/*
 		 * If scrubbing and a replacing/sparing child vdev determined
 		 * that not all of its children have an identical copy of the
 		 * data, then clear the error so the column is treated like
 		 * any other read and force a repair to correct the damage.
 		 */
 		if (rc->rc_error == ECKSUM) {
 			ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
 			vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
 			rc->rc_force_repair = 1;
 			rc->rc_error = 0;
 		}
 
 		if (rc->rc_error) {
 			if (c < rr->rr_firstdatacol)
 				parity_errors++;
 			else
 				data_errors++;
 
 			total_errors++;
 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
 			parity_untried++;
 		}
 	}
 
 	/*
 	 * If there were data errors and the number of errors we saw was
 	 * correctable -- less than or equal to the number of parity disks read
 	 * -- reconstruct based on the missing data.
 	 */
 	if (data_errors != 0 &&
 	    total_errors <= rr->rr_firstdatacol - parity_untried) {
 		/*
 		 * We either attempt to read all the parity columns or
 		 * none of them. If we didn't try to read parity, we
 		 * wouldn't be here in the correctable case. There must
 		 * also have been fewer parity errors than parity
 		 * columns or, again, we wouldn't be in this code path.
 		 */
 		ASSERT(parity_untried == 0);
 		ASSERT(parity_errors < rr->rr_firstdatacol);
 
 		/*
 		 * Identify the data columns that reported an error.
 		 */
 		int n = 0;
 		int tgts[VDEV_RAIDZ_MAXPARITY];
 		for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 			raidz_col_t *rc = &rr->rr_col[c];
 			if (rc->rc_error != 0) {
 				ASSERT(n < VDEV_RAIDZ_MAXPARITY);
 				tgts[n++] = c;
 			}
 		}
 
 		ASSERT(rr->rr_firstdatacol >= n);
 
 		vdev_raidz_reconstruct_row(rm, rr, tgts, n);
 	}
 }
 
 /*
  * Return the number of reads issued.
  */
 static int
 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
 {
 	vdev_t *vd = zio->io_vd;
 	int nread = 0;
 
 	rr->rr_missingdata = 0;
 	rr->rr_missingparity = 0;
 
 	/*
 	 * If this rows contains empty sectors which are not required
 	 * for a normal read then allocate an ABD for them now so they
 	 * may be read, verified, and any needed repairs performed.
 	 */
 	if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
 		vdev_draid_map_alloc_empty(zio, rr);
 
 	for (int c = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		if (rc->rc_tried || rc->rc_size == 0)
 			continue;
 
 		zio_nowait(zio_vdev_child_io(zio, NULL,
 		    vd->vdev_child[rc->rc_devidx],
 		    rc->rc_offset, rc->rc_abd, rc->rc_size,
 		    zio->io_type, zio->io_priority, 0,
 		    vdev_raidz_child_done, rc));
 		nread++;
 	}
 	return (nread);
 }
 
 /*
  * We're here because either there were too many errors to even attempt
  * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
  * failed. In either case, there is enough bad data to prevent reconstruction.
  * Start checksum ereports for all children which haven't failed.
  */
 static void
 vdev_raidz_io_done_unrecoverable(zio_t *zio)
 {
 	raidz_map_t *rm = zio->io_vsd;
 
 	for (int i = 0; i < rm->rm_nrows; i++) {
 		raidz_row_t *rr = rm->rm_row[i];
 
 		for (int c = 0; c < rr->rr_cols; c++) {
 			raidz_col_t *rc = &rr->rr_col[c];
 			vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
 
 			if (rc->rc_error != 0)
 				continue;
 
 			zio_bad_cksum_t zbc;
 			zbc.zbc_has_cksum = 0;
 			zbc.zbc_injected = rm->rm_ecksuminjected;
 
 			mutex_enter(&cvd->vdev_stat_lock);
 			cvd->vdev_stat.vs_checksum_errors++;
 			mutex_exit(&cvd->vdev_stat_lock);
 			(void) zfs_ereport_start_checksum(zio->io_spa,
 			    cvd, &zio->io_bookmark, zio, rc->rc_offset,
 			    rc->rc_size, &zbc);
 		}
 	}
 }
 
 void
 vdev_raidz_io_done(zio_t *zio)
 {
 	raidz_map_t *rm = zio->io_vsd;
 
 	ASSERT(zio->io_bp != NULL);
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
 		}
 	} else {
 		if (rm->rm_phys_col) {
 			/*
 			 * This is an aggregated read.  Copy the data and status
 			 * from the aggregate abd's to the individual rows.
 			 */
 			for (int i = 0; i < rm->rm_nrows; i++) {
 				raidz_row_t *rr = rm->rm_row[i];
 
 				for (int c = 0; c < rr->rr_cols; c++) {
 					raidz_col_t *rc = &rr->rr_col[c];
 					if (rc->rc_tried || rc->rc_size == 0)
 						continue;
 
 					raidz_col_t *prc =
 					    &rm->rm_phys_col[rc->rc_devidx];
 					rc->rc_error = prc->rc_error;
 					rc->rc_tried = prc->rc_tried;
 					rc->rc_skipped = prc->rc_skipped;
 					if (c >= rr->rr_firstdatacol) {
 						/*
 						 * Note: this is slightly faster
 						 * than using abd_copy_off().
 						 */
 						char *physbuf = abd_to_buf(
 						    prc->rc_abd);
 						void *physloc = physbuf +
 						    rc->rc_offset -
 						    prc->rc_offset;
 
 						abd_copy_from_buf(rc->rc_abd,
 						    physloc, rc->rc_size);
 					}
 				}
 			}
 		}
 
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 			vdev_raidz_io_done_reconstruct_known_missing(zio,
 			    rm, rr);
 		}
 
 		if (raidz_checksum_verify(zio) == 0) {
 			for (int i = 0; i < rm->rm_nrows; i++) {
 				raidz_row_t *rr = rm->rm_row[i];
 				vdev_raidz_io_done_verified(zio, rr);
 			}
 			zio_checksum_verified(zio);
 		} else {
 			/*
 			 * A sequential resilver has no checksum which makes
 			 * combinatoral reconstruction impossible. This code
 			 * path is unreachable since raidz_checksum_verify()
 			 * has no checksum to verify and must succeed.
 			 */
 			ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
 
 			/*
 			 * This isn't a typical situation -- either we got a
 			 * read error or a child silently returned bad data.
 			 * Read every block so we can try again with as much
 			 * data and parity as we can track down. If we've
 			 * already been through once before, all children will
 			 * be marked as tried so we'll proceed to combinatorial
 			 * reconstruction.
 			 */
 			int nread = 0;
 			for (int i = 0; i < rm->rm_nrows; i++) {
 				nread += vdev_raidz_read_all(zio,
 				    rm->rm_row[i]);
 			}
 			if (nread != 0) {
 				/*
 				 * Normally our stage is VDEV_IO_DONE, but if
 				 * we've already called redone(), it will have
 				 * changed to VDEV_IO_START, in which case we
 				 * don't want to call redone() again.
 				 */
 				if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
 					zio_vdev_io_redone(zio);
 				return;
 			}
 			/*
 			 * It would be too expensive to try every possible
 			 * combination of failed sectors in every row, so
 			 * instead we try every combination of failed current or
 			 * past physical disk. This means that if the incorrect
 			 * sectors were all on Nparity disks at any point in the
 			 * past, we will find the correct data.  The only known
 			 * case where this is less durable than a non-expanded
 			 * RAIDZ, is if we have a silent failure during
 			 * expansion.  In that case, one block could be
 			 * partially in the old format and partially in the
 			 * new format, so we'd lost some sectors from the old
 			 * format and some from the new format.
 			 *
 			 * e.g. logical_width=4 physical_width=6
 			 * the 15 (6+5+4) possible failed disks are:
 			 * width=6 child=0
 			 * width=6 child=1
 			 * width=6 child=2
 			 * width=6 child=3
 			 * width=6 child=4
 			 * width=6 child=5
 			 * width=5 child=0
 			 * width=5 child=1
 			 * width=5 child=2
 			 * width=5 child=3
 			 * width=5 child=4
 			 * width=4 child=0
 			 * width=4 child=1
 			 * width=4 child=2
 			 * width=4 child=3
 			 * And we will try every combination of Nparity of these
 			 * failing.
 			 *
 			 * As a first pass, we can generate every combo,
 			 * and try reconstructing, ignoring any known
 			 * failures.  If any row has too many known + simulated
 			 * failures, then we bail on reconstructing with this
 			 * number of simulated failures.  As an improvement,
 			 * we could detect the number of whole known failures
 			 * (i.e. we have known failures on these disks for
 			 * every row; the disks never succeeded), and
 			 * subtract that from the max # failures to simulate.
 			 * We could go even further like the current
 			 * combrec code, but that doesn't seem like it
 			 * gains us very much.  If we simulate a failure
 			 * that is also a known failure, that's fine.
 			 */
 			zio->io_error = vdev_raidz_combrec(zio);
 			if (zio->io_error == ECKSUM &&
 			    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
 				vdev_raidz_io_done_unrecoverable(zio);
 			}
 		}
 	}
 	if (rm->rm_lr != NULL) {
 		zfs_rangelock_exit(rm->rm_lr);
 		rm->rm_lr = NULL;
 	}
 }
 
 static void
 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	if (faulted > vdrz->vd_nparity)
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_NO_REPLICAS);
 	else if (degraded + faulted != 0)
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
 	else
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
 }
 
 /*
  * Determine if any portion of the provided block resides on a child vdev
  * with a dirty DTL and therefore needs to be resilvered.  The function
  * assumes that at least one DTL is dirty which implies that full stripe
  * width blocks must be resilvered.
  */
 static boolean_t
 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
     uint64_t phys_birth)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 
 	/*
 	 * If we're in the middle of a RAIDZ expansion, this block may be in
 	 * the old and/or new location.  For simplicity, always resilver it.
 	 */
 	if (vdrz->vn_vre.vre_state == DSS_SCANNING)
 		return (B_TRUE);
 
 	uint64_t dcols = vd->vdev_children;
 	uint64_t nparity = vdrz->vd_nparity;
 	uint64_t ashift = vd->vdev_top->vdev_ashift;
 	/* The starting RAIDZ (parent) vdev sector of the block. */
 	uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
 	/* The zio's size in units of the vdev's minimum sector size. */
 	uint64_t s = ((psize - 1) >> ashift) + 1;
 	/* The first column for this stripe. */
 	uint64_t f = b % dcols;
 
 	/* Unreachable by sequential resilver. */
 	ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
 
 	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
 		return (B_FALSE);
 
 	if (s + nparity >= dcols)
 		return (B_TRUE);
 
 	for (uint64_t c = 0; c < s + nparity; c++) {
 		uint64_t devidx = (f + c) % dcols;
 		vdev_t *cvd = vd->vdev_child[devidx];
 
 		/*
 		 * dsl_scan_need_resilver() already checked vd with
 		 * vdev_dtl_contains(). So here just check cvd with
 		 * vdev_dtl_empty(), cheaper and a good approximation.
 		 */
 		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 static void
 vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
     range_seg64_t *physical_rs, range_seg64_t *remain_rs)
 {
 	(void) remain_rs;
 
 	vdev_t *raidvd = cvd->vdev_parent;
 	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
 
 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
 
 	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
 		/*
 		 * We're in the middle of expansion, in which case the
 		 * translation is in flux.  Any answer we give may be wrong
 		 * by the time we return, so it isn't safe for the caller to
 		 * act on it.  Therefore we say that this range isn't present
 		 * on any children.  The only consumers of this are "zpool
 		 * initialize" and trimming, both of which are "best effort"
 		 * anyway.
 		 */
 		physical_rs->rs_start = physical_rs->rs_end = 0;
 		remain_rs->rs_start = remain_rs->rs_end = 0;
 		return;
 	}
 
 	uint64_t width = vdrz->vd_physical_width;
 	uint64_t tgt_col = cvd->vdev_id;
 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
 
 	/* make sure the offsets are block-aligned */
 	ASSERT0(logical_rs->rs_start % (1 << ashift));
 	ASSERT0(logical_rs->rs_end % (1 << ashift));
 	uint64_t b_start = logical_rs->rs_start >> ashift;
 	uint64_t b_end = logical_rs->rs_end >> ashift;
 
 	uint64_t start_row = 0;
 	if (b_start > tgt_col) /* avoid underflow */
 		start_row = ((b_start - tgt_col - 1) / width) + 1;
 
 	uint64_t end_row = 0;
 	if (b_end > tgt_col)
 		end_row = ((b_end - tgt_col - 1) / width) + 1;
 
 	physical_rs->rs_start = start_row << ashift;
 	physical_rs->rs_end = end_row << ashift;
 
 	ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
 	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
 	    logical_rs->rs_end - logical_rs->rs_start);
 }
 
 static void
 raidz_reflow_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = arg;
 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 
 	/*
 	 * Ensure there are no i/os to the range that is being committed.
 	 */
 	uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
 	ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);
 
 	mutex_enter(&vre->vre_lock);
 	uint64_t new_offset =
 	    MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
 	/*
 	 * We should not have committed anything that failed.
 	 */
 	VERIFY3U(vre->vre_failed_offset, >=, old_offset);
 	mutex_exit(&vre->vre_lock);
 
 	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
 	    old_offset, new_offset - old_offset,
 	    RL_WRITER);
 
 	/*
 	 * Update the uberblock that will be written when this txg completes.
 	 */
 	RAIDZ_REFLOW_SET(&spa->spa_uberblock,
 	    RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
 	vre->vre_offset_pertxg[txgoff] = 0;
 	zfs_rangelock_exit(lr);
 
 	mutex_enter(&vre->vre_lock);
 	vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
 	vre->vre_bytes_copied_pertxg[txgoff] = 0;
 	mutex_exit(&vre->vre_lock);
 
 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	VERIFY0(zap_update(spa->spa_meta_objset,
 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
 	    sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
 }
 
 static void
 raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = arg;
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
 
 	for (int i = 0; i < TXG_SIZE; i++)
 		VERIFY0(vre->vre_offset_pertxg[i]);
 
 	reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
 	re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
 	re->re_logical_width = vdrz->vd_physical_width;
 	mutex_enter(&vdrz->vd_expand_lock);
 	avl_add(&vdrz->vd_expand_txgs, re);
 	mutex_exit(&vdrz->vd_expand_lock);
 
 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
 
 	/*
 	 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
 	 * will get written (based on vd_expand_txgs).
 	 */
 	vdev_config_dirty(vd);
 
 	/*
 	 * Before we change vre_state, the on-disk state must reflect that we
 	 * have completed all copying, so that vdev_raidz_io_start() can use
 	 * vre_state to determine if the reflow is in progress.  See also the
 	 * end of spa_raidz_expand_thread().
 	 */
 	VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
 	    raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
 
 	vre->vre_end_time = gethrestime_sec();
 	vre->vre_state = DSS_FINISHED;
 
 	uint64_t state = vre->vre_state;
 	VERIFY0(zap_update(spa->spa_meta_objset,
 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
 	    sizeof (state), 1, &state, tx));
 
 	uint64_t end_time = vre->vre_end_time;
 	VERIFY0(zap_update(spa->spa_meta_objset,
 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
 	    sizeof (end_time), 1, &end_time, tx));
 
 	spa->spa_uberblock.ub_raidz_reflow_info = 0;
 
 	spa_history_log_internal(spa, "raidz vdev expansion completed",  tx,
 	    "%s vdev %llu new width %llu", spa_name(spa),
 	    (unsigned long long)vd->vdev_id,
 	    (unsigned long long)vd->vdev_children);
 
 	spa->spa_raidz_expand = NULL;
 	raidvd->vdev_rz_expanding = B_FALSE;
 
 	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
 	spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
 	spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
 
 	spa_notify_waiters(spa);
 
 	/*
 	 * While we're in syncing context take the opportunity to
 	 * setup a scrub. All the data has been sucessfully copied
 	 * but we have not validated any checksums.
 	 */
 	pool_scan_func_t func = POOL_SCAN_SCRUB;
 	if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0)
 		dsl_scan_setup_sync(&func, tx);
 }
 
 /*
  * Struct for one copy zio.
  */
 typedef struct raidz_reflow_arg {
 	vdev_raidz_expand_t *rra_vre;
 	zfs_locked_range_t *rra_lr;
 	uint64_t rra_txg;
 } raidz_reflow_arg_t;
 
 /*
  * The write of the new location is done.
  */
 static void
 raidz_reflow_write_done(zio_t *zio)
 {
 	raidz_reflow_arg_t *rra = zio->io_private;
 	vdev_raidz_expand_t *vre = rra->rra_vre;
 
 	abd_free(zio->io_abd);
 
 	mutex_enter(&vre->vre_lock);
 	if (zio->io_error != 0) {
 		/* Force a reflow pause on errors */
 		vre->vre_failed_offset =
 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
 	}
 	ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
 	vre->vre_outstanding_bytes -= zio->io_size;
 	if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
 	    vre->vre_failed_offset) {
 		vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
 		    zio->io_size;
 	}
 	cv_signal(&vre->vre_cv);
 	mutex_exit(&vre->vre_lock);
 
 	zfs_rangelock_exit(rra->rra_lr);
 
 	kmem_free(rra, sizeof (*rra));
 	spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
 }
 
 /*
  * The read of the old location is done.  The parent zio is the write to
  * the new location.  Allow it to start.
  */
 static void
 raidz_reflow_read_done(zio_t *zio)
 {
 	raidz_reflow_arg_t *rra = zio->io_private;
 	vdev_raidz_expand_t *vre = rra->rra_vre;
 
 	/*
 	 * If the read failed, or if it was done on a vdev that is not fully
 	 * healthy (e.g. a child that has a resilver in progress), we may not
 	 * have the correct data.  Note that it's OK if the write proceeds.
 	 * It may write garbage but the location is otherwise unused and we
 	 * will retry later due to vre_failed_offset.
 	 */
 	if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
 		zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
 		    "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
 		    (long long)rra->rra_lr->lr_offset,
 		    (long long)rra->rra_lr->lr_length,
 		    (long long)rra->rra_txg,
 		    zio->io_error,
 		    vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
 		    vdev_dtl_empty(zio->io_vd, DTL_MISSING));
 		mutex_enter(&vre->vre_lock);
 		/* Force a reflow pause on errors */
 		vre->vre_failed_offset =
 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
 		mutex_exit(&vre->vre_lock);
 	}
 
 	zio_nowait(zio_unique_parent(zio));
 }
 
 static void
 raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
     dmu_tx_t *tx)
 {
 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 
 	if (offset == 0)
 		return;
 
 	mutex_enter(&vre->vre_lock);
 	ASSERT3U(vre->vre_offset, <=, offset);
 	vre->vre_offset = offset;
 	mutex_exit(&vre->vre_lock);
 
 	if (vre->vre_offset_pertxg[txgoff] == 0) {
 		dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
 		    spa, tx);
 	}
 	vre->vre_offset_pertxg[txgoff] = offset;
 }
 
 static boolean_t
 vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
 {
 	for (int i = 0; i < raidz_vd->vdev_children; i++) {
 		/* Quick check if a child is being replaced */
 		if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
 			return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 static boolean_t
 raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
     dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	int ashift = vd->vdev_top->vdev_ashift;
 	uint64_t offset, size;
 
 	if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize,
 	    &offset, &size)) {
 		return (B_FALSE);
 	}
 	ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
 	ASSERT3U(size, >=, 1 << ashift);
 	uint64_t length = 1 << ashift;
 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 
 	uint64_t blkid = offset >> ashift;
 
 	int old_children = vd->vdev_children - 1;
 
 	/*
 	 * We can only progress to the point that writes will not overlap
 	 * with blocks whose progress has not yet been recorded on disk.
 	 * Since partially-copied rows are still read from the old location,
 	 * we need to stop one row before the sector-wise overlap, to prevent
 	 * row-wise overlap.
 	 *
 	 * Note that even if we are skipping over a large unallocated region,
 	 * we can't move the on-disk progress to `offset`, because concurrent
 	 * writes/allocations could still use the currently-unallocated
 	 * region.
 	 */
 	uint64_t ubsync_blkid =
 	    RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
 	uint64_t next_overwrite_blkid = ubsync_blkid +
 	    ubsync_blkid / old_children - old_children;
 	VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
 
 	if (blkid >= next_overwrite_blkid) {
 		raidz_reflow_record_progress(vre,
 		    next_overwrite_blkid << ashift, tx);
 		return (B_TRUE);
 	}
 
 	range_tree_remove(rt, offset, length);
 
 	raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP);
 	rra->rra_vre = vre;
 	rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
 	    offset, length, RL_WRITER);
 	rra->rra_txg = dmu_tx_get_txg(tx);
 
 	raidz_reflow_record_progress(vre, offset + length, tx);
 
 	mutex_enter(&vre->vre_lock);
 	vre->vre_outstanding_bytes += length;
 	mutex_exit(&vre->vre_lock);
 
 	/*
 	 * SCL_STATE will be released when the read and write are done,
 	 * by raidz_reflow_write_done().
 	 */
 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
 
 	/* check if a replacing vdev was added, if so treat it as an error */
 	if (vdev_raidz_expand_child_replacing(vd)) {
 		zfs_dbgmsg("replacing vdev encountered, reflow paused at "
 		    "offset=%llu txg=%llu",
 		    (long long)rra->rra_lr->lr_offset,
 		    (long long)rra->rra_txg);
 
 		mutex_enter(&vre->vre_lock);
 		vre->vre_failed_offset =
 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
 		cv_signal(&vre->vre_cv);
 		mutex_exit(&vre->vre_lock);
 
 		/* drop everything we acquired */
 		zfs_rangelock_exit(rra->rra_lr);
 		kmem_free(rra, sizeof (*rra));
 		spa_config_exit(spa, SCL_STATE, spa);
 		return (B_TRUE);
 	}
 
 	zio_t *pio = spa->spa_txg_zio[txgoff];
 	abd_t *abd = abd_alloc_for_io(length, B_FALSE);
 	zio_t *write_zio = zio_vdev_child_io(pio, NULL,
 	    vd->vdev_child[blkid % vd->vdev_children],
 	    (blkid / vd->vdev_children) << ashift,
 	    abd, length,
 	    ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
 	    ZIO_FLAG_CANFAIL,
 	    raidz_reflow_write_done, rra);
 
 	zio_nowait(zio_vdev_child_io(write_zio, NULL,
 	    vd->vdev_child[blkid % old_children],
 	    (blkid / old_children) << ashift,
 	    abd, length,
 	    ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
 	    ZIO_FLAG_CANFAIL,
 	    raidz_reflow_read_done, rra));
 
 	return (B_FALSE);
 }
 
 /*
  * For testing (ztest specific)
  */
 static void
 raidz_expand_pause(uint_t pause_point)
 {
 	while (raidz_expand_pause_point != 0 &&
 	    raidz_expand_pause_point <= pause_point)
 		delay(hz);
 }
 
 static void
 raidz_scratch_child_done(zio_t *zio)
 {
 	zio_t *pio = zio->io_private;
 
 	mutex_enter(&pio->io_lock);
 	pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
 	mutex_exit(&pio->io_lock);
 }
 
 /*
  * Reflow the beginning portion of the vdev into an intermediate scratch area
  * in memory and on disk. This operation must be persisted on disk before we
  * proceed to overwrite the beginning portion with the reflowed data.
  *
  * This multi-step task can fail to complete if disk errors are encountered
  * and we can return here after a pause (waiting for disk to become healthy).
  */
 static void
 raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
 {
 	vdev_raidz_expand_t *vre = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	zio_t *pio;
 	int error;
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	int ashift = raidvd->vdev_ashift;
 	uint64_t write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << ashift);
 	uint64_t logical_size = write_size * raidvd->vdev_children;
 	uint64_t read_size =
 	    P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
 	    1 << ashift);
 
 	/*
 	 * The scratch space must be large enough to get us to the point
 	 * that one row does not overlap itself when moved.  This is checked
 	 * by vdev_raidz_attach_check().
 	 */
 	VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
 	VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
 	VERIFY3U(write_size, <=, read_size);
 
 	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
 	    0, logical_size, RL_WRITER);
 
 	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
 	    KM_SLEEP);
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		abds[i] = abd_alloc_linear(read_size, B_FALSE);
 	}
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);
 
 	/*
 	 * If we have already written the scratch area then we must read from
 	 * there, since new writes were redirected there while we were paused
 	 * or the original location may have been partially overwritten with
 	 * reflowed data.
 	 */
 	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
 		VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
 		/*
 		 * Read from scratch space.
 		 */
 		pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 		for (int i = 0; i < raidvd->vdev_children; i++) {
 			/*
 			 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
 			 * to the offset to calculate the physical offset to
 			 * write to.  Passing in a negative offset makes us
 			 * access the scratch area.
 			 */
 			zio_nowait(zio_vdev_child_io(pio, NULL,
 			    raidvd->vdev_child[i],
 			    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
 			    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ,
 			    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
 		}
 		error = zio_wait(pio);
 		if (error != 0) {
 			zfs_dbgmsg("reflow: error %d reading scratch location",
 			    error);
 			goto io_error_exit;
 		}
 		goto overwrite;
 	}
 
 	/*
 	 * Read from original location.
 	 */
 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	for (int i = 0; i < raidvd->vdev_children - 1; i++) {
 		ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    0, abds[i], read_size, ZIO_TYPE_READ,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
 		    raidz_scratch_child_done, pio));
 	}
 	error = zio_wait(pio);
 	if (error != 0) {
 		zfs_dbgmsg("reflow: error %d reading original location", error);
 io_error_exit:
 		for (int i = 0; i < raidvd->vdev_children; i++)
 			abd_free(abds[i]);
 		kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
 		zfs_rangelock_exit(lr);
 		spa_config_exit(spa, SCL_STATE, FTAG);
 		return;
 	}
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
 
 	/*
 	 * Reflow in memory.
 	 */
 	uint64_t logical_sectors = logical_size >> ashift;
 	for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
 		int oldchild = i % (raidvd->vdev_children - 1);
 		uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;
 
 		int newchild = i % raidvd->vdev_children;
 		uint64_t newoff = (i / raidvd->vdev_children) << ashift;
 
 		/* a single sector should not be copying over itself */
 		ASSERT(!(newchild == oldchild && newoff == oldoff));
 
 		abd_copy_off(abds[newchild], abds[oldchild],
 		    newoff, oldoff, 1 << ashift);
 	}
 
 	/*
 	 * Verify that we filled in everything we intended to (write_size on
 	 * each child).
 	 */
 	VERIFY0(logical_sectors % raidvd->vdev_children);
 	VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
 	    write_size);
 
 	/*
 	 * Write to scratch location (boot area).
 	 */
 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		/*
 		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
 		 * the offset to calculate the physical offset to write to.
 		 * Passing in a negative offset lets us access the boot area.
 		 */
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
 		    write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
 		    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
 	}
 	error = zio_wait(pio);
 	if (error != 0) {
 		zfs_dbgmsg("reflow: error %d writing scratch location", error);
 		goto io_error_exit;
 	}
 	pio = zio_root(spa, NULL, NULL, 0);
 	zio_flush(pio, raidvd);
 	zio_wait(pio);
 
 	zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
 	    (long long)logical_size);
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
 
 	/*
 	 * Update uberblock to indicate that scratch space is valid.  This is
 	 * needed because after this point, the real location may be
 	 * overwritten.  If we crash, we need to get the data from the
 	 * scratch space, rather than the real location.
 	 *
 	 * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
 	 * will prefer this uberblock.
 	 */
 	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
 	spa->spa_ubsync.ub_timestamp++;
 	ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
 	if (spa_multihost(spa))
 		mmp_update_uberblock(spa, &spa->spa_ubsync);
 
 	zfs_dbgmsg("reflow: uberblock updated "
 	    "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
 	    (long long)spa->spa_ubsync.ub_txg,
 	    (long long)logical_size,
 	    (long long)spa->spa_ubsync.ub_timestamp);
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
 
 	/*
 	 * Overwrite with reflow'ed data.
 	 */
 overwrite:
 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    0, abds[i], write_size, ZIO_TYPE_WRITE,
 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL,
 		    raidz_scratch_child_done, pio));
 	}
 	error = zio_wait(pio);
 	if (error != 0) {
 		/*
 		 * When we exit early here and drop the range lock, new
 		 * writes will go into the scratch area so we'll need to
 		 * read from there when we return after pausing.
 		 */
 		zfs_dbgmsg("reflow: error %d writing real location", error);
 		/*
 		 * Update the uberblock that is written when this txg completes.
 		 */
 		RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
 		    logical_size);
 		goto io_error_exit;
 	}
 	pio = zio_root(spa, NULL, NULL, 0);
 	zio_flush(pio, raidvd);
 	zio_wait(pio);
 
 	zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
 	    (long long)logical_size);
 	for (int i = 0; i < raidvd->vdev_children; i++)
 		abd_free(abds[i]);
 	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);
 
 	/*
 	 * Update uberblock to indicate that the initial part has been
 	 * reflow'ed.  This is needed because after this point (when we exit
 	 * the rangelock), we allow regular writes to this region, which will
 	 * be written to the new location only (because reflow_offset_next ==
 	 * reflow_offset_synced).  If we crashed and re-copied from the
 	 * scratch space, we would lose the regular writes.
 	 */
 	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
 	    logical_size);
 	spa->spa_ubsync.ub_timestamp++;
 	ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
 	if (spa_multihost(spa))
 		mmp_update_uberblock(spa, &spa->spa_ubsync);
 
 	zfs_dbgmsg("reflow: uberblock updated "
 	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
 	    (long long)spa->spa_ubsync.ub_txg,
 	    (long long)logical_size,
 	    (long long)spa->spa_ubsync.ub_timestamp);
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);
 
 	/*
 	 * Update progress.
 	 */
 	vre->vre_offset = logical_size;
 	zfs_rangelock_exit(lr);
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
 	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
 	/*
 	 * Note - raidz_reflow_sync() will update the uberblock state to
 	 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
 	 */
 	raidz_reflow_sync(spa, tx);
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
 }
 
 /*
  * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
  * here.  No other i/o can be in progress, so we don't need the vre_rangelock.
  */
 void
 vdev_raidz_reflow_copy_scratch(spa_t *spa)
 {
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 	uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
 	ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	ASSERT0(logical_size % raidvd->vdev_children);
 	uint64_t write_size = logical_size / raidvd->vdev_children;
 
 	zio_t *pio;
 
 	/*
 	 * Read from scratch space.
 	 */
 	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
 	    KM_SLEEP);
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		abds[i] = abd_alloc_linear(write_size, B_FALSE);
 	}
 
 	pio = zio_root(spa, NULL, NULL, 0);
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		/*
 		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
 		 * the offset to calculate the physical offset to write to.
 		 * Passing in a negative offset lets us access the boot area.
 		 */
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
 		    write_size, ZIO_TYPE_READ,
 		    ZIO_PRIORITY_ASYNC_READ, 0,
 		    raidz_scratch_child_done, pio));
 	}
 	zio_wait(pio);
 
 	/*
 	 * Overwrite real location with reflow'ed data.
 	 */
 	pio = zio_root(spa, NULL, NULL, 0);
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    0, abds[i], write_size, ZIO_TYPE_WRITE,
 		    ZIO_PRIORITY_ASYNC_WRITE, 0,
 		    raidz_scratch_child_done, pio));
 	}
 	zio_wait(pio);
 	pio = zio_root(spa, NULL, NULL, 0);
 	zio_flush(pio, raidvd);
 	zio_wait(pio);
 
 	zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
 	    "to real location", (long long)logical_size);
 
 	for (int i = 0; i < raidvd->vdev_children; i++)
 		abd_free(abds[i]);
 	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
 
 	/*
 	 * Update uberblock.
 	 */
 	RAIDZ_REFLOW_SET(&spa->spa_ubsync,
 	    RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
 	spa->spa_ubsync.ub_timestamp++;
 	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
 	if (spa_multihost(spa))
 		mmp_update_uberblock(spa, &spa->spa_ubsync);
 
 	zfs_dbgmsg("reflow recovery: uberblock updated "
 	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
 	    (long long)spa->spa_ubsync.ub_txg,
 	    (long long)logical_size,
 	    (long long)spa->spa_ubsync.ub_timestamp);
 
 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
 	    spa_first_txg(spa));
 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 	vre->vre_offset = logical_size;
 	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
 	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
 	/*
 	 * Note that raidz_reflow_sync() will update the uberblock once more
 	 */
 	raidz_reflow_sync(spa, tx);
 
 	dmu_tx_commit(tx);
 
 	spa_config_exit(spa, SCL_STATE, FTAG);
 }
 
 static boolean_t
 spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
 {
 	(void) zthr;
 	spa_t *spa = arg;
 
 	return (spa->spa_raidz_expand != NULL &&
 	    !spa->spa_raidz_expand->vre_waiting_for_resilver);
 }
 
 /*
  * RAIDZ expansion background thread
  *
  * Can be called multiple times if the reflow is paused
  */
 static void
 spa_raidz_expand_thread(void *arg, zthr_t *zthr)
 {
 	spa_t *spa = arg;
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 
 	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
 		vre->vre_offset = 0;
 	else
 		vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);
 
 	/* Reflow the begining portion using the scratch area */
 	if (vre->vre_offset == 0) {
 		VERIFY0(dsl_sync_task(spa_name(spa),
 		    NULL, raidz_reflow_scratch_sync,
 		    vre, 0, ZFS_SPACE_CHECK_NONE));
 
 		/* if we encountered errors then pause */
 		if (vre->vre_offset == 0) {
 			mutex_enter(&vre->vre_lock);
 			vre->vre_waiting_for_resilver = B_TRUE;
 			mutex_exit(&vre->vre_lock);
 			return;
 		}
 	}
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 
 	uint64_t guid = raidvd->vdev_guid;
 
 	/* Iterate over all the remaining metaslabs */
 	for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
 	    i < raidvd->vdev_ms_count &&
 	    !zthr_iscancelled(zthr) &&
 	    vre->vre_failed_offset == UINT64_MAX; i++) {
 		metaslab_t *msp = raidvd->vdev_ms[i];
 
 		metaslab_disable(msp);
 		mutex_enter(&msp->ms_lock);
 
 		/*
 		 * The metaslab may be newly created (for the expanded
 		 * space), in which case its trees won't exist yet,
 		 * so we need to bail out early.
 		 */
 		if (msp->ms_new) {
 			mutex_exit(&msp->ms_lock);
 			metaslab_enable(msp, B_FALSE, B_FALSE);
 			continue;
 		}
 
 		VERIFY0(metaslab_load(msp));
 
 		/*
 		 * We want to copy everything except the free (allocatable)
 		 * space.  Note that there may be a little bit more free
 		 * space (e.g. in ms_defer), and it's fine to copy that too.
 		 */
 		range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64,
 		    NULL, 0, 0);
 		range_tree_add(rt, msp->ms_start, msp->ms_size);
 		range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
 		mutex_exit(&msp->ms_lock);
 
 		/*
 		 * Force the last sector of each metaslab to be copied.  This
 		 * ensures that we advance the on-disk progress to the end of
 		 * this metaslab while the metaslab is disabled.  Otherwise, we
 		 * could move past this metaslab without advancing the on-disk
 		 * progress, and then an allocation to this metaslab would not
 		 * be copied.
 		 */
 		int sectorsz = 1 << raidvd->vdev_ashift;
 		uint64_t ms_last_offset = msp->ms_start +
 		    msp->ms_size - sectorsz;
 		if (!range_tree_contains(rt, ms_last_offset, sectorsz)) {
 			range_tree_add(rt, ms_last_offset, sectorsz);
 		}
 
 		/*
 		 * When we are resuming from a paused expansion (i.e.
 		 * when importing a pool with a expansion in progress),
 		 * discard any state that we have already processed.
 		 */
 		range_tree_clear(rt, 0, vre->vre_offset);
 
 		while (!zthr_iscancelled(zthr) &&
 		    !range_tree_is_empty(rt) &&
 		    vre->vre_failed_offset == UINT64_MAX) {
 
 			/*
 			 * We need to periodically drop the config lock so that
 			 * writers can get in.  Additionally, we can't wait
 			 * for a txg to sync while holding a config lock
 			 * (since a waiting writer could cause a 3-way deadlock
 			 * with the sync thread, which also gets a config
 			 * lock for reader).  So we can't hold the config lock
 			 * while calling dmu_tx_assign().
 			 */
 			spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 			/*
 			 * If requested, pause the reflow when the amount
 			 * specified by raidz_expand_max_reflow_bytes is reached
 			 *
 			 * This pause is only used during testing or debugging.
 			 */
 			while (raidz_expand_max_reflow_bytes != 0 &&
 			    raidz_expand_max_reflow_bytes <=
 			    vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
 				delay(hz);
 			}
 
 			mutex_enter(&vre->vre_lock);
 			while (vre->vre_outstanding_bytes >
 			    raidz_expand_max_copy_bytes) {
 				cv_wait(&vre->vre_cv, &vre->vre_lock);
 			}
 			mutex_exit(&vre->vre_lock);
 
 			dmu_tx_t *tx =
 			    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 
 			VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 			uint64_t txg = dmu_tx_get_txg(tx);
 
 			/*
 			 * Reacquire the vdev_config lock.  Theoretically, the
 			 * vdev_t that we're expanding may have changed.
 			 */
 			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 			raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 
 			boolean_t needsync =
 			    raidz_reflow_impl(raidvd, vre, rt, tx);
 
 			dmu_tx_commit(tx);
 
 			if (needsync) {
 				spa_config_exit(spa, SCL_CONFIG, FTAG);
 				txg_wait_synced(spa->spa_dsl_pool, txg);
 				spa_config_enter(spa, SCL_CONFIG, FTAG,
 				    RW_READER);
 			}
 		}
 
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 		metaslab_enable(msp, B_FALSE, B_FALSE);
 		range_tree_vacate(rt, NULL, NULL);
 		range_tree_destroy(rt);
 
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	}
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	/*
 	 * The txg_wait_synced() here ensures that all reflow zio's have
 	 * completed, and vre_failed_offset has been set if necessary.  It
 	 * also ensures that the progress of the last raidz_reflow_sync() is
 	 * written to disk before raidz_reflow_complete_sync() changes the
 	 * in-memory vre_state.  vdev_raidz_io_start() uses vre_state to
 	 * determine if a reflow is in progress, in which case we may need to
 	 * write to both old and new locations.  Therefore we can only change
 	 * vre_state once this is not necessary, which is once the on-disk
 	 * progress (in spa_ubsync) has been set past any possible writes (to
 	 * the end of the last metaslab).
 	 */
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 
 	if (!zthr_iscancelled(zthr) &&
 	    vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
 		/*
 		 * We are not being canceled or paused, so the reflow must be
 		 * complete. In that case also mark it as completed on disk.
 		 */
 		ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
 		VERIFY0(dsl_sync_task(spa_name(spa), NULL,
 		    raidz_reflow_complete_sync, spa,
 		    0, ZFS_SPACE_CHECK_NONE));
 		(void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
 	} else {
 		/*
 		 * Wait for all copy zio's to complete and for all the
 		 * raidz_reflow_sync() synctasks to be run.
 		 */
 		spa_history_log_internal(spa, "reflow pause",
 		    NULL, "offset=%llu failed_offset=%lld",
 		    (long long)vre->vre_offset,
 		    (long long)vre->vre_failed_offset);
 		mutex_enter(&vre->vre_lock);
 		if (vre->vre_failed_offset != UINT64_MAX) {
 			/*
 			 * Reset progress so that we will retry everything
 			 * after the point that something failed.
 			 */
 			vre->vre_offset = vre->vre_failed_offset;
 			vre->vre_failed_offset = UINT64_MAX;
 			vre->vre_waiting_for_resilver = B_TRUE;
 		}
 		mutex_exit(&vre->vre_lock);
 	}
 }
 
 void
 spa_start_raidz_expansion_thread(spa_t *spa)
 {
 	ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL);
 	spa->spa_raidz_expand_zthr = zthr_create("raidz_expand",
 	    spa_raidz_expand_thread_check, spa_raidz_expand_thread,
 	    spa, defclsyspri);
 }
 
 void
 raidz_dtl_reassessed(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	if (spa->spa_raidz_expand != NULL) {
 		vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 		/*
 		 * we get called often from vdev_dtl_reassess() so make
 		 * sure it's our vdev and any replacing is complete
 		 */
 		if (vd->vdev_top->vdev_id == vre->vre_vdev_id &&
 		    !vdev_raidz_expand_child_replacing(vd->vdev_top)) {
 			mutex_enter(&vre->vre_lock);
 			if (vre->vre_waiting_for_resilver) {
 				vdev_dbgmsg(vd, "DTL reassessed, "
 				    "continuing raidz expansion");
 				vre->vre_waiting_for_resilver = B_FALSE;
 				zthr_wakeup(spa->spa_raidz_expand_zthr);
 			}
 			mutex_exit(&vre->vre_lock);
 		}
 	}
 }
 
 int
 vdev_raidz_attach_check(vdev_t *new_child)
 {
 	vdev_t *raidvd = new_child->vdev_parent;
 	uint64_t new_children = raidvd->vdev_children;
 
 	/*
 	 * We use the "boot" space as scratch space to handle overwriting the
 	 * initial part of the vdev.  If it is too small, then this expansion
 	 * is not allowed.  This would be very unusual (e.g. ashift > 13 and
 	 * >200 children).
 	 */
 	if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
 		return (EINVAL);
 	}
 	return (0);
 }
 
 void
 vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
 {
 	vdev_t *new_child = arg;
 	spa_t *spa = new_child->vdev_spa;
 	vdev_t *raidvd = new_child->vdev_parent;
 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
 	ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
 	ASSERT3P(raidvd->vdev_top, ==, raidvd);
 	ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
 	ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
 	ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
 	    new_child);
 
 	spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);
 
 	vdrz->vd_physical_width++;
 
 	VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
 	vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
 	vdrz->vn_vre.vre_offset = 0;
 	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
 	spa->spa_raidz_expand = &vdrz->vn_vre;
 	zthr_wakeup(spa->spa_raidz_expand_zthr);
 
 	/*
 	 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
 	 * written to the config.
 	 */
 	vdev_config_dirty(raidvd);
 
 	vdrz->vn_vre.vre_start_time = gethrestime_sec();
 	vdrz->vn_vre.vre_end_time = 0;
 	vdrz->vn_vre.vre_state = DSS_SCANNING;
 	vdrz->vn_vre.vre_bytes_copied = 0;
 
 	uint64_t state = vdrz->vn_vre.vre_state;
 	VERIFY0(zap_update(spa->spa_meta_objset,
 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
 	    sizeof (state), 1, &state, tx));
 
 	uint64_t start_time = vdrz->vn_vre.vre_start_time;
 	VERIFY0(zap_update(spa->spa_meta_objset,
 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
 	    sizeof (start_time), 1, &start_time, tx));
 
 	(void) zap_remove(spa->spa_meta_objset,
 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
 	(void) zap_remove(spa->spa_meta_objset,
 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);
 
 	spa_history_log_internal(spa, "raidz vdev expansion started",  tx,
 	    "%s vdev %llu new width %llu", spa_name(spa),
 	    (unsigned long long)raidvd->vdev_id,
 	    (unsigned long long)raidvd->vdev_children);
 }
 
 int
 vdev_raidz_load(vdev_t *vd)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	int err;
 
 	uint64_t state = DSS_NONE;
 	uint64_t start_time = 0;
 	uint64_t end_time = 0;
 	uint64_t bytes_copied = 0;
 
 	if (vd->vdev_top_zap != 0) {
 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
 		    sizeof (state), 1, &state);
 		if (err != 0 && err != ENOENT)
 			return (err);
 
 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
 		    sizeof (start_time), 1, &start_time);
 		if (err != 0 && err != ENOENT)
 			return (err);
 
 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
 		    sizeof (end_time), 1, &end_time);
 		if (err != 0 && err != ENOENT)
 			return (err);
 
 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
 		    sizeof (bytes_copied), 1, &bytes_copied);
 		if (err != 0 && err != ENOENT)
 			return (err);
 	}
 
 	/*
 	 * If we are in the middle of expansion, vre_state should have
 	 * already been set by vdev_raidz_init().
 	 */
 	EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
 	vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
 	vdrz->vn_vre.vre_start_time = start_time;
 	vdrz->vn_vre.vre_end_time = end_time;
 	vdrz->vn_vre.vre_bytes_copied = bytes_copied;
 
 	return (0);
 }
 
 int
 spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
 {
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 
 	if (vre == NULL) {
 		/* no removal in progress; find most recent completed */
 		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
 			vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
 			if (vd->vdev_ops == &vdev_raidz_ops) {
 				vdev_raidz_t *vdrz = vd->vdev_tsd;
 
 				if (vdrz->vn_vre.vre_end_time != 0 &&
 				    (vre == NULL ||
 				    vdrz->vn_vre.vre_end_time >
 				    vre->vre_end_time)) {
 					vre = &vdrz->vn_vre;
 				}
 			}
 		}
 	}
 
 	if (vre == NULL) {
 		return (SET_ERROR(ENOENT));
 	}
 
 	pres->pres_state = vre->vre_state;
 	pres->pres_expanding_vdev = vre->vre_vdev_id;
 
 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
 
 	mutex_enter(&vre->vre_lock);
 	pres->pres_reflowed = vre->vre_bytes_copied;
 	for (int i = 0; i < TXG_SIZE; i++)
 		pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
 	mutex_exit(&vre->vre_lock);
 
 	pres->pres_start_time = vre->vre_start_time;
 	pres->pres_end_time = vre->vre_end_time;
 	pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;
 
 	return (0);
 }
 
 /*
  * Initialize private RAIDZ specific fields from the nvlist.
  */
 static int
 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
 {
 	uint_t children;
 	nvlist_t **child;
 	int error = nvlist_lookup_nvlist_array(nv,
 	    ZPOOL_CONFIG_CHILDREN, &child, &children);
 	if (error != 0)
 		return (SET_ERROR(EINVAL));
 
 	uint64_t nparity;
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
 		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
 			return (SET_ERROR(EINVAL));
 
 		/*
 		 * Previous versions could only support 1 or 2 parity
 		 * device.
 		 */
 		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
 			return (SET_ERROR(EINVAL));
 		else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
 			return (SET_ERROR(EINVAL));
 	} else {
 		/*
 		 * We require the parity to be specified for SPAs that
 		 * support multiple parity levels.
 		 */
 		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
 			return (SET_ERROR(EINVAL));
 
 		/*
 		 * Otherwise, we default to 1 parity device for RAID-Z.
 		 */
 		nparity = 1;
 	}
 
 	vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
 	vdrz->vn_vre.vre_vdev_id = -1;
 	vdrz->vn_vre.vre_offset = UINT64_MAX;
 	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
 	mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
 	zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
 	mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
 	    sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));
 
 	vdrz->vd_physical_width = children;
 	vdrz->vd_nparity = nparity;
 
 	/* note, the ID does not exist when creating a pool */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
 	    &vdrz->vn_vre.vre_vdev_id);
 
 	boolean_t reflow_in_progress =
 	    nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
 	if (reflow_in_progress) {
 		spa->spa_raidz_expand = &vdrz->vn_vre;
 		vdrz->vn_vre.vre_state = DSS_SCANNING;
 	}
 
 	vdrz->vd_original_width = children;
 	uint64_t *txgs;
 	unsigned int txgs_size = 0;
 	error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
 	    &txgs, &txgs_size);
 	if (error == 0) {
 		for (int i = 0; i < txgs_size; i++) {
 			reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
 			re->re_txg = txgs[txgs_size - i - 1];
 			re->re_logical_width = vdrz->vd_physical_width - i;
 
 			if (reflow_in_progress)
 				re->re_logical_width--;
 
 			avl_add(&vdrz->vd_expand_txgs, re);
 		}
 
 		vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
 	}
 	if (reflow_in_progress) {
 		vdrz->vd_original_width--;
 		zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
 		    children, txgs_size);
 	}
 
 	*tsd = vdrz;
 
 	return (0);
 }
 
 static void
 vdev_raidz_fini(vdev_t *vd)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
 		vd->vdev_spa->spa_raidz_expand = NULL;
 	reflow_node_t *re;
 	void *cookie = NULL;
 	avl_tree_t *tree = &vdrz->vd_expand_txgs;
 	while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
 		kmem_free(re, sizeof (*re));
 	avl_destroy(&vdrz->vd_expand_txgs);
 	mutex_destroy(&vdrz->vd_expand_lock);
 	mutex_destroy(&vdrz->vn_vre.vre_lock);
 	cv_destroy(&vdrz->vn_vre.vre_cv);
 	zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
 	kmem_free(vdrz, sizeof (*vdrz));
 }
 
 /*
  * Add RAIDZ specific fields to the config nvlist.
  */
 static void
 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
 {
 	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 
 	/*
 	 * Make sure someone hasn't managed to sneak a fancy new vdev
 	 * into a crufty old storage pool.
 	 */
 	ASSERT(vdrz->vd_nparity == 1 ||
 	    (vdrz->vd_nparity <= 2 &&
 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
 	    (vdrz->vd_nparity <= 3 &&
 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
 
 	/*
 	 * Note that we'll add these even on storage pools where they
 	 * aren't strictly required -- older software will just ignore
 	 * it.
 	 */
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
 
 	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
 		fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
 	}
 
 	mutex_enter(&vdrz->vd_expand_lock);
 	if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
 		uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
 		uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
 		    KM_SLEEP);
 		uint64_t i = 0;
 
 		for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
 		    re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
 			txgs[i++] = re->re_txg;
 		}
 
 		fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
 		    txgs, count);
 
 		kmem_free(txgs, sizeof (uint64_t) * count);
 	}
 	mutex_exit(&vdrz->vd_expand_lock);
 }
 
 static uint64_t
 vdev_raidz_nparity(vdev_t *vd)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	return (vdrz->vd_nparity);
 }
 
 static uint64_t
 vdev_raidz_ndisks(vdev_t *vd)
 {
 	return (vd->vdev_children);
 }
 
 vdev_ops_t vdev_raidz_ops = {
 	.vdev_op_init = vdev_raidz_init,
 	.vdev_op_fini = vdev_raidz_fini,
 	.vdev_op_open = vdev_raidz_open,
 	.vdev_op_close = vdev_raidz_close,
 	.vdev_op_asize = vdev_raidz_asize,
 	.vdev_op_min_asize = vdev_raidz_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_raidz_io_start,
 	.vdev_op_io_done = vdev_raidz_io_done,
 	.vdev_op_state_change = vdev_raidz_state_change,
 	.vdev_op_need_resilver = vdev_raidz_need_resilver,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_raidz_xlate,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = vdev_raidz_config_generate,
 	.vdev_op_nparity = vdev_raidz_nparity,
 	.vdev_op_ndisks = vdev_raidz_ndisks,
 	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
 };
 
 /* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
 	"For testing, pause RAIDZ expansion after reflowing this many bytes");
 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
 	"Max amount of concurrent i/o for RAIDZ expansion");
 ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
 	"For expanded RAIDZ, aggregate reads that have more rows than this");
 ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
 	"For expanded RAIDZ, automatically start a pool scrub when expansion "
 	"completes");
 /* END CSTYLED */
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index e549e1895f39..1af357c58006 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1,4383 +1,4383 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright (c) 2018 Datto Inc.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/arc.h>
 #include <sys/stat.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
 #include <sys/dsl_dataset.h>
 #include <sys/vdev_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
 #include <sys/metaslab.h>
 #include <sys/trace_zfs.h>
 #include <sys/abd.h>
 #include <sys/brt.h>
 #include <sys/wmsum.h>
 
 /*
  * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system
  * calls that change the file system. Each itx has enough information to
  * be able to replay them after a system crash, power loss, or
  * equivalent failure mode. These are stored in memory until either:
  *
  *   1. they are committed to the pool by the DMU transaction group
  *      (txg), at which point they can be discarded; or
  *   2. they are committed to the on-disk ZIL for the dataset being
  *      modified (e.g. due to an fsync, O_DSYNC, or other synchronous
  *      requirement).
  *
  * In the event of a crash or power loss, the itxs contained by each
  * dataset's on-disk ZIL will be replayed when that dataset is first
  * instantiated (e.g. if the dataset is a normal filesystem, when it is
  * first mounted).
  *
  * As hinted at above, there is one ZIL per dataset (both the in-memory
  * representation, and the on-disk representation). The on-disk format
  * consists of 3 parts:
  *
  * 	- a single, per-dataset, ZIL header; which points to a chain of
  * 	- zero or more ZIL blocks; each of which contains
  * 	- zero or more ZIL records
  *
  * A ZIL record holds the information necessary to replay a single
  * system call transaction. A ZIL block can hold many ZIL records, and
  * the blocks are chained together, similarly to a singly linked list.
  *
  * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL
  * block in the chain, and the ZIL header points to the first block in
  * the chain.
  *
  * Note, there is not a fixed place in the pool to hold these ZIL
  * blocks; they are dynamically allocated and freed as needed from the
  * blocks available on the pool, though they can be preferentially
  * allocated from a dedicated "log" vdev.
  */
 
 /*
  * This controls the amount of time that a ZIL block (lwb) will remain
  * "open" when it isn't "full", and it has a thread waiting for it to be
  * committed to stable storage. Please refer to the zil_commit_waiter()
  * function (and the comments within it) for more details.
  */
 static uint_t zfs_commit_timeout_pct = 10;
 
 /*
  * See zil.h for more information about these fields.
  */
 static zil_kstat_values_t zil_stats = {
 	{ "zil_commit_count",			KSTAT_DATA_UINT64 },
 	{ "zil_commit_writer_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_count",			KSTAT_DATA_UINT64 },
 	{ "zil_itx_indirect_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_indirect_bytes",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_copied_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_copied_bytes",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_needcopy_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_needcopy_bytes",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_count",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_bytes",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_write",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_alloc",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_count",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_bytes",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_write",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_alloc",	KSTAT_DATA_UINT64 },
 };
 
 static zil_sums_t zil_sums_global;
 static kstat_t *zil_kstats_global;
 
 /*
  * Disable intent logging replay.  This global ZIL switch affects all pools.
  */
 int zil_replay_disable = 0;
 
 /*
  * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to
  * the disk(s) by the ZIL after an LWB write has completed. Setting this
  * will cause ZIL corruption on power loss if a volatile out-of-order
  * write cache is enabled.
  */
 static int zil_nocacheflush = 0;
 
 /*
  * Limit SLOG write size per commit executed with synchronous priority.
  * Any writes above that will be executed with lower (asynchronous) priority
  * to limit potential SLOG device abuse by single active ZIL writer.
  */
 static uint64_t zil_slog_bulk = 64 * 1024 * 1024;
 
 static kmem_cache_t *zil_lwb_cache;
 static kmem_cache_t *zil_zcw_cache;
 
 static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx);
 static itx_t *zil_itx_clone(itx_t *oitx);
 static uint64_t zil_max_waste_space(zilog_t *zilog);
 
 static int
 zil_bp_compare(const void *x1, const void *x2)
 {
 	const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
 	const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
 
 	int cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
 	if (likely(cmp))
 		return (cmp);
 
 	return (TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)));
 }
 
 static void
 zil_bp_tree_init(zilog_t *zilog)
 {
 	avl_create(&zilog->zl_bp_tree, zil_bp_compare,
 	    sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
 }
 
 static void
 zil_bp_tree_fini(zilog_t *zilog)
 {
 	avl_tree_t *t = &zilog->zl_bp_tree;
 	zil_bp_node_t *zn;
 	void *cookie = NULL;
 
 	while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
 		kmem_free(zn, sizeof (zil_bp_node_t));
 
 	avl_destroy(t);
 }
 
 int
 zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
 {
 	avl_tree_t *t = &zilog->zl_bp_tree;
 	const dva_t *dva;
 	zil_bp_node_t *zn;
 	avl_index_t where;
 
 	if (BP_IS_EMBEDDED(bp))
 		return (0);
 
 	dva = BP_IDENTITY(bp);
 
 	if (avl_find(t, dva, &where) != NULL)
 		return (SET_ERROR(EEXIST));
 
 	zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
 	zn->zn_dva = *dva;
 	avl_insert(t, zn, where);
 
 	return (0);
 }
 
 static zil_header_t *
 zil_header_in_syncing_context(zilog_t *zilog)
 {
 	return ((zil_header_t *)zilog->zl_header);
 }
 
 static void
 zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
 {
 	zio_cksum_t *zc = &bp->blk_cksum;
 
 	(void) random_get_pseudo_bytes((void *)&zc->zc_word[ZIL_ZC_GUID_0],
 	    sizeof (zc->zc_word[ZIL_ZC_GUID_0]));
 	(void) random_get_pseudo_bytes((void *)&zc->zc_word[ZIL_ZC_GUID_1],
 	    sizeof (zc->zc_word[ZIL_ZC_GUID_1]));
 	zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
 	zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
 }
 
 static int
 zil_kstats_global_update(kstat_t *ksp, int rw)
 {
 	zil_kstat_values_t *zs = ksp->ks_data;
 	ASSERT3P(&zil_stats, ==, zs);
 
 	if (rw == KSTAT_WRITE) {
 		return (SET_ERROR(EACCES));
 	}
 
 	zil_kstat_values_update(zs, &zil_sums_global);
 
 	return (0);
 }
 
 /*
  * Read a log block and make sure it's valid.
  */
 static int
 zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
     blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf)
 {
 	zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
 	arc_flags_t aflags = ARC_FLAG_WAIT;
 	zbookmark_phys_t zb;
 	int error;
 
 	if (zilog->zl_header->zh_claim_txg == 0)
 		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
 
 	if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
 		zio_flags |= ZIO_FLAG_SPECULATIVE;
 
 	if (!decrypt)
 		zio_flags |= ZIO_FLAG_RAW;
 
 	SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
 	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func,
 	    abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 
 	if (error == 0) {
 		zio_cksum_t cksum = bp->blk_cksum;
 
 		/*
 		 * Validate the checksummed log block.
 		 *
 		 * Sequence numbers should be... sequential.  The checksum
 		 * verifier for the next block should be bp's checksum plus 1.
 		 *
 		 * Also check the log chain linkage and size used.
 		 */
 		cksum.zc_word[ZIL_ZC_SEQ]++;
 
 		uint64_t size = BP_GET_LSIZE(bp);
 		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
 			zil_chain_t *zilc = (*abuf)->b_data;
 			char *lr = (char *)(zilc + 1);
 
 			if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
 			    sizeof (cksum)) ||
 			    zilc->zc_nused < sizeof (*zilc) ||
 			    zilc->zc_nused > size) {
 				error = SET_ERROR(ECKSUM);
 			} else {
 				*begin = lr;
 				*end = lr + zilc->zc_nused - sizeof (*zilc);
 				*nbp = zilc->zc_next_blk;
 			}
 		} else {
 			char *lr = (*abuf)->b_data;
 			zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
 
 			if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
 			    sizeof (cksum)) ||
 			    (zilc->zc_nused > (size - sizeof (*zilc)))) {
 				error = SET_ERROR(ECKSUM);
 			} else {
 				*begin = lr;
 				*end = lr + zilc->zc_nused;
 				*nbp = zilc->zc_next_blk;
 			}
 		}
 	}
 
 	return (error);
 }
 
 /*
  * Read a TX_WRITE log data block.
  */
 static int
 zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
 {
 	zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
 	const blkptr_t *bp = &lr->lr_blkptr;
 	arc_flags_t aflags = ARC_FLAG_WAIT;
 	arc_buf_t *abuf = NULL;
 	zbookmark_phys_t zb;
 	int error;
 
 	if (BP_IS_HOLE(bp)) {
 		if (wbuf != NULL)
 			memset(wbuf, 0, MAX(BP_GET_LSIZE(bp), lr->lr_length));
 		return (0);
 	}
 
 	if (zilog->zl_header->zh_claim_txg == 0)
 		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
 
 	/*
 	 * If we are not using the resulting data, we are just checking that
 	 * it hasn't been corrupted so we don't need to waste CPU time
 	 * decompressing and decrypting it.
 	 */
 	if (wbuf == NULL)
 		zio_flags |= ZIO_FLAG_RAW;
 
 	ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
 	SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
 	    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
 
 	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
 	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 
 	if (error == 0) {
 		if (wbuf != NULL)
 			memcpy(wbuf, abuf->b_data, arc_buf_size(abuf));
 		arc_buf_destroy(abuf, &abuf);
 	}
 
 	return (error);
 }
 
 void
 zil_sums_init(zil_sums_t *zs)
 {
 	wmsum_init(&zs->zil_commit_count, 0);
 	wmsum_init(&zs->zil_commit_writer_count, 0);
 	wmsum_init(&zs->zil_itx_count, 0);
 	wmsum_init(&zs->zil_itx_indirect_count, 0);
 	wmsum_init(&zs->zil_itx_indirect_bytes, 0);
 	wmsum_init(&zs->zil_itx_copied_count, 0);
 	wmsum_init(&zs->zil_itx_copied_bytes, 0);
 	wmsum_init(&zs->zil_itx_needcopy_count, 0);
 	wmsum_init(&zs->zil_itx_needcopy_bytes, 0);
 	wmsum_init(&zs->zil_itx_metaslab_normal_count, 0);
 	wmsum_init(&zs->zil_itx_metaslab_normal_bytes, 0);
 	wmsum_init(&zs->zil_itx_metaslab_normal_write, 0);
 	wmsum_init(&zs->zil_itx_metaslab_normal_alloc, 0);
 	wmsum_init(&zs->zil_itx_metaslab_slog_count, 0);
 	wmsum_init(&zs->zil_itx_metaslab_slog_bytes, 0);
 	wmsum_init(&zs->zil_itx_metaslab_slog_write, 0);
 	wmsum_init(&zs->zil_itx_metaslab_slog_alloc, 0);
 }
 
 void
 zil_sums_fini(zil_sums_t *zs)
 {
 	wmsum_fini(&zs->zil_commit_count);
 	wmsum_fini(&zs->zil_commit_writer_count);
 	wmsum_fini(&zs->zil_itx_count);
 	wmsum_fini(&zs->zil_itx_indirect_count);
 	wmsum_fini(&zs->zil_itx_indirect_bytes);
 	wmsum_fini(&zs->zil_itx_copied_count);
 	wmsum_fini(&zs->zil_itx_copied_bytes);
 	wmsum_fini(&zs->zil_itx_needcopy_count);
 	wmsum_fini(&zs->zil_itx_needcopy_bytes);
 	wmsum_fini(&zs->zil_itx_metaslab_normal_count);
 	wmsum_fini(&zs->zil_itx_metaslab_normal_bytes);
 	wmsum_fini(&zs->zil_itx_metaslab_normal_write);
 	wmsum_fini(&zs->zil_itx_metaslab_normal_alloc);
 	wmsum_fini(&zs->zil_itx_metaslab_slog_count);
 	wmsum_fini(&zs->zil_itx_metaslab_slog_bytes);
 	wmsum_fini(&zs->zil_itx_metaslab_slog_write);
 	wmsum_fini(&zs->zil_itx_metaslab_slog_alloc);
 }
 
 void
 zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums)
 {
 	zs->zil_commit_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_commit_count);
 	zs->zil_commit_writer_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_commit_writer_count);
 	zs->zil_itx_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_count);
 	zs->zil_itx_indirect_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_indirect_count);
 	zs->zil_itx_indirect_bytes.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_indirect_bytes);
 	zs->zil_itx_copied_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_copied_count);
 	zs->zil_itx_copied_bytes.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_copied_bytes);
 	zs->zil_itx_needcopy_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_needcopy_count);
 	zs->zil_itx_needcopy_bytes.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_needcopy_bytes);
 	zs->zil_itx_metaslab_normal_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_normal_count);
 	zs->zil_itx_metaslab_normal_bytes.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_normal_bytes);
 	zs->zil_itx_metaslab_normal_write.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_normal_write);
 	zs->zil_itx_metaslab_normal_alloc.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_normal_alloc);
 	zs->zil_itx_metaslab_slog_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_slog_count);
 	zs->zil_itx_metaslab_slog_bytes.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_slog_bytes);
 	zs->zil_itx_metaslab_slog_write.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_slog_write);
 	zs->zil_itx_metaslab_slog_alloc.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_slog_alloc);
 }
 
 /*
  * Parse the intent log, and call parse_func for each valid record within.
  */
 int
 zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg,
     boolean_t decrypt)
 {
 	const zil_header_t *zh = zilog->zl_header;
 	boolean_t claimed = !!zh->zh_claim_txg;
 	uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
 	uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
 	uint64_t max_blk_seq = 0;
 	uint64_t max_lr_seq = 0;
 	uint64_t blk_count = 0;
 	uint64_t lr_count = 0;
 	blkptr_t blk, next_blk = {{{{0}}}};
 	int error = 0;
 
 	/*
 	 * Old logs didn't record the maximum zh_claim_lr_seq.
 	 */
 	if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
 		claim_lr_seq = UINT64_MAX;
 
 	/*
 	 * Starting at the block pointed to by zh_log we read the log chain.
 	 * For each block in the chain we strongly check that block to
 	 * ensure its validity.  We stop when an invalid block is found.
 	 * For each block pointer in the chain we call parse_blk_func().
 	 * For each record in each valid block we call parse_lr_func().
 	 * If the log has been claimed, stop if we encounter a sequence
 	 * number greater than the highest claimed sequence number.
 	 */
 	zil_bp_tree_init(zilog);
 
 	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
 		uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
 		int reclen;
 		char *lrp, *end;
 		arc_buf_t *abuf = NULL;
 
 		if (blk_seq > claim_blk_seq)
 			break;
 
 		error = parse_blk_func(zilog, &blk, arg, txg);
 		if (error != 0)
 			break;
 		ASSERT3U(max_blk_seq, <, blk_seq);
 		max_blk_seq = blk_seq;
 		blk_count++;
 
 		if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
 			break;
 
 		error = zil_read_log_block(zilog, decrypt, &blk, &next_blk,
 		    &lrp, &end, &abuf);
 		if (error != 0) {
 			if (abuf)
 				arc_buf_destroy(abuf, &abuf);
 			if (claimed) {
 				char name[ZFS_MAX_DATASET_NAME_LEN];
 
 				dmu_objset_name(zilog->zl_os, name);
 
 				cmn_err(CE_WARN, "ZFS read log block error %d, "
 				    "dataset %s, seq 0x%llx\n", error, name,
 				    (u_longlong_t)blk_seq);
 			}
 			break;
 		}
 
 		for (; lrp < end; lrp += reclen) {
 			lr_t *lr = (lr_t *)lrp;
 			reclen = lr->lrc_reclen;
 			ASSERT3U(reclen, >=, sizeof (lr_t));
 			ASSERT3U(reclen, <=, end - lrp);
 			if (lr->lrc_seq > claim_lr_seq) {
 				arc_buf_destroy(abuf, &abuf);
 				goto done;
 			}
 
 			error = parse_lr_func(zilog, lr, arg, txg);
 			if (error != 0) {
 				arc_buf_destroy(abuf, &abuf);
 				goto done;
 			}
 			ASSERT3U(max_lr_seq, <, lr->lrc_seq);
 			max_lr_seq = lr->lrc_seq;
 			lr_count++;
 		}
 		arc_buf_destroy(abuf, &abuf);
 	}
 done:
 	zilog->zl_parse_error = error;
 	zilog->zl_parse_blk_seq = max_blk_seq;
 	zilog->zl_parse_lr_seq = max_lr_seq;
 	zilog->zl_parse_blk_count = blk_count;
 	zilog->zl_parse_lr_count = lr_count;
 
 	zil_bp_tree_fini(zilog);
 
 	return (error);
 }
 
 static int
 zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
     uint64_t first_txg)
 {
 	(void) tx;
 	ASSERT(!BP_IS_HOLE(bp));
 
 	/*
 	 * As we call this function from the context of a rewind to a
 	 * checkpoint, each ZIL block whose txg is later than the txg
 	 * that we rewind to is invalid. Thus, we return -1 so
 	 * zil_parse() doesn't attempt to read it.
 	 */
-	if (bp->blk_birth >= first_txg)
+	if (BP_GET_LOGICAL_BIRTH(bp) >= first_txg)
 		return (-1);
 
 	if (zil_bp_tree_add(zilog, bp) != 0)
 		return (0);
 
 	zio_free(zilog->zl_spa, first_txg, bp);
 	return (0);
 }
 
 static int
 zil_noop_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
     uint64_t first_txg)
 {
 	(void) zilog, (void) lrc, (void) tx, (void) first_txg;
 	return (0);
 }
 
 static int
 zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
     uint64_t first_txg)
 {
 	/*
 	 * Claim log block if not already committed and not already claimed.
 	 * If tx == NULL, just verify that the block is claimable.
 	 */
-	if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
+	if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) < first_txg ||
 	    zil_bp_tree_add(zilog, bp) != 0)
 		return (0);
 
 	return (zio_wait(zio_claim(NULL, zilog->zl_spa,
 	    tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
 }
 
 static int
 zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg)
 {
 	lr_write_t *lr = (lr_write_t *)lrc;
 	int error;
 
 	ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
 
 	/*
 	 * If the block is not readable, don't claim it.  This can happen
 	 * in normal operation when a log block is written to disk before
 	 * some of the dmu_sync() blocks it points to.  In this case, the
 	 * transaction cannot have been committed to anyone (we would have
 	 * waited for all writes to be stable first), so it is semantically
 	 * correct to declare this the end of the log.
 	 */
-	if (lr->lr_blkptr.blk_birth >= first_txg) {
+	if (BP_GET_LOGICAL_BIRTH(&lr->lr_blkptr) >= first_txg) {
 		error = zil_read_log_data(zilog, lr, NULL);
 		if (error != 0)
 			return (error);
 	}
 
 	return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
 }
 
 static int
 zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx,
     uint64_t first_txg)
 {
 	const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
 	const blkptr_t *bp;
 	spa_t *spa = zilog->zl_spa;
 	uint_t ii;
 
 	ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
 	ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t,
 	    lr_bps[lr->lr_nbps]));
 
 	if (tx == NULL) {
 		return (0);
 	}
 
 	/*
 	 * XXX: Do we need to byteswap lr?
 	 */
 
 	for (ii = 0; ii < lr->lr_nbps; ii++) {
 		bp = &lr->lr_bps[ii];
 
 		/*
 		 * When data is embedded into the BP there is no need to create
 		 * BRT entry as there is no data block.  Just copy the BP as it
 		 * contains the data.
 		 */
 		if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 			continue;
 
 		/*
 		 * We can not handle block pointers from the future, since they
 		 * are not yet allocated.  It should not normally happen, but
 		 * just in case lets be safe and just stop here now instead of
 		 * corrupting the pool.
 		 */
-		if (BP_PHYSICAL_BIRTH(bp) >= first_txg)
+		if (BP_GET_BIRTH(bp) >= first_txg)
 			return (SET_ERROR(ENOENT));
 
 		/*
 		 * Assert the block is really allocated before we reference it.
 		 */
 		metaslab_check_free(spa, bp);
 	}
 
 	for (ii = 0; ii < lr->lr_nbps; ii++) {
 		bp = &lr->lr_bps[ii];
 		if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp))
 			brt_pending_add(spa, bp, tx);
 	}
 
 	return (0);
 }
 
 static int
 zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
     uint64_t first_txg)
 {
 
 	switch (lrc->lrc_txtype) {
 	case TX_WRITE:
 		return (zil_claim_write(zilog, lrc, tx, first_txg));
 	case TX_CLONE_RANGE:
 		return (zil_claim_clone_range(zilog, lrc, tx, first_txg));
 	default:
 		return (0);
 	}
 }
 
 static int
 zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
     uint64_t claim_txg)
 {
 	(void) claim_txg;
 
 	zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
 
 	return (0);
 }
 
 static int
 zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg)
 {
 	lr_write_t *lr = (lr_write_t *)lrc;
 	blkptr_t *bp = &lr->lr_blkptr;
 
 	ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
 
 	/*
 	 * If we previously claimed it, we need to free it.
 	 */
-	if (bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
-	    !BP_IS_HOLE(bp)) {
+	if (BP_GET_LOGICAL_BIRTH(bp) >= claim_txg &&
+	    zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) {
 		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
 	}
 
 	return (0);
 }
 
 static int
 zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx)
 {
 	const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
 	const blkptr_t *bp;
 	spa_t *spa;
 	uint_t ii;
 
 	ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
 	ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t,
 	    lr_bps[lr->lr_nbps]));
 
 	if (tx == NULL) {
 		return (0);
 	}
 
 	spa = zilog->zl_spa;
 
 	for (ii = 0; ii < lr->lr_nbps; ii++) {
 		bp = &lr->lr_bps[ii];
 
 		if (!BP_IS_HOLE(bp)) {
 			zio_free(spa, dmu_tx_get_txg(tx), bp);
 		}
 	}
 
 	return (0);
 }
 
 static int
 zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
     uint64_t claim_txg)
 {
 
 	if (claim_txg == 0) {
 		return (0);
 	}
 
 	switch (lrc->lrc_txtype) {
 	case TX_WRITE:
 		return (zil_free_write(zilog, lrc, tx, claim_txg));
 	case TX_CLONE_RANGE:
 		return (zil_free_clone_range(zilog, lrc, tx));
 	default:
 		return (0);
 	}
 }
 
 static int
 zil_lwb_vdev_compare(const void *x1, const void *x2)
 {
 	const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
 	const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
 
 	return (TREE_CMP(v1, v2));
 }
 
 /*
  * Allocate a new lwb.  We may already have a block pointer for it, in which
  * case we get size and version from there.  Or we may not yet, in which case
  * we choose them here and later make the block allocation match.
  */
 static lwb_t *
 zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog,
     uint64_t txg, lwb_state_t state)
 {
 	lwb_t *lwb;
 
 	lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
 	lwb->lwb_zilog = zilog;
 	if (bp) {
 		lwb->lwb_blk = *bp;
 		lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2);
 		sz = BP_GET_LSIZE(bp);
 	} else {
 		BP_ZERO(&lwb->lwb_blk);
 		lwb->lwb_slim = (spa_version(zilog->zl_spa) >=
 		    SPA_VERSION_SLIM_ZIL);
 	}
 	lwb->lwb_slog = slog;
 	lwb->lwb_error = 0;
 	if (lwb->lwb_slim) {
 		lwb->lwb_nmax = sz;
 		lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t);
 	} else {
 		lwb->lwb_nmax = sz - sizeof (zil_chain_t);
 		lwb->lwb_nused = lwb->lwb_nfilled = 0;
 	}
 	lwb->lwb_sz = sz;
 	lwb->lwb_state = state;
 	lwb->lwb_buf = zio_buf_alloc(sz);
 	lwb->lwb_child_zio = NULL;
 	lwb->lwb_write_zio = NULL;
 	lwb->lwb_root_zio = NULL;
 	lwb->lwb_issued_timestamp = 0;
 	lwb->lwb_issued_txg = 0;
 	lwb->lwb_alloc_txg = txg;
 	lwb->lwb_max_txg = 0;
 
 	mutex_enter(&zilog->zl_lock);
 	list_insert_tail(&zilog->zl_lwb_list, lwb);
 	if (state != LWB_STATE_NEW)
 		zilog->zl_last_lwb_opened = lwb;
 	mutex_exit(&zilog->zl_lock);
 
 	return (lwb);
 }
 
 static void
 zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
 {
 	ASSERT(MUTEX_HELD(&zilog->zl_lock));
 	ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
 	    lwb->lwb_state == LWB_STATE_FLUSH_DONE);
 	ASSERT3P(lwb->lwb_child_zio, ==, NULL);
 	ASSERT3P(lwb->lwb_write_zio, ==, NULL);
 	ASSERT3P(lwb->lwb_root_zio, ==, NULL);
 	ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa));
 	ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
 	VERIFY(list_is_empty(&lwb->lwb_itxs));
 	VERIFY(list_is_empty(&lwb->lwb_waiters));
 	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
 	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
 
 	/*
 	 * Clear the zilog's field to indicate this lwb is no longer
 	 * valid, and prevent use-after-free errors.
 	 */
 	if (zilog->zl_last_lwb_opened == lwb)
 		zilog->zl_last_lwb_opened = NULL;
 
 	kmem_cache_free(zil_lwb_cache, lwb);
 }
 
 /*
  * Called when we create in-memory log transactions so that we know
  * to cleanup the itxs at the end of spa_sync().
  */
 static void
 zilog_dirty(zilog_t *zilog, uint64_t txg)
 {
 	dsl_pool_t *dp = zilog->zl_dmu_pool;
 	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 
 	ASSERT(spa_writeable(zilog->zl_spa));
 
 	if (ds->ds_is_snapshot)
 		panic("dirtying snapshot!");
 
 	if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {
 		/* up the hold count until we can be written out */
 		dmu_buf_add_ref(ds->ds_dbuf, zilog);
 
 		zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg);
 	}
 }
 
 /*
  * Determine if the zil is dirty in the specified txg. Callers wanting to
  * ensure that the dirty state does not change must hold the itxg_lock for
  * the specified txg. Holding the lock will ensure that the zil cannot be
  * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
  * state.
  */
 static boolean_t __maybe_unused
 zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
 {
 	dsl_pool_t *dp = zilog->zl_dmu_pool;
 
 	if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK))
 		return (B_TRUE);
 	return (B_FALSE);
 }
 
 /*
  * Determine if the zil is dirty. The zil is considered dirty if it has
  * any pending itx records that have not been cleaned by zil_clean().
  */
 static boolean_t
 zilog_is_dirty(zilog_t *zilog)
 {
 	dsl_pool_t *dp = zilog->zl_dmu_pool;
 
 	for (int t = 0; t < TXG_SIZE; t++) {
 		if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t))
 			return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 /*
  * Its called in zil_commit context (zil_process_commit_list()/zil_create()).
  * It activates SPA_FEATURE_ZILSAXATTR feature, if its enabled.
  * Check dsl_dataset_feature_is_active to avoid txg_wait_synced() on every
  * zil_commit.
  */
 static void
 zil_commit_activate_saxattr_feature(zilog_t *zilog)
 {
 	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 	uint64_t txg = 0;
 	dmu_tx_t *tx = NULL;
 
 	if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) &&
 	    dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL &&
 	    !dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) {
 		tx = dmu_tx_create(zilog->zl_os);
 		VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 		dsl_dataset_dirty(ds, tx);
 		txg = dmu_tx_get_txg(tx);
 
 		mutex_enter(&ds->ds_lock);
 		ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] =
 		    (void *)B_TRUE;
 		mutex_exit(&ds->ds_lock);
 		dmu_tx_commit(tx);
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
 	}
 }
 
 /*
  * Create an on-disk intent log.
  */
 static lwb_t *
 zil_create(zilog_t *zilog)
 {
 	const zil_header_t *zh = zilog->zl_header;
 	lwb_t *lwb = NULL;
 	uint64_t txg = 0;
 	dmu_tx_t *tx = NULL;
 	blkptr_t blk;
 	int error = 0;
 	boolean_t slog = FALSE;
 	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 
 
 	/*
 	 * Wait for any previous destroy to complete.
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 
 	ASSERT(zh->zh_claim_txg == 0);
 	ASSERT(zh->zh_replay_seq == 0);
 
 	blk = zh->zh_log;
 
 	/*
 	 * Allocate an initial log block if:
 	 *    - there isn't one already
 	 *    - the existing block is the wrong endianness
 	 */
 	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
 		tx = dmu_tx_create(zilog->zl_os);
 		VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 		txg = dmu_tx_get_txg(tx);
 
 		if (!BP_IS_HOLE(&blk)) {
 			zio_free(zilog->zl_spa, txg, &blk);
 			BP_ZERO(&blk);
 		}
 
 		error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk,
 		    ZIL_MIN_BLKSZ, &slog);
 		if (error == 0)
 			zil_init_log_chain(zilog, &blk);
 	}
 
 	/*
 	 * Allocate a log write block (lwb) for the first log block.
 	 */
 	if (error == 0)
 		lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW);
 
 	/*
 	 * If we just allocated the first log block, commit our transaction
 	 * and wait for zil_sync() to stuff the block pointer into zh_log.
 	 * (zh is part of the MOS, so we cannot modify it in open context.)
 	 */
 	if (tx != NULL) {
 		/*
 		 * If "zilsaxattr" feature is enabled on zpool, then activate
 		 * it now when we're creating the ZIL chain. We can't wait with
 		 * this until we write the first xattr log record because we
 		 * need to wait for the feature activation to sync out.
 		 */
 		if (spa_feature_is_enabled(zilog->zl_spa,
 		    SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) !=
 		    DMU_OST_ZVOL) {
 			mutex_enter(&ds->ds_lock);
 			ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] =
 			    (void *)B_TRUE;
 			mutex_exit(&ds->ds_lock);
 		}
 
 		dmu_tx_commit(tx);
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
 	} else {
 		/*
 		 * This branch covers the case where we enable the feature on a
 		 * zpool that has existing ZIL headers.
 		 */
 		zil_commit_activate_saxattr_feature(zilog);
 	}
 	IMPLY(spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) &&
 	    dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL,
 	    dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR));
 
 	ASSERT(error != 0 || memcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
 	IMPLY(error == 0, lwb != NULL);
 
 	return (lwb);
 }
 
 /*
  * In one tx, free all log blocks and clear the log header. If keep_first
  * is set, then we're replaying a log with no content. We want to keep the
  * first block, however, so that the first synchronous transaction doesn't
  * require a txg_wait_synced() in zil_create(). We don't need to
  * txg_wait_synced() here either when keep_first is set, because both
  * zil_create() and zil_destroy() will wait for any in-progress destroys
  * to complete.
  * Return B_TRUE if there were any entries to replay.
  */
 boolean_t
 zil_destroy(zilog_t *zilog, boolean_t keep_first)
 {
 	const zil_header_t *zh = zilog->zl_header;
 	lwb_t *lwb;
 	dmu_tx_t *tx;
 	uint64_t txg;
 
 	/*
 	 * Wait for any previous destroy to complete.
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 
 	zilog->zl_old_header = *zh;		/* debugging aid */
 
 	if (BP_IS_HOLE(&zh->zh_log))
 		return (B_FALSE);
 
 	tx = dmu_tx_create(zilog->zl_os);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	txg = dmu_tx_get_txg(tx);
 
 	mutex_enter(&zilog->zl_lock);
 
 	ASSERT3U(zilog->zl_destroy_txg, <, txg);
 	zilog->zl_destroy_txg = txg;
 	zilog->zl_keep_first = keep_first;
 
 	if (!list_is_empty(&zilog->zl_lwb_list)) {
 		ASSERT(zh->zh_claim_txg == 0);
 		VERIFY(!keep_first);
 		while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) {
 			if (lwb->lwb_buf != NULL)
 				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 			if (!BP_IS_HOLE(&lwb->lwb_blk))
 				zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
 			zil_free_lwb(zilog, lwb);
 		}
 	} else if (!keep_first) {
 		zil_destroy_sync(zilog, tx);
 	}
 	mutex_exit(&zilog->zl_lock);
 
 	dmu_tx_commit(tx);
 
 	return (B_TRUE);
 }
 
 void
 zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
 {
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 	(void) zil_parse(zilog, zil_free_log_block,
 	    zil_free_log_record, tx, zilog->zl_header->zh_claim_txg, B_FALSE);
 }
 
 int
 zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
 {
 	dmu_tx_t *tx = txarg;
 	zilog_t *zilog;
 	uint64_t first_txg;
 	zil_header_t *zh;
 	objset_t *os;
 	int error;
 
 	error = dmu_objset_own_obj(dp, ds->ds_object,
 	    DMU_OST_ANY, B_FALSE, B_FALSE, FTAG, &os);
 	if (error != 0) {
 		/*
 		 * EBUSY indicates that the objset is inconsistent, in which
 		 * case it can not have a ZIL.
 		 */
 		if (error != EBUSY) {
 			cmn_err(CE_WARN, "can't open objset for %llu, error %u",
 			    (unsigned long long)ds->ds_object, error);
 		}
 
 		return (0);
 	}
 
 	zilog = dmu_objset_zil(os);
 	zh = zil_header_in_syncing_context(zilog);
 	ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa));
 	first_txg = spa_min_claim_txg(zilog->zl_spa);
 
 	/*
 	 * If the spa_log_state is not set to be cleared, check whether
 	 * the current uberblock is a checkpoint one and if the current
 	 * header has been claimed before moving on.
 	 *
 	 * If the current uberblock is a checkpointed uberblock then
 	 * one of the following scenarios took place:
 	 *
 	 * 1] We are currently rewinding to the checkpoint of the pool.
 	 * 2] We crashed in the middle of a checkpoint rewind but we
 	 *    did manage to write the checkpointed uberblock to the
 	 *    vdev labels, so when we tried to import the pool again
 	 *    the checkpointed uberblock was selected from the import
 	 *    procedure.
 	 *
 	 * In both cases we want to zero out all the ZIL blocks, except
 	 * the ones that have been claimed at the time of the checkpoint
 	 * (their zh_claim_txg != 0). The reason is that these blocks
 	 * may be corrupted since we may have reused their locations on
 	 * disk after we took the checkpoint.
 	 *
 	 * We could try to set spa_log_state to SPA_LOG_CLEAR earlier
 	 * when we first figure out whether the current uberblock is
 	 * checkpointed or not. Unfortunately, that would discard all
 	 * the logs, including the ones that are claimed, and we would
 	 * leak space.
 	 */
 	if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR ||
 	    (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
 	    zh->zh_claim_txg == 0)) {
 		if (!BP_IS_HOLE(&zh->zh_log)) {
 			(void) zil_parse(zilog, zil_clear_log_block,
 			    zil_noop_log_record, tx, first_txg, B_FALSE);
 		}
 		BP_ZERO(&zh->zh_log);
 		if (os->os_encrypted)
 			os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
 		dmu_objset_disown(os, B_FALSE, FTAG);
 		return (0);
 	}
 
 	/*
 	 * If we are not rewinding and opening the pool normally, then
 	 * the min_claim_txg should be equal to the first txg of the pool.
 	 */
 	ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa));
 
 	/*
 	 * Claim all log blocks if we haven't already done so, and remember
 	 * the highest claimed sequence number.  This ensures that if we can
 	 * read only part of the log now (e.g. due to a missing device),
 	 * but we can read the entire log later, we will not try to replay
 	 * or destroy beyond the last block we successfully claimed.
 	 */
 	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
 	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
 		(void) zil_parse(zilog, zil_claim_log_block,
 		    zil_claim_log_record, tx, first_txg, B_FALSE);
 		zh->zh_claim_txg = first_txg;
 		zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
 		zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
 		if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
 			zh->zh_flags |= ZIL_REPLAY_NEEDED;
 		zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
 		if (os->os_encrypted)
 			os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
 	}
 
 	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
 	dmu_objset_disown(os, B_FALSE, FTAG);
 	return (0);
 }
 
 /*
  * Check the log by walking the log chain.
  * Checksum errors are ok as they indicate the end of the chain.
  * Any other error (no device or read failure) returns an error.
  */
 int
 zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
 {
 	(void) dp;
 	zilog_t *zilog;
 	objset_t *os;
 	blkptr_t *bp;
 	int error;
 
 	ASSERT(tx == NULL);
 
 	error = dmu_objset_from_ds(ds, &os);
 	if (error != 0) {
 		cmn_err(CE_WARN, "can't open objset %llu, error %d",
 		    (unsigned long long)ds->ds_object, error);
 		return (0);
 	}
 
 	zilog = dmu_objset_zil(os);
 	bp = (blkptr_t *)&zilog->zl_header->zh_log;
 
 	if (!BP_IS_HOLE(bp)) {
 		vdev_t *vd;
 		boolean_t valid = B_TRUE;
 
 		/*
 		 * Check the first block and determine if it's on a log device
 		 * which may have been removed or faulted prior to loading this
 		 * pool.  If so, there's no point in checking the rest of the
 		 * log as its content should have already been synced to the
 		 * pool.
 		 */
 		spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
 		vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
 		if (vd->vdev_islog && vdev_is_dead(vd))
 			valid = vdev_log_state_valid(vd);
 		spa_config_exit(os->os_spa, SCL_STATE, FTAG);
 
 		if (!valid)
 			return (0);
 
 		/*
 		 * Check whether the current uberblock is checkpointed (e.g.
 		 * we are rewinding) and whether the current header has been
 		 * claimed or not. If it hasn't then skip verifying it. We
 		 * do this because its ZIL blocks may be part of the pool's
 		 * state before the rewind, which is no longer valid.
 		 */
 		zil_header_t *zh = zil_header_in_syncing_context(zilog);
 		if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
 		    zh->zh_claim_txg == 0)
 			return (0);
 	}
 
 	/*
 	 * Because tx == NULL, zil_claim_log_block() will not actually claim
 	 * any blocks, but just determine whether it is possible to do so.
 	 * In addition to checking the log chain, zil_claim_log_block()
 	 * will invoke zio_claim() with a done func of spa_claim_notify(),
 	 * which will update spa_max_claim_txg.  See spa_load() for details.
 	 */
 	error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
 	    zilog->zl_header->zh_claim_txg ? -1ULL :
 	    spa_min_claim_txg(os->os_spa), B_FALSE);
 
 	return ((error == ECKSUM || error == ENOENT) ? 0 : error);
 }
 
 /*
  * When an itx is "skipped", this function is used to properly mark the
  * waiter as "done, and signal any thread(s) waiting on it. An itx can
  * be skipped (and not committed to an lwb) for a variety of reasons,
  * one of them being that the itx was committed via spa_sync(), prior to
  * it being committed to an lwb; this can happen if a thread calling
  * zil_commit() is racing with spa_sync().
  */
 static void
 zil_commit_waiter_skip(zil_commit_waiter_t *zcw)
 {
 	mutex_enter(&zcw->zcw_lock);
 	ASSERT3B(zcw->zcw_done, ==, B_FALSE);
 	zcw->zcw_done = B_TRUE;
 	cv_broadcast(&zcw->zcw_cv);
 	mutex_exit(&zcw->zcw_lock);
 }
 
 /*
  * This function is used when the given waiter is to be linked into an
  * lwb's "lwb_waiter" list; i.e. when the itx is committed to the lwb.
  * At this point, the waiter will no longer be referenced by the itx,
  * and instead, will be referenced by the lwb.
  */
 static void
 zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
 {
 	/*
 	 * The lwb_waiters field of the lwb is protected by the zilog's
 	 * zl_issuer_lock while the lwb is open and zl_lock otherwise.
 	 * zl_issuer_lock also protects leaving the open state.
 	 * zcw_lwb setting is protected by zl_issuer_lock and state !=
 	 * flush_done, which transition is protected by zl_lock.
 	 */
 	ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_issuer_lock));
 	IMPLY(lwb->lwb_state != LWB_STATE_OPENED,
 	    MUTEX_HELD(&lwb->lwb_zilog->zl_lock));
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW);
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
 
 	ASSERT(!list_link_active(&zcw->zcw_node));
 	list_insert_tail(&lwb->lwb_waiters, zcw);
 	ASSERT3P(zcw->zcw_lwb, ==, NULL);
 	zcw->zcw_lwb = lwb;
 }
 
 /*
  * This function is used when zio_alloc_zil() fails to allocate a ZIL
  * block, and the given waiter must be linked to the "nolwb waiters"
  * list inside of zil_process_commit_list().
  */
 static void
 zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
 {
 	ASSERT(!list_link_active(&zcw->zcw_node));
 	list_insert_tail(nolwb, zcw);
 	ASSERT3P(zcw->zcw_lwb, ==, NULL);
 }
 
 void
 zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
 {
 	avl_tree_t *t = &lwb->lwb_vdev_tree;
 	avl_index_t where;
 	zil_vdev_node_t *zv, zvsearch;
 	int ndvas = BP_GET_NDVAS(bp);
 	int i;
 
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
 
 	if (zil_nocacheflush)
 		return;
 
 	mutex_enter(&lwb->lwb_vdev_lock);
 	for (i = 0; i < ndvas; i++) {
 		zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
 		if (avl_find(t, &zvsearch, &where) == NULL) {
 			zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
 			zv->zv_vdev = zvsearch.zv_vdev;
 			avl_insert(t, zv, where);
 		}
 	}
 	mutex_exit(&lwb->lwb_vdev_lock);
 }
 
 static void
 zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
 {
 	avl_tree_t *src = &lwb->lwb_vdev_tree;
 	avl_tree_t *dst = &nlwb->lwb_vdev_tree;
 	void *cookie = NULL;
 	zil_vdev_node_t *zv;
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
 	ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
 	ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
 
 	/*
 	 * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does
 	 * not need the protection of lwb_vdev_lock (it will only be modified
 	 * while holding zilog->zl_lock) as its writes and those of its
 	 * children have all completed.  The younger 'nlwb' may be waiting on
 	 * future writes to additional vdevs.
 	 */
 	mutex_enter(&nlwb->lwb_vdev_lock);
 	/*
 	 * Tear down the 'lwb' vdev tree, ensuring that entries which do not
 	 * exist in 'nlwb' are moved to it, freeing any would-be duplicates.
 	 */
 	while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) {
 		avl_index_t where;
 
 		if (avl_find(dst, zv, &where) == NULL) {
 			avl_insert(dst, zv, where);
 		} else {
 			kmem_free(zv, sizeof (*zv));
 		}
 	}
 	mutex_exit(&nlwb->lwb_vdev_lock);
 }
 
 void
 zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
 {
 	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
 }
 
 /*
  * This function is a called after all vdevs associated with a given lwb
  * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
  * as the lwb write completes, if "zil_nocacheflush" is set. Further,
  * all "previous" lwb's will have completed before this function is
  * called; i.e. this function is called for all previous lwbs before
  * it's called for "this" lwb (enforced via zio the dependencies
  * configured in zil_lwb_set_zio_dependency()).
  *
  * The intention is for this function to be called as soon as the
  * contents of an lwb are considered "stable" on disk, and will survive
  * any sudden loss of power. At this point, any threads waiting for the
  * lwb to reach this state are signalled, and the "waiter" structures
  * are marked "done".
  */
 static void
 zil_lwb_flush_vdevs_done(zio_t *zio)
 {
 	lwb_t *lwb = zio->io_private;
 	zilog_t *zilog = lwb->lwb_zilog;
 	zil_commit_waiter_t *zcw;
 	itx_t *itx;
 
 	spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);
 
 	hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp;
 
 	mutex_enter(&zilog->zl_lock);
 
 	zilog->zl_last_lwb_latency = (zilog->zl_last_lwb_latency * 7 + t) / 8;
 
 	lwb->lwb_root_zio = NULL;
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
 	lwb->lwb_state = LWB_STATE_FLUSH_DONE;
 
 	if (zilog->zl_last_lwb_opened == lwb) {
 		/*
 		 * Remember the highest committed log sequence number
 		 * for ztest. We only update this value when all the log
 		 * writes succeeded, because ztest wants to ASSERT that
 		 * it got the whole log chain.
 		 */
 		zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
 	}
 
 	while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL)
 		zil_itx_destroy(itx);
 
 	while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) {
 		mutex_enter(&zcw->zcw_lock);
 
 		ASSERT3P(zcw->zcw_lwb, ==, lwb);
 		zcw->zcw_lwb = NULL;
 		/*
 		 * We expect any ZIO errors from child ZIOs to have been
 		 * propagated "up" to this specific LWB's root ZIO, in
 		 * order for this error handling to work correctly. This
 		 * includes ZIO errors from either this LWB's write or
 		 * flush, as well as any errors from other dependent LWBs
 		 * (e.g. a root LWB ZIO that might be a child of this LWB).
 		 *
 		 * With that said, it's important to note that LWB flush
 		 * errors are not propagated up to the LWB root ZIO.
 		 * This is incorrect behavior, and results in VDEV flush
 		 * errors not being handled correctly here. See the
 		 * comment above the call to "zio_flush" for details.
 		 */
 
 		zcw->zcw_zio_error = zio->io_error;
 
 		ASSERT3B(zcw->zcw_done, ==, B_FALSE);
 		zcw->zcw_done = B_TRUE;
 		cv_broadcast(&zcw->zcw_cv);
 
 		mutex_exit(&zcw->zcw_lock);
 	}
 
 	uint64_t txg = lwb->lwb_issued_txg;
 
 	/* Once we drop the lock, lwb may be freed by zil_sync(). */
 	mutex_exit(&zilog->zl_lock);
 
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0);
 	zilog->zl_lwb_inflight[txg & TXG_MASK]--;
 	if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0)
 		cv_broadcast(&zilog->zl_lwb_io_cv);
 	mutex_exit(&zilog->zl_lwb_io_lock);
 }
 
 /*
  * Wait for the completion of all issued write/flush of that txg provided.
  * It guarantees zil_lwb_flush_vdevs_done() is called and returned.
  */
 static void
 zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg)
 {
 	ASSERT3U(txg, ==, spa_syncing_txg(zilog->zl_spa));
 
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	while (zilog->zl_lwb_inflight[txg & TXG_MASK] > 0)
 		cv_wait(&zilog->zl_lwb_io_cv, &zilog->zl_lwb_io_lock);
 	mutex_exit(&zilog->zl_lwb_io_lock);
 
 #ifdef ZFS_DEBUG
 	mutex_enter(&zilog->zl_lock);
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	lwb_t *lwb = list_head(&zilog->zl_lwb_list);
 	while (lwb != NULL) {
 		if (lwb->lwb_issued_txg <= txg) {
 			ASSERT(lwb->lwb_state != LWB_STATE_ISSUED);
 			ASSERT(lwb->lwb_state != LWB_STATE_WRITE_DONE);
 			IMPLY(lwb->lwb_issued_txg > 0,
 			    lwb->lwb_state == LWB_STATE_FLUSH_DONE);
 		}
 		IMPLY(lwb->lwb_state == LWB_STATE_WRITE_DONE ||
 		    lwb->lwb_state == LWB_STATE_FLUSH_DONE,
 		    lwb->lwb_buf == NULL);
 		lwb = list_next(&zilog->zl_lwb_list, lwb);
 	}
 	mutex_exit(&zilog->zl_lwb_io_lock);
 	mutex_exit(&zilog->zl_lock);
 #endif
 }
 
 /*
  * This is called when an lwb's write zio completes. The callback's
  * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs
  * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved
  * in writing out this specific lwb's data, and in the case that cache
  * flushes have been deferred, vdevs involved in writing the data for
  * previous lwbs. The writes corresponding to all the vdevs in the
  * lwb_vdev_tree will have completed by the time this is called, due to
  * the zio dependencies configured in zil_lwb_set_zio_dependency(),
  * which takes deferred flushes into account. The lwb will be "done"
  * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio
  * completion callback for the lwb's root zio.
  */
 static void
 zil_lwb_write_done(zio_t *zio)
 {
 	lwb_t *lwb = zio->io_private;
 	spa_t *spa = zio->io_spa;
 	zilog_t *zilog = lwb->lwb_zilog;
 	avl_tree_t *t = &lwb->lwb_vdev_tree;
 	void *cookie = NULL;
 	zil_vdev_node_t *zv;
 	lwb_t *nlwb;
 
 	ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
 
 	abd_free(zio->io_abd);
 	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 	lwb->lwb_buf = NULL;
 
 	mutex_enter(&zilog->zl_lock);
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
 	lwb->lwb_state = LWB_STATE_WRITE_DONE;
 	lwb->lwb_child_zio = NULL;
 	lwb->lwb_write_zio = NULL;
 
 	/*
 	 * If nlwb is not yet issued, zil_lwb_set_zio_dependency() is not
 	 * called for it yet, and when it will be, it won't be able to make
 	 * its write ZIO a parent this ZIO.  In such case we can not defer
 	 * our flushes or below may be a race between the done callbacks.
 	 */
 	nlwb = list_next(&zilog->zl_lwb_list, lwb);
 	if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED)
 		nlwb = NULL;
 	mutex_exit(&zilog->zl_lock);
 
 	if (avl_numnodes(t) == 0)
 		return;
 
 	/*
 	 * If there was an IO error, we're not going to call zio_flush()
 	 * on these vdevs, so we simply empty the tree and free the
 	 * nodes. We avoid calling zio_flush() since there isn't any
 	 * good reason for doing so, after the lwb block failed to be
 	 * written out.
 	 *
 	 * Additionally, we don't perform any further error handling at
 	 * this point (e.g. setting "zcw_zio_error" appropriately), as
 	 * we expect that to occur in "zil_lwb_flush_vdevs_done" (thus,
 	 * we expect any error seen here, to have been propagated to
 	 * that function).
 	 */
 	if (zio->io_error != 0) {
 		while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
 			kmem_free(zv, sizeof (*zv));
 		return;
 	}
 
 	/*
 	 * If this lwb does not have any threads waiting for it to
 	 * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE
 	 * command to the vdevs written to by "this" lwb, and instead
 	 * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE
 	 * command for those vdevs. Thus, we merge the vdev tree of
 	 * "this" lwb with the vdev tree of the "next" lwb in the list,
 	 * and assume the "next" lwb will handle flushing the vdevs (or
 	 * deferring the flush(s) again).
 	 *
 	 * This is a useful performance optimization, especially for
 	 * workloads with lots of async write activity and few sync
 	 * write and/or fsync activity, as it has the potential to
 	 * coalesce multiple flush commands to a vdev into one.
 	 */
 	if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) {
 		zil_lwb_flush_defer(lwb, nlwb);
 		ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
 		return;
 	}
 
 	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
 		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
 		if (vd != NULL) {
 			/*
 			 * The "ZIO_FLAG_DONT_PROPAGATE" is currently
 			 * always used within "zio_flush". This means,
 			 * any errors when flushing the vdev(s), will
 			 * (unfortunately) not be handled correctly,
 			 * since these "zio_flush" errors will not be
 			 * propagated up to "zil_lwb_flush_vdevs_done".
 			 */
 			zio_flush(lwb->lwb_root_zio, vd);
 		}
 		kmem_free(zv, sizeof (*zv));
 	}
 }
 
 /*
  * Build the zio dependency chain, which is used to preserve the ordering of
  * lwb completions that is required by the semantics of the ZIL. Each new lwb
  * zio becomes a parent of the previous lwb zio, such that the new lwb's zio
  * cannot complete until the previous lwb's zio completes.
  *
  * This is required by the semantics of zil_commit(): the commit waiters
  * attached to the lwbs will be woken in the lwb zio's completion callback,
  * so this zio dependency graph ensures the waiters are woken in the correct
  * order (the same order the lwbs were created).
  */
 static void
 zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
 {
 	ASSERT(MUTEX_HELD(&zilog->zl_lock));
 
 	lwb_t *prev_lwb = list_prev(&zilog->zl_lwb_list, lwb);
 	if (prev_lwb == NULL ||
 	    prev_lwb->lwb_state == LWB_STATE_FLUSH_DONE)
 		return;
 
 	/*
 	 * If the previous lwb's write hasn't already completed, we also want
 	 * to order the completion of the lwb write zios (above, we only order
 	 * the completion of the lwb root zios). This is required because of
 	 * how we can defer the DKIOCFLUSHWRITECACHE commands for each lwb.
 	 *
 	 * When the DKIOCFLUSHWRITECACHE commands are deferred, the previous
 	 * lwb will rely on this lwb to flush the vdevs written to by that
 	 * previous lwb. Thus, we need to ensure this lwb doesn't issue the
 	 * flush until after the previous lwb's write completes. We ensure
 	 * this ordering by setting the zio parent/child relationship here.
 	 *
 	 * Without this relationship on the lwb's write zio, it's possible
 	 * for this lwb's write to complete prior to the previous lwb's write
 	 * completing; and thus, the vdevs for the previous lwb would be
 	 * flushed prior to that lwb's data being written to those vdevs (the
 	 * vdevs are flushed in the lwb write zio's completion handler,
 	 * zil_lwb_write_done()).
 	 */
 	if (prev_lwb->lwb_state == LWB_STATE_ISSUED) {
 		ASSERT3P(prev_lwb->lwb_write_zio, !=, NULL);
 		zio_add_child(lwb->lwb_write_zio, prev_lwb->lwb_write_zio);
 	} else {
 		ASSERT3S(prev_lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
 	}
 
 	ASSERT3P(prev_lwb->lwb_root_zio, !=, NULL);
 	zio_add_child(lwb->lwb_root_zio, prev_lwb->lwb_root_zio);
 }
 
 
 /*
  * This function's purpose is to "open" an lwb such that it is ready to
  * accept new itxs being committed to it. This function is idempotent; if
  * the passed in lwb has already been opened, it is essentially a no-op.
  */
 static void
 zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
 {
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
 	if (lwb->lwb_state != LWB_STATE_NEW) {
 		ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
 		return;
 	}
 
 	mutex_enter(&zilog->zl_lock);
 	lwb->lwb_state = LWB_STATE_OPENED;
 	zilog->zl_last_lwb_opened = lwb;
 	mutex_exit(&zilog->zl_lock);
 }
 
 /*
  * Maximum block size used by the ZIL.  This is picked up when the ZIL is
  * initialized.  Otherwise this should not be used directly; see
  * zl_max_block_size instead.
  */
 static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
 
 /*
  * Plan splitting of the provided burst size between several blocks.
  */
 static uint_t
 zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize)
 {
 	uint_t md = zilog->zl_max_block_size - sizeof (zil_chain_t);
 
 	if (size <= md) {
 		/*
 		 * Small bursts are written as-is in one block.
 		 */
 		*minsize = size;
 		return (size);
 	} else if (size > 8 * md) {
 		/*
 		 * Big bursts use maximum blocks.  The first block size
 		 * is hard to predict, but it does not really matter.
 		 */
 		*minsize = 0;
 		return (md);
 	}
 
 	/*
 	 * Medium bursts try to divide evenly to better utilize several SLOG
 	 * VDEVs.  The first block size we predict assuming the worst case of
 	 * maxing out others.  Fall back to using maximum blocks if due to
 	 * large records or wasted space we can not predict anything better.
 	 */
 	uint_t s = size;
 	uint_t n = DIV_ROUND_UP(s, md - sizeof (lr_write_t));
 	uint_t chunk = DIV_ROUND_UP(s, n);
 	uint_t waste = zil_max_waste_space(zilog);
 	waste = MAX(waste, zilog->zl_cur_max);
 	if (chunk <= md - waste) {
 		*minsize = MAX(s - (md - waste) * (n - 1), waste);
 		return (chunk);
 	} else {
 		*minsize = 0;
 		return (md);
 	}
 }
 
 /*
  * Try to predict next block size based on previous history.  Make prediction
  * sufficient for 7 of 8 previous bursts.  Don't try to save if the saving is
  * less then 50%, extra writes may cost more, but we don't want single spike
  * to badly affect our predictions.
  */
 static uint_t
 zil_lwb_predict(zilog_t *zilog)
 {
 	uint_t m, o;
 
 	/* If we are in the middle of a burst, take it into account also. */
 	if (zilog->zl_cur_size > 0) {
 		o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m);
 	} else {
 		o = UINT_MAX;
 		m = 0;
 	}
 
 	/* Find minimum optimal size.  We don't need to go below that. */
 	for (int i = 0; i < ZIL_BURSTS; i++)
 		o = MIN(o, zilog->zl_prev_opt[i]);
 
 	/* Find two biggest minimal first block sizes above the optimal. */
 	uint_t m1 = MAX(m, o), m2 = o;
 	for (int i = 0; i < ZIL_BURSTS; i++) {
 		m = zilog->zl_prev_min[i];
 		if (m >= m1) {
 			m2 = m1;
 			m1 = m;
 		} else if (m > m2) {
 			m2 = m;
 		}
 	}
 
 	/*
 	 * If second minimum size gives 50% saving -- use it.  It may cost us
 	 * one additional write later, but the space saving is just too big.
 	 */
 	return ((m1 < m2 * 2) ? m1 : m2);
 }
 
 /*
  * Close the log block for being issued and allocate the next one.
  * Has to be called under zl_issuer_lock to chain more lwbs.
  */
 static lwb_t *
 zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
 {
 	uint64_t blksz, plan, plan2;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
 	lwb->lwb_state = LWB_STATE_CLOSED;
 
 	/*
 	 * If there was an allocation failure then returned NULL will trigger
 	 * zil_commit_writer_stall() at the caller.  This is inherently racy,
 	 * since allocation may not have happened yet.
 	 */
 	if (lwb->lwb_error != 0)
 		return (NULL);
 
 	/*
 	 * Log blocks are pre-allocated.  Here we select the size of the next
 	 * block, based on what's left of this burst and the previous history.
 	 * While we try to only write used part of the block, we can't just
 	 * always allocate the maximum block size because we can exhaust all
 	 * available pool log space, so we try to be reasonable.
 	 */
 	if (zilog->zl_cur_left > 0) {
 		/*
 		 * We are in the middle of a burst and know how much is left.
 		 * But if workload is multi-threaded there may be more soon.
 		 * Try to predict what can it be and plan for the worst case.
 		 */
 		uint_t m;
 		plan = zil_lwb_plan(zilog, zilog->zl_cur_left, &m);
 		if (zilog->zl_parallel) {
 			plan2 = zil_lwb_plan(zilog, zilog->zl_cur_left +
 			    zil_lwb_predict(zilog), &m);
 			if (plan < plan2)
 				plan = plan2;
 		}
 	} else {
 		/*
 		 * The previous burst is done and we can only predict what
 		 * will come next.
 		 */
 		plan = zil_lwb_predict(zilog);
 	}
 	blksz = plan + sizeof (zil_chain_t);
 	blksz = P2ROUNDUP_TYPED(blksz, ZIL_MIN_BLKSZ, uint64_t);
 	blksz = MIN(blksz, zilog->zl_max_block_size);
 	DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, blksz,
 	    uint64_t, plan);
 
 	return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state));
 }
 
 /*
  * Finalize previously closed block and issue the write zio.
  */
 static void
 zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
 {
 	spa_t *spa = zilog->zl_spa;
 	zil_chain_t *zilc;
 	boolean_t slog;
 	zbookmark_phys_t zb;
 	zio_priority_t prio;
 	int error;
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
 
 	/* Actually fill the lwb with the data. */
 	for (itx_t *itx = list_head(&lwb->lwb_itxs); itx;
 	    itx = list_next(&lwb->lwb_itxs, itx))
 		zil_lwb_commit(zilog, lwb, itx);
 	lwb->lwb_nused = lwb->lwb_nfilled;
 	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax);
 
 	lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb,
 	    ZIO_FLAG_CANFAIL);
 
 	/*
 	 * The lwb is now ready to be issued, but it can be only if it already
 	 * got its block pointer allocated or the allocation has failed.
 	 * Otherwise leave it as-is, relying on some other thread to issue it
 	 * after allocating its block pointer via calling zil_lwb_write_issue()
 	 * for the previous lwb(s) in the chain.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	lwb->lwb_state = LWB_STATE_READY;
 	if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) {
 		mutex_exit(&zilog->zl_lock);
 		return;
 	}
 	mutex_exit(&zilog->zl_lock);
 
 next_lwb:
 	if (lwb->lwb_slim)
 		zilc = (zil_chain_t *)lwb->lwb_buf;
 	else
 		zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax);
 	int wsz = lwb->lwb_sz;
 	if (lwb->lwb_error == 0) {
 		abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz);
 		if (!lwb->lwb_slog || zilog->zl_cur_size <= zil_slog_bulk)
 			prio = ZIO_PRIORITY_SYNC_WRITE;
 		else
 			prio = ZIO_PRIORITY_ASYNC_WRITE;
 		SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 		    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
 		    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
 		lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0,
 		    &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done,
 		    lwb, prio, ZIO_FLAG_CANFAIL, &zb);
 		zil_lwb_add_block(lwb, &lwb->lwb_blk);
 
 		if (lwb->lwb_slim) {
 			/* For Slim ZIL only write what is used. */
 			wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ,
 			    int);
 			ASSERT3S(wsz, <=, lwb->lwb_sz);
 			zio_shrink(lwb->lwb_write_zio, wsz);
 			wsz = lwb->lwb_write_zio->io_size;
 		}
 		memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused);
 		zilc->zc_pad = 0;
 		zilc->zc_nused = lwb->lwb_nused;
 		zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
 	} else {
 		/*
 		 * We can't write the lwb if there was an allocation failure,
 		 * so create a null zio instead just to maintain dependencies.
 		 */
 		lwb->lwb_write_zio = zio_null(lwb->lwb_root_zio, spa, NULL,
 		    zil_lwb_write_done, lwb, ZIO_FLAG_CANFAIL);
 		lwb->lwb_write_zio->io_error = lwb->lwb_error;
 	}
 	if (lwb->lwb_child_zio)
 		zio_add_child(lwb->lwb_write_zio, lwb->lwb_child_zio);
 
 	/*
 	 * Open transaction to allocate the next block pointer.
 	 */
 	dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	uint64_t txg = dmu_tx_get_txg(tx);
 
 	/*
 	 * Allocate next the block pointer unless we are already in error.
 	 */
 	lwb_t *nlwb = list_next(&zilog->zl_lwb_list, lwb);
 	blkptr_t *bp = &zilc->zc_next_blk;
 	BP_ZERO(bp);
 	error = lwb->lwb_error;
 	if (error == 0) {
 		error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz,
 		    &slog);
 	}
 	if (error == 0) {
-		ASSERT3U(bp->blk_birth, ==, txg);
+		ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), ==, txg);
 		BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 :
 		    ZIO_CHECKSUM_ZILOG);
 		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
 		bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
 	}
 
 	/*
 	 * Reduce TXG open time by incrementing inflight counter and committing
 	 * the transaciton.  zil_sync() will wait for it to return to zero.
 	 */
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	lwb->lwb_issued_txg = txg;
 	zilog->zl_lwb_inflight[txg & TXG_MASK]++;
 	zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg);
 	mutex_exit(&zilog->zl_lwb_io_lock);
 	dmu_tx_commit(tx);
 
 	spa_config_enter(spa, SCL_STATE, lwb, RW_READER);
 
 	/*
 	 * We've completed all potentially blocking operations.  Update the
 	 * nlwb and allow it proceed without possible lock order reversals.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	zil_lwb_set_zio_dependency(zilog, lwb);
 	lwb->lwb_state = LWB_STATE_ISSUED;
 
 	if (nlwb) {
 		nlwb->lwb_blk = *bp;
 		nlwb->lwb_error = error;
 		nlwb->lwb_slog = slog;
 		nlwb->lwb_alloc_txg = txg;
 		if (nlwb->lwb_state != LWB_STATE_READY)
 			nlwb = NULL;
 	}
 	mutex_exit(&zilog->zl_lock);
 
 	if (lwb->lwb_slog) {
 		ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes,
 		    lwb->lwb_nused);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_write,
 		    wsz);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_alloc,
 		    BP_GET_LSIZE(&lwb->lwb_blk));
 	} else {
 		ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes,
 		    lwb->lwb_nused);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_write,
 		    wsz);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc,
 		    BP_GET_LSIZE(&lwb->lwb_blk));
 	}
 	lwb->lwb_issued_timestamp = gethrtime();
 	if (lwb->lwb_child_zio)
 		zio_nowait(lwb->lwb_child_zio);
 	zio_nowait(lwb->lwb_write_zio);
 	zio_nowait(lwb->lwb_root_zio);
 
 	/*
 	 * If nlwb was ready when we gave it the block pointer,
 	 * it is on us to issue it and possibly following ones.
 	 */
 	lwb = nlwb;
 	if (lwb)
 		goto next_lwb;
 }
 
 /*
  * Maximum amount of data that can be put into single log block.
  */
 uint64_t
 zil_max_log_data(zilog_t *zilog, size_t hdrsize)
 {
 	return (zilog->zl_max_block_size - sizeof (zil_chain_t) - hdrsize);
 }
 
 /*
  * Maximum amount of log space we agree to waste to reduce number of
  * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~6%).
  */
 static inline uint64_t
 zil_max_waste_space(zilog_t *zilog)
 {
 	return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 16);
 }
 
 /*
  * Maximum amount of write data for WR_COPIED.  For correctness, consumers
  * must fall back to WR_NEED_COPY if we can't fit the entire record into one
  * maximum sized log block, because each WR_COPIED record must fit in a
  * single log block.  Below that it is a tradeoff of additional memory copy
  * and possibly worse log space efficiency vs additional range lock/unlock.
  */
 static uint_t zil_maxcopied = 7680;
 
 uint64_t
 zil_max_copied_data(zilog_t *zilog)
 {
 	uint64_t max_data = zil_max_log_data(zilog, sizeof (lr_write_t));
 	return (MIN(max_data, zil_maxcopied));
 }
 
 static uint64_t
 zil_itx_record_size(itx_t *itx)
 {
 	lr_t *lr = &itx->itx_lr;
 
 	if (lr->lrc_txtype == TX_COMMIT)
 		return (0);
 	ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t));
 	return (lr->lrc_reclen);
 }
 
 static uint64_t
 zil_itx_data_size(itx_t *itx)
 {
 	lr_t *lr = &itx->itx_lr;
 	lr_write_t *lrw = (lr_write_t *)lr;
 
 	if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
 		ASSERT3U(lr->lrc_reclen, ==, sizeof (lr_write_t));
 		return (P2ROUNDUP_TYPED(lrw->lr_length, sizeof (uint64_t),
 		    uint64_t));
 	}
 	return (0);
 }
 
 static uint64_t
 zil_itx_full_size(itx_t *itx)
 {
 	lr_t *lr = &itx->itx_lr;
 
 	if (lr->lrc_txtype == TX_COMMIT)
 		return (0);
 	ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t));
 	return (lr->lrc_reclen + zil_itx_data_size(itx));
 }
 
 /*
  * Estimate space needed in the lwb for the itx.  Allocate more lwbs or
  * split the itx as needed, but don't touch the actual transaction data.
  * Has to be called under zl_issuer_lock to call zil_lwb_write_close()
  * to chain more lwbs.
  */
 static lwb_t *
 zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs)
 {
 	itx_t *citx;
 	lr_t *lr, *clr;
 	lr_write_t *lrw;
 	uint64_t dlen, dnow, lwb_sp, reclen, max_log_data;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT3P(lwb, !=, NULL);
 	ASSERT3P(lwb->lwb_buf, !=, NULL);
 
 	zil_lwb_write_open(zilog, lwb);
 
 	lr = &itx->itx_lr;
 	lrw = (lr_write_t *)lr;
 
 	/*
 	 * A commit itx doesn't represent any on-disk state; instead
 	 * it's simply used as a place holder on the commit list, and
 	 * provides a mechanism for attaching a "commit waiter" onto the
 	 * correct lwb (such that the waiter can be signalled upon
 	 * completion of that lwb). Thus, we don't process this itx's
 	 * log record if it's a commit itx (these itx's don't have log
 	 * records), and instead link the itx's waiter onto the lwb's
 	 * list of waiters.
 	 *
 	 * For more details, see the comment above zil_commit().
 	 */
 	if (lr->lrc_txtype == TX_COMMIT) {
 		zil_commit_waiter_link_lwb(itx->itx_private, lwb);
 		list_insert_tail(&lwb->lwb_itxs, itx);
 		return (lwb);
 	}
 
 	reclen = lr->lrc_reclen;
 	ASSERT3U(reclen, >=, sizeof (lr_t));
 	ASSERT3U(reclen, <=, zil_max_log_data(zilog, 0));
 	dlen = zil_itx_data_size(itx);
 
 cont:
 	/*
 	 * If this record won't fit in the current log block, start a new one.
 	 * For WR_NEED_COPY optimize layout for minimal number of chunks.
 	 */
 	lwb_sp = lwb->lwb_nmax - lwb->lwb_nused;
 	max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t));
 	if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
 	    lwb_sp < zil_max_waste_space(zilog) &&
 	    (dlen % max_log_data == 0 ||
 	    lwb_sp < reclen + dlen % max_log_data))) {
 		list_insert_tail(ilwbs, lwb);
 		lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED);
 		if (lwb == NULL)
 			return (NULL);
 		lwb_sp = lwb->lwb_nmax - lwb->lwb_nused;
 	}
 
 	/*
 	 * There must be enough space in the log block to hold reclen.
 	 * For WR_COPIED, we need to fit the whole record in one block,
 	 * and reclen is the write record header size + the data size.
 	 * For WR_NEED_COPY, we can create multiple records, splitting
 	 * the data into multiple blocks, so we only need to fit one
 	 * word of data per block; in this case reclen is just the header
 	 * size (no data).
 	 */
 	ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
 
 	dnow = MIN(dlen, lwb_sp - reclen);
 	if (dlen > dnow) {
 		ASSERT3U(lr->lrc_txtype, ==, TX_WRITE);
 		ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY);
 		citx = zil_itx_clone(itx);
 		clr = &citx->itx_lr;
 		lr_write_t *clrw = (lr_write_t *)clr;
 		clrw->lr_length = dnow;
 		lrw->lr_offset += dnow;
 		lrw->lr_length -= dnow;
 		zilog->zl_cur_left -= dnow;
 	} else {
 		citx = itx;
 		clr = lr;
 	}
 
 	/*
 	 * We're actually making an entry, so update lrc_seq to be the
 	 * log record sequence number.  Note that this is generally not
 	 * equal to the itx sequence number because not all transactions
 	 * are synchronous, and sometimes spa_sync() gets there first.
 	 */
 	clr->lrc_seq = ++zilog->zl_lr_seq;
 
 	lwb->lwb_nused += reclen + dnow;
 	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax);
 	ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
 
 	zil_lwb_add_txg(lwb, lr->lrc_txg);
 	list_insert_tail(&lwb->lwb_itxs, citx);
 
 	dlen -= dnow;
 	if (dlen > 0)
 		goto cont;
 
 	if (lr->lrc_txtype == TX_WRITE &&
 	    lr->lrc_txg > spa_freeze_txg(zilog->zl_spa))
 		txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg);
 
 	return (lwb);
 }
 
 /*
  * Fill the actual transaction data into the lwb, following zil_lwb_assign().
  * Does not require locking.
  */
 static void
 zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx)
 {
 	lr_t *lr, *lrb;
 	lr_write_t *lrw, *lrwb;
 	char *lr_buf;
 	uint64_t dlen, reclen;
 
 	lr = &itx->itx_lr;
 	lrw = (lr_write_t *)lr;
 
 	if (lr->lrc_txtype == TX_COMMIT)
 		return;
 
 	reclen = lr->lrc_reclen;
 	dlen = zil_itx_data_size(itx);
 	ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled);
 
 	lr_buf = lwb->lwb_buf + lwb->lwb_nfilled;
 	memcpy(lr_buf, lr, reclen);
 	lrb = (lr_t *)lr_buf;		/* Like lr, but inside lwb. */
 	lrwb = (lr_write_t *)lrb;	/* Like lrw, but inside lwb. */
 
 	ZIL_STAT_BUMP(zilog, zil_itx_count);
 
 	/*
 	 * If it's a write, fetch the data or get its blkptr as appropriate.
 	 */
 	if (lr->lrc_txtype == TX_WRITE) {
 		if (itx->itx_wr_state == WR_COPIED) {
 			ZIL_STAT_BUMP(zilog, zil_itx_copied_count);
 			ZIL_STAT_INCR(zilog, zil_itx_copied_bytes,
 			    lrw->lr_length);
 		} else {
 			char *dbuf;
 			int error;
 
 			if (itx->itx_wr_state == WR_NEED_COPY) {
 				dbuf = lr_buf + reclen;
 				lrb->lrc_reclen += dlen;
 				ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count);
 				ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes,
 				    dlen);
 			} else {
 				ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT);
 				dbuf = NULL;
 				ZIL_STAT_BUMP(zilog, zil_itx_indirect_count);
 				ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes,
 				    lrw->lr_length);
 				if (lwb->lwb_child_zio == NULL) {
 					lwb->lwb_child_zio = zio_null(NULL,
 					    zilog->zl_spa, NULL, NULL, NULL,
 					    ZIO_FLAG_CANFAIL);
 				}
 			}
 
 			/*
 			 * The "lwb_child_zio" we pass in will become a child of
 			 * "lwb_write_zio", when one is created, so one will be
 			 * a parent of any zio's created by the "zl_get_data".
 			 * This way "lwb_write_zio" will first wait for children
 			 * block pointers before own writing, and then for their
 			 * writing completion before the vdev cache flushing.
 			 */
 			error = zilog->zl_get_data(itx->itx_private,
 			    itx->itx_gen, lrwb, dbuf, lwb,
 			    lwb->lwb_child_zio);
 			if (dbuf != NULL && error == 0) {
 				/* Zero any padding bytes in the last block. */
 				memset((char *)dbuf + lrwb->lr_length, 0,
 				    dlen - lrwb->lr_length);
 			}
 
 			/*
 			 * Typically, the only return values we should see from
 			 * ->zl_get_data() are 0, EIO, ENOENT, EEXIST or
 			 *  EALREADY. However, it is also possible to see other
 			 *  error values such as ENOSPC or EINVAL from
 			 *  dmu_read() -> dnode_hold() -> dnode_hold_impl() or
 			 *  ENXIO as well as a multitude of others from the
 			 *  block layer through dmu_buf_hold() -> dbuf_read()
 			 *  -> zio_wait(), as well as through dmu_read() ->
 			 *  dnode_hold() -> dnode_hold_impl() -> dbuf_read() ->
 			 *  zio_wait(). When these errors happen, we can assume
 			 *  that neither an immediate write nor an indirect
 			 *  write occurred, so we need to fall back to
 			 *  txg_wait_synced(). This is unusual, so we print to
 			 *  dmesg whenever one of these errors occurs.
 			 */
 			switch (error) {
 			case 0:
 				break;
 			default:
 				cmn_err(CE_WARN, "zil_lwb_commit() received "
 				    "unexpected error %d from ->zl_get_data()"
 				    ". Falling back to txg_wait_synced().",
 				    error);
 				zfs_fallthrough;
 			case EIO:
 				txg_wait_synced(zilog->zl_dmu_pool,
 				    lr->lrc_txg);
 				zfs_fallthrough;
 			case ENOENT:
 				zfs_fallthrough;
 			case EEXIST:
 				zfs_fallthrough;
 			case EALREADY:
 				return;
 			}
 		}
 	}
 
 	lwb->lwb_nfilled += reclen + dlen;
 	ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused);
 	ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t)));
 }
 
 itx_t *
 zil_itx_create(uint64_t txtype, size_t olrsize)
 {
 	size_t itxsize, lrsize;
 	itx_t *itx;
 
 	ASSERT3U(olrsize, >=, sizeof (lr_t));
 	lrsize = P2ROUNDUP_TYPED(olrsize, sizeof (uint64_t), size_t);
 	ASSERT3U(lrsize, >=, olrsize);
 	itxsize = offsetof(itx_t, itx_lr) + lrsize;
 
 	itx = zio_data_buf_alloc(itxsize);
 	itx->itx_lr.lrc_txtype = txtype;
 	itx->itx_lr.lrc_reclen = lrsize;
 	itx->itx_lr.lrc_seq = 0;	/* defensive */
 	memset((char *)&itx->itx_lr + olrsize, 0, lrsize - olrsize);
 	itx->itx_sync = B_TRUE;		/* default is synchronous */
 	itx->itx_callback = NULL;
 	itx->itx_callback_data = NULL;
 	itx->itx_size = itxsize;
 
 	return (itx);
 }
 
 static itx_t *
 zil_itx_clone(itx_t *oitx)
 {
 	ASSERT3U(oitx->itx_size, >=, sizeof (itx_t));
 	ASSERT3U(oitx->itx_size, ==,
 	    offsetof(itx_t, itx_lr) + oitx->itx_lr.lrc_reclen);
 
 	itx_t *itx = zio_data_buf_alloc(oitx->itx_size);
 	memcpy(itx, oitx, oitx->itx_size);
 	itx->itx_callback = NULL;
 	itx->itx_callback_data = NULL;
 	return (itx);
 }
 
 void
 zil_itx_destroy(itx_t *itx)
 {
 	ASSERT3U(itx->itx_size, >=, sizeof (itx_t));
 	ASSERT3U(itx->itx_lr.lrc_reclen, ==,
 	    itx->itx_size - offsetof(itx_t, itx_lr));
 	IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL);
 	IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
 
 	if (itx->itx_callback != NULL)
 		itx->itx_callback(itx->itx_callback_data);
 
 	zio_data_buf_free(itx, itx->itx_size);
 }
 
 /*
  * Free up the sync and async itxs. The itxs_t has already been detached
  * so no locks are needed.
  */
 static void
 zil_itxg_clean(void *arg)
 {
 	itx_t *itx;
 	list_t *list;
 	avl_tree_t *t;
 	void *cookie;
 	itxs_t *itxs = arg;
 	itx_async_node_t *ian;
 
 	list = &itxs->i_sync_list;
 	while ((itx = list_remove_head(list)) != NULL) {
 		/*
 		 * In the general case, commit itxs will not be found
 		 * here, as they'll be committed to an lwb via
 		 * zil_lwb_assign(), and free'd in that function. Having
 		 * said that, it is still possible for commit itxs to be
 		 * found here, due to the following race:
 		 *
 		 *	- a thread calls zil_commit() which assigns the
 		 *	  commit itx to a per-txg i_sync_list
 		 *	- zil_itxg_clean() is called (e.g. via spa_sync())
 		 *	  while the waiter is still on the i_sync_list
 		 *
 		 * There's nothing to prevent syncing the txg while the
 		 * waiter is on the i_sync_list. This normally doesn't
 		 * happen because spa_sync() is slower than zil_commit(),
 		 * but if zil_commit() calls txg_wait_synced() (e.g.
 		 * because zil_create() or zil_commit_writer_stall() is
 		 * called) we will hit this case.
 		 */
 		if (itx->itx_lr.lrc_txtype == TX_COMMIT)
 			zil_commit_waiter_skip(itx->itx_private);
 
 		zil_itx_destroy(itx);
 	}
 
 	cookie = NULL;
 	t = &itxs->i_async_tree;
 	while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
 		list = &ian->ia_list;
 		while ((itx = list_remove_head(list)) != NULL) {
 			/* commit itxs should never be on the async lists. */
 			ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
 			zil_itx_destroy(itx);
 		}
 		list_destroy(list);
 		kmem_free(ian, sizeof (itx_async_node_t));
 	}
 	avl_destroy(t);
 
 	kmem_free(itxs, sizeof (itxs_t));
 }
 
 static int
 zil_aitx_compare(const void *x1, const void *x2)
 {
 	const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
 	const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
 
 	return (TREE_CMP(o1, o2));
 }
 
 /*
  * Remove all async itx with the given oid.
  */
 void
 zil_remove_async(zilog_t *zilog, uint64_t oid)
 {
 	uint64_t otxg, txg;
 	itx_async_node_t *ian, ian_search;
 	avl_tree_t *t;
 	avl_index_t where;
 	list_t clean_list;
 	itx_t *itx;
 
 	ASSERT(oid != 0);
 	list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
 
 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
 		otxg = ZILTEST_TXG;
 	else
 		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
 
 	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
 		mutex_enter(&itxg->itxg_lock);
 		if (itxg->itxg_txg != txg) {
 			mutex_exit(&itxg->itxg_lock);
 			continue;
 		}
 
 		/*
 		 * Locate the object node and append its list.
 		 */
 		t = &itxg->itxg_itxs->i_async_tree;
 		ian_search.ia_foid = oid;
 		ian = avl_find(t, &ian_search, &where);
 		if (ian != NULL)
 			list_move_tail(&clean_list, &ian->ia_list);
 		mutex_exit(&itxg->itxg_lock);
 	}
 	while ((itx = list_remove_head(&clean_list)) != NULL) {
 		/* commit itxs should never be on the async lists. */
 		ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
 		zil_itx_destroy(itx);
 	}
 	list_destroy(&clean_list);
 }
 
 void
 zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
 {
 	uint64_t txg;
 	itxg_t *itxg;
 	itxs_t *itxs, *clean = NULL;
 
 	/*
 	 * Ensure the data of a renamed file is committed before the rename.
 	 */
 	if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
 		zil_async_to_sync(zilog, itx->itx_oid);
 
 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
 		txg = ZILTEST_TXG;
 	else
 		txg = dmu_tx_get_txg(tx);
 
 	itxg = &zilog->zl_itxg[txg & TXG_MASK];
 	mutex_enter(&itxg->itxg_lock);
 	itxs = itxg->itxg_itxs;
 	if (itxg->itxg_txg != txg) {
 		if (itxs != NULL) {
 			/*
 			 * The zil_clean callback hasn't got around to cleaning
 			 * this itxg. Save the itxs for release below.
 			 * This should be rare.
 			 */
 			zfs_dbgmsg("zil_itx_assign: missed itx cleanup for "
 			    "txg %llu", (u_longlong_t)itxg->itxg_txg);
 			clean = itxg->itxg_itxs;
 		}
 		itxg->itxg_txg = txg;
 		itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t),
 		    KM_SLEEP);
 
 		list_create(&itxs->i_sync_list, sizeof (itx_t),
 		    offsetof(itx_t, itx_node));
 		avl_create(&itxs->i_async_tree, zil_aitx_compare,
 		    sizeof (itx_async_node_t),
 		    offsetof(itx_async_node_t, ia_node));
 	}
 	if (itx->itx_sync) {
 		list_insert_tail(&itxs->i_sync_list, itx);
 	} else {
 		avl_tree_t *t = &itxs->i_async_tree;
 		uint64_t foid =
 		    LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid);
 		itx_async_node_t *ian;
 		avl_index_t where;
 
 		ian = avl_find(t, &foid, &where);
 		if (ian == NULL) {
 			ian = kmem_alloc(sizeof (itx_async_node_t),
 			    KM_SLEEP);
 			list_create(&ian->ia_list, sizeof (itx_t),
 			    offsetof(itx_t, itx_node));
 			ian->ia_foid = foid;
 			avl_insert(t, ian, where);
 		}
 		list_insert_tail(&ian->ia_list, itx);
 	}
 
 	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
 
 	/*
 	 * We don't want to dirty the ZIL using ZILTEST_TXG, because
 	 * zil_clean() will never be called using ZILTEST_TXG. Thus, we
 	 * need to be careful to always dirty the ZIL using the "real"
 	 * TXG (not itxg_txg) even when the SPA is frozen.
 	 */
 	zilog_dirty(zilog, dmu_tx_get_txg(tx));
 	mutex_exit(&itxg->itxg_lock);
 
 	/* Release the old itxs now we've dropped the lock */
 	if (clean != NULL)
 		zil_itxg_clean(clean);
 }
 
 /*
  * If there are any in-memory intent log transactions which have now been
  * synced then start up a taskq to free them. We should only do this after we
  * have written out the uberblocks (i.e. txg has been committed) so that
  * don't inadvertently clean out in-memory log records that would be required
  * by zil_commit().
  */
 void
 zil_clean(zilog_t *zilog, uint64_t synced_txg)
 {
 	itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
 	itxs_t *clean_me;
 
 	ASSERT3U(synced_txg, <, ZILTEST_TXG);
 
 	mutex_enter(&itxg->itxg_lock);
 	if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
 		mutex_exit(&itxg->itxg_lock);
 		return;
 	}
 	ASSERT3U(itxg->itxg_txg, <=, synced_txg);
 	ASSERT3U(itxg->itxg_txg, !=, 0);
 	clean_me = itxg->itxg_itxs;
 	itxg->itxg_itxs = NULL;
 	itxg->itxg_txg = 0;
 	mutex_exit(&itxg->itxg_lock);
 	/*
 	 * Preferably start a task queue to free up the old itxs but
 	 * if taskq_dispatch can't allocate resources to do that then
 	 * free it in-line. This should be rare. Note, using TQ_SLEEP
 	 * created a bad performance problem.
 	 */
 	ASSERT3P(zilog->zl_dmu_pool, !=, NULL);
 	ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL);
 	taskqid_t id = taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq,
 	    zil_itxg_clean, clean_me, TQ_NOSLEEP);
 	if (id == TASKQID_INVALID)
 		zil_itxg_clean(clean_me);
 }
 
 /*
  * This function will traverse the queue of itxs that need to be
  * committed, and move them onto the ZIL's zl_itx_commit_list.
  */
 static uint64_t
 zil_get_commit_list(zilog_t *zilog)
 {
 	uint64_t otxg, txg, wtxg = 0;
 	list_t *commit_list = &zilog->zl_itx_commit_list;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
 		otxg = ZILTEST_TXG;
 	else
 		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
 
 	/*
 	 * This is inherently racy, since there is nothing to prevent
 	 * the last synced txg from changing. That's okay since we'll
 	 * only commit things in the future.
 	 */
 	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
 		mutex_enter(&itxg->itxg_lock);
 		if (itxg->itxg_txg != txg) {
 			mutex_exit(&itxg->itxg_lock);
 			continue;
 		}
 
 		/*
 		 * If we're adding itx records to the zl_itx_commit_list,
 		 * then the zil better be dirty in this "txg". We can assert
 		 * that here since we're holding the itxg_lock which will
 		 * prevent spa_sync from cleaning it. Once we add the itxs
 		 * to the zl_itx_commit_list we must commit it to disk even
 		 * if it's unnecessary (i.e. the txg was synced).
 		 */
 		ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
 		    spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
 		list_t *sync_list = &itxg->itxg_itxs->i_sync_list;
 		itx_t *itx = NULL;
 		if (unlikely(zilog->zl_suspend > 0)) {
 			/*
 			 * ZIL was just suspended, but we lost the race.
 			 * Allow all earlier itxs to be committed, but ask
 			 * caller to do txg_wait_synced(txg) for any new.
 			 */
 			if (!list_is_empty(sync_list))
 				wtxg = MAX(wtxg, txg);
 		} else {
 			itx = list_head(sync_list);
 			list_move_tail(commit_list, sync_list);
 		}
 
 		mutex_exit(&itxg->itxg_lock);
 
 		while (itx != NULL) {
 			uint64_t s = zil_itx_full_size(itx);
 			zilog->zl_cur_size += s;
 			zilog->zl_cur_left += s;
 			s = zil_itx_record_size(itx);
 			zilog->zl_cur_max = MAX(zilog->zl_cur_max, s);
 			itx = list_next(commit_list, itx);
 		}
 	}
 	return (wtxg);
 }
 
 /*
  * Move the async itxs for a specified object to commit into sync lists.
  */
 void
 zil_async_to_sync(zilog_t *zilog, uint64_t foid)
 {
 	uint64_t otxg, txg;
 	itx_async_node_t *ian, ian_search;
 	avl_tree_t *t;
 	avl_index_t where;
 
 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
 		otxg = ZILTEST_TXG;
 	else
 		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
 
 	/*
 	 * This is inherently racy, since there is nothing to prevent
 	 * the last synced txg from changing.
 	 */
 	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
 		mutex_enter(&itxg->itxg_lock);
 		if (itxg->itxg_txg != txg) {
 			mutex_exit(&itxg->itxg_lock);
 			continue;
 		}
 
 		/*
 		 * If a foid is specified then find that node and append its
 		 * list. Otherwise walk the tree appending all the lists
 		 * to the sync list. We add to the end rather than the
 		 * beginning to ensure the create has happened.
 		 */
 		t = &itxg->itxg_itxs->i_async_tree;
 		if (foid != 0) {
 			ian_search.ia_foid = foid;
 			ian = avl_find(t, &ian_search, &where);
 			if (ian != NULL) {
 				list_move_tail(&itxg->itxg_itxs->i_sync_list,
 				    &ian->ia_list);
 			}
 		} else {
 			void *cookie = NULL;
 
 			while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
 				list_move_tail(&itxg->itxg_itxs->i_sync_list,
 				    &ian->ia_list);
 				list_destroy(&ian->ia_list);
 				kmem_free(ian, sizeof (itx_async_node_t));
 			}
 		}
 		mutex_exit(&itxg->itxg_lock);
 	}
 }
 
 /*
  * This function will prune commit itxs that are at the head of the
  * commit list (it won't prune past the first non-commit itx), and
  * either: a) attach them to the last lwb that's still pending
  * completion, or b) skip them altogether.
  *
  * This is used as a performance optimization to prevent commit itxs
  * from generating new lwbs when it's unnecessary to do so.
  */
 static void
 zil_prune_commit_list(zilog_t *zilog)
 {
 	itx_t *itx;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
 	while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
 		lr_t *lrc = &itx->itx_lr;
 		if (lrc->lrc_txtype != TX_COMMIT)
 			break;
 
 		mutex_enter(&zilog->zl_lock);
 
 		lwb_t *last_lwb = zilog->zl_last_lwb_opened;
 		if (last_lwb == NULL ||
 		    last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) {
 			/*
 			 * All of the itxs this waiter was waiting on
 			 * must have already completed (or there were
 			 * never any itx's for it to wait on), so it's
 			 * safe to skip this waiter and mark it done.
 			 */
 			zil_commit_waiter_skip(itx->itx_private);
 		} else {
 			zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
 		}
 
 		mutex_exit(&zilog->zl_lock);
 
 		list_remove(&zilog->zl_itx_commit_list, itx);
 		zil_itx_destroy(itx);
 	}
 
 	IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
 }
 
 static void
 zil_commit_writer_stall(zilog_t *zilog)
 {
 	/*
 	 * When zio_alloc_zil() fails to allocate the next lwb block on
 	 * disk, we must call txg_wait_synced() to ensure all of the
 	 * lwbs in the zilog's zl_lwb_list are synced and then freed (in
 	 * zil_sync()), such that any subsequent ZIL writer (i.e. a call
 	 * to zil_process_commit_list()) will have to call zil_create(),
 	 * and start a new ZIL chain.
 	 *
 	 * Since zil_alloc_zil() failed, the lwb that was previously
 	 * issued does not have a pointer to the "next" lwb on disk.
 	 * Thus, if another ZIL writer thread was to allocate the "next"
 	 * on-disk lwb, that block could be leaked in the event of a
 	 * crash (because the previous lwb on-disk would not point to
 	 * it).
 	 *
 	 * We must hold the zilog's zl_issuer_lock while we do this, to
 	 * ensure no new threads enter zil_process_commit_list() until
 	 * all lwb's in the zl_lwb_list have been synced and freed
 	 * (which is achieved via the txg_wait_synced() call).
 	 */
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 }
 
 static void
 zil_burst_done(zilog_t *zilog)
 {
 	if (!list_is_empty(&zilog->zl_itx_commit_list) ||
 	    zilog->zl_cur_size == 0)
 		return;
 
 	if (zilog->zl_parallel)
 		zilog->zl_parallel--;
 
 	uint_t r = (zilog->zl_prev_rotor + 1) & (ZIL_BURSTS - 1);
 	zilog->zl_prev_rotor = r;
 	zilog->zl_prev_opt[r] = zil_lwb_plan(zilog, zilog->zl_cur_size,
 	    &zilog->zl_prev_min[r]);
 
 	zilog->zl_cur_size = 0;
 	zilog->zl_cur_max = 0;
 	zilog->zl_cur_left = 0;
 }
 
 /*
  * This function will traverse the commit list, creating new lwbs as
  * needed, and committing the itxs from the commit list to these newly
  * created lwbs. Additionally, as a new lwb is created, the previous
  * lwb will be issued to the zio layer to be written to disk.
  */
 static void
 zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
 {
 	spa_t *spa = zilog->zl_spa;
 	list_t nolwb_itxs;
 	list_t nolwb_waiters;
 	lwb_t *lwb, *plwb;
 	itx_t *itx;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
 	/*
 	 * Return if there's nothing to commit before we dirty the fs by
 	 * calling zil_create().
 	 */
 	if (list_is_empty(&zilog->zl_itx_commit_list))
 		return;
 
 	list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
 	list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
 	    offsetof(zil_commit_waiter_t, zcw_node));
 
 	lwb = list_tail(&zilog->zl_lwb_list);
 	if (lwb == NULL) {
 		lwb = zil_create(zilog);
 	} else {
 		/*
 		 * Activate SPA_FEATURE_ZILSAXATTR for the cases where ZIL will
 		 * have already been created (zl_lwb_list not empty).
 		 */
 		zil_commit_activate_saxattr_feature(zilog);
 		ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
 		    lwb->lwb_state == LWB_STATE_OPENED);
 
 		/*
 		 * If the lwb is still opened, it means the workload is really
 		 * multi-threaded and we won the chance of write aggregation.
 		 * If it is not opened yet, but previous lwb is still not
 		 * flushed, it still means the workload is multi-threaded, but
 		 * there was too much time between the commits to aggregate, so
 		 * we try aggregation next times, but without too much hopes.
 		 */
 		if (lwb->lwb_state == LWB_STATE_OPENED) {
 			zilog->zl_parallel = ZIL_BURSTS;
 		} else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb))
 		    != NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) {
 			zilog->zl_parallel = MAX(zilog->zl_parallel,
 			    ZIL_BURSTS / 2);
 		}
 	}
 
 	while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) {
 		lr_t *lrc = &itx->itx_lr;
 		uint64_t txg = lrc->lrc_txg;
 
 		ASSERT3U(txg, !=, 0);
 
 		if (lrc->lrc_txtype == TX_COMMIT) {
 			DTRACE_PROBE2(zil__process__commit__itx,
 			    zilog_t *, zilog, itx_t *, itx);
 		} else {
 			DTRACE_PROBE2(zil__process__normal__itx,
 			    zilog_t *, zilog, itx_t *, itx);
 		}
 
 		boolean_t synced = txg <= spa_last_synced_txg(spa);
 		boolean_t frozen = txg > spa_freeze_txg(spa);
 
 		/*
 		 * If the txg of this itx has already been synced out, then
 		 * we don't need to commit this itx to an lwb. This is
 		 * because the data of this itx will have already been
 		 * written to the main pool. This is inherently racy, and
 		 * it's still ok to commit an itx whose txg has already
 		 * been synced; this will result in a write that's
 		 * unnecessary, but will do no harm.
 		 *
 		 * With that said, we always want to commit TX_COMMIT itxs
 		 * to an lwb, regardless of whether or not that itx's txg
 		 * has been synced out. We do this to ensure any OPENED lwb
 		 * will always have at least one zil_commit_waiter_t linked
 		 * to the lwb.
 		 *
 		 * As a counter-example, if we skipped TX_COMMIT itx's
 		 * whose txg had already been synced, the following
 		 * situation could occur if we happened to be racing with
 		 * spa_sync:
 		 *
 		 * 1. We commit a non-TX_COMMIT itx to an lwb, where the
 		 *    itx's txg is 10 and the last synced txg is 9.
 		 * 2. spa_sync finishes syncing out txg 10.
 		 * 3. We move to the next itx in the list, it's a TX_COMMIT
 		 *    whose txg is 10, so we skip it rather than committing
 		 *    it to the lwb used in (1).
 		 *
 		 * If the itx that is skipped in (3) is the last TX_COMMIT
 		 * itx in the commit list, than it's possible for the lwb
 		 * used in (1) to remain in the OPENED state indefinitely.
 		 *
 		 * To prevent the above scenario from occurring, ensuring
 		 * that once an lwb is OPENED it will transition to ISSUED
 		 * and eventually DONE, we always commit TX_COMMIT itx's to
 		 * an lwb here, even if that itx's txg has already been
 		 * synced.
 		 *
 		 * Finally, if the pool is frozen, we _always_ commit the
 		 * itx.  The point of freezing the pool is to prevent data
 		 * from being written to the main pool via spa_sync, and
 		 * instead rely solely on the ZIL to persistently store the
 		 * data; i.e.  when the pool is frozen, the last synced txg
 		 * value can't be trusted.
 		 */
 		if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
 			if (lwb != NULL) {
 				lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs);
 				if (lwb == NULL) {
 					list_insert_tail(&nolwb_itxs, itx);
 				} else if ((zcw->zcw_lwb != NULL &&
 				    zcw->zcw_lwb != lwb) || zcw->zcw_done) {
 					/*
 					 * Our lwb is done, leave the rest of
 					 * itx list to somebody else who care.
 					 */
 					zilog->zl_parallel = ZIL_BURSTS;
 					zilog->zl_cur_left -=
 					    zil_itx_full_size(itx);
 					break;
 				}
 			} else {
 				if (lrc->lrc_txtype == TX_COMMIT) {
 					zil_commit_waiter_link_nolwb(
 					    itx->itx_private, &nolwb_waiters);
 				}
 				list_insert_tail(&nolwb_itxs, itx);
 			}
 			zilog->zl_cur_left -= zil_itx_full_size(itx);
 		} else {
 			ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT);
 			zilog->zl_cur_left -= zil_itx_full_size(itx);
 			zil_itx_destroy(itx);
 		}
 	}
 
 	if (lwb == NULL) {
 		/*
 		 * This indicates zio_alloc_zil() failed to allocate the
 		 * "next" lwb on-disk. When this happens, we must stall
 		 * the ZIL write pipeline; see the comment within
 		 * zil_commit_writer_stall() for more details.
 		 */
 		while ((lwb = list_remove_head(ilwbs)) != NULL)
 			zil_lwb_write_issue(zilog, lwb);
 		zil_commit_writer_stall(zilog);
 
 		/*
 		 * Additionally, we have to signal and mark the "nolwb"
 		 * waiters as "done" here, since without an lwb, we
 		 * can't do this via zil_lwb_flush_vdevs_done() like
 		 * normal.
 		 */
 		zil_commit_waiter_t *zcw;
 		while ((zcw = list_remove_head(&nolwb_waiters)) != NULL)
 			zil_commit_waiter_skip(zcw);
 
 		/*
 		 * And finally, we have to destroy the itx's that
 		 * couldn't be committed to an lwb; this will also call
 		 * the itx's callback if one exists for the itx.
 		 */
 		while ((itx = list_remove_head(&nolwb_itxs)) != NULL)
 			zil_itx_destroy(itx);
 	} else {
 		ASSERT(list_is_empty(&nolwb_waiters));
 		ASSERT3P(lwb, !=, NULL);
 		ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
 		    lwb->lwb_state == LWB_STATE_OPENED);
 
 		/*
 		 * At this point, the ZIL block pointed at by the "lwb"
 		 * variable is in "new" or "opened" state.
 		 *
 		 * If it's "new", then no itxs have been committed to it, so
 		 * there's no point in issuing its zio (i.e. it's "empty").
 		 *
 		 * If it's "opened", then it contains one or more itxs that
 		 * eventually need to be committed to stable storage. In
 		 * this case we intentionally do not issue the lwb's zio
 		 * to disk yet, and instead rely on one of the following
 		 * two mechanisms for issuing the zio:
 		 *
 		 * 1. Ideally, there will be more ZIL activity occurring on
 		 * the system, such that this function will be immediately
 		 * called again by different thread and this lwb will be
 		 * closed by zil_lwb_assign().  This way, the lwb will be
 		 * "full" when it is issued to disk, and we'll make use of
 		 * the lwb's size the best we can.
 		 *
 		 * 2. If there isn't sufficient ZIL activity occurring on
 		 * the system, zil_commit_waiter() will close it and issue
 		 * the zio.  If this occurs, the lwb is not guaranteed
 		 * to be "full" by the time its zio is issued, and means
 		 * the size of the lwb was "too large" given the amount
 		 * of ZIL activity occurring on the system at that time.
 		 *
 		 * We do this for a couple of reasons:
 		 *
 		 * 1. To try and reduce the number of IOPs needed to
 		 * write the same number of itxs. If an lwb has space
 		 * available in its buffer for more itxs, and more itxs
 		 * will be committed relatively soon (relative to the
 		 * latency of performing a write), then it's beneficial
 		 * to wait for these "next" itxs. This way, more itxs
 		 * can be committed to stable storage with fewer writes.
 		 *
 		 * 2. To try and use the largest lwb block size that the
 		 * incoming rate of itxs can support. Again, this is to
 		 * try and pack as many itxs into as few lwbs as
 		 * possible, without significantly impacting the latency
 		 * of each individual itx.
 		 */
 		if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) {
 			zil_burst_done(zilog);
 			list_insert_tail(ilwbs, lwb);
 			lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
 			if (lwb == NULL) {
 				while ((lwb = list_remove_head(ilwbs)) != NULL)
 					zil_lwb_write_issue(zilog, lwb);
 				zil_commit_writer_stall(zilog);
 			}
 		}
 	}
 }
 
 /*
  * This function is responsible for ensuring the passed in commit waiter
  * (and associated commit itx) is committed to an lwb. If the waiter is
  * not already committed to an lwb, all itxs in the zilog's queue of
  * itxs will be processed. The assumption is the passed in waiter's
  * commit itx will found in the queue just like the other non-commit
  * itxs, such that when the entire queue is processed, the waiter will
  * have been committed to an lwb.
  *
  * The lwb associated with the passed in waiter is not guaranteed to
  * have been issued by the time this function completes. If the lwb is
  * not issued, we rely on future calls to zil_commit_writer() to issue
  * the lwb, or the timeout mechanism found in zil_commit_waiter().
  */
 static uint64_t
 zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
 {
 	list_t ilwbs;
 	lwb_t *lwb;
 	uint64_t wtxg = 0;
 
 	ASSERT(!MUTEX_HELD(&zilog->zl_lock));
 	ASSERT(spa_writeable(zilog->zl_spa));
 
 	list_create(&ilwbs, sizeof (lwb_t), offsetof(lwb_t, lwb_issue_node));
 	mutex_enter(&zilog->zl_issuer_lock);
 
 	if (zcw->zcw_lwb != NULL || zcw->zcw_done) {
 		/*
 		 * It's possible that, while we were waiting to acquire
 		 * the "zl_issuer_lock", another thread committed this
 		 * waiter to an lwb. If that occurs, we bail out early,
 		 * without processing any of the zilog's queue of itxs.
 		 *
 		 * On certain workloads and system configurations, the
 		 * "zl_issuer_lock" can become highly contended. In an
 		 * attempt to reduce this contention, we immediately drop
 		 * the lock if the waiter has already been processed.
 		 *
 		 * We've measured this optimization to reduce CPU spent
 		 * contending on this lock by up to 5%, using a system
 		 * with 32 CPUs, low latency storage (~50 usec writes),
 		 * and 1024 threads performing sync writes.
 		 */
 		goto out;
 	}
 
 	ZIL_STAT_BUMP(zilog, zil_commit_writer_count);
 
 	wtxg = zil_get_commit_list(zilog);
 	zil_prune_commit_list(zilog);
 	zil_process_commit_list(zilog, zcw, &ilwbs);
 
 out:
 	mutex_exit(&zilog->zl_issuer_lock);
 	while ((lwb = list_remove_head(&ilwbs)) != NULL)
 		zil_lwb_write_issue(zilog, lwb);
 	list_destroy(&ilwbs);
 	return (wtxg);
 }
 
 static void
 zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
 {
 	ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT(MUTEX_HELD(&zcw->zcw_lock));
 	ASSERT3B(zcw->zcw_done, ==, B_FALSE);
 
 	lwb_t *lwb = zcw->zcw_lwb;
 	ASSERT3P(lwb, !=, NULL);
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW);
 
 	/*
 	 * If the lwb has already been issued by another thread, we can
 	 * immediately return since there's no work to be done (the
 	 * point of this function is to issue the lwb). Additionally, we
 	 * do this prior to acquiring the zl_issuer_lock, to avoid
 	 * acquiring it when it's not necessary to do so.
 	 */
 	if (lwb->lwb_state != LWB_STATE_OPENED)
 		return;
 
 	/*
 	 * In order to call zil_lwb_write_close() we must hold the
 	 * zilog's "zl_issuer_lock". We can't simply acquire that lock,
 	 * since we're already holding the commit waiter's "zcw_lock",
 	 * and those two locks are acquired in the opposite order
 	 * elsewhere.
 	 */
 	mutex_exit(&zcw->zcw_lock);
 	mutex_enter(&zilog->zl_issuer_lock);
 	mutex_enter(&zcw->zcw_lock);
 
 	/*
 	 * Since we just dropped and re-acquired the commit waiter's
 	 * lock, we have to re-check to see if the waiter was marked
 	 * "done" during that process. If the waiter was marked "done",
 	 * the "lwb" pointer is no longer valid (it can be free'd after
 	 * the waiter is marked "done"), so without this check we could
 	 * wind up with a use-after-free error below.
 	 */
 	if (zcw->zcw_done) {
 		mutex_exit(&zilog->zl_issuer_lock);
 		return;
 	}
 
 	ASSERT3P(lwb, ==, zcw->zcw_lwb);
 
 	/*
 	 * We've already checked this above, but since we hadn't acquired
 	 * the zilog's zl_issuer_lock, we have to perform this check a
 	 * second time while holding the lock.
 	 *
 	 * We don't need to hold the zl_lock since the lwb cannot transition
 	 * from OPENED to CLOSED while we hold the zl_issuer_lock. The lwb
 	 * _can_ transition from CLOSED to DONE, but it's OK to race with
 	 * that transition since we treat the lwb the same, whether it's in
 	 * the CLOSED, ISSUED or DONE states.
 	 *
 	 * The important thing, is we treat the lwb differently depending on
 	 * if it's OPENED or CLOSED, and block any other threads that might
 	 * attempt to close/issue this lwb. For that reason we hold the
 	 * zl_issuer_lock when checking the lwb_state; we must not call
 	 * zil_lwb_write_close() if the lwb had already been closed/issued.
 	 *
 	 * See the comment above the lwb_state_t structure definition for
 	 * more details on the lwb states, and locking requirements.
 	 */
 	if (lwb->lwb_state != LWB_STATE_OPENED) {
 		mutex_exit(&zilog->zl_issuer_lock);
 		return;
 	}
 
 	/*
 	 * We do not need zcw_lock once we hold zl_issuer_lock and know lwb
 	 * is still open.  But we have to drop it to avoid a deadlock in case
 	 * callback of zio issued by zil_lwb_write_issue() try to get it,
 	 * while zil_lwb_write_issue() is blocked on attempt to issue next
 	 * lwb it found in LWB_STATE_READY state.
 	 */
 	mutex_exit(&zcw->zcw_lock);
 
 	/*
 	 * As described in the comments above zil_commit_waiter() and
 	 * zil_process_commit_list(), we need to issue this lwb's zio
 	 * since we've reached the commit waiter's timeout and it still
 	 * hasn't been issued.
 	 */
 	zil_burst_done(zilog);
 	lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
 
 	if (nlwb == NULL) {
 		/*
 		 * When zil_lwb_write_close() returns NULL, this
 		 * indicates zio_alloc_zil() failed to allocate the
 		 * "next" lwb on-disk. When this occurs, the ZIL write
 		 * pipeline must be stalled; see the comment within the
 		 * zil_commit_writer_stall() function for more details.
 		 */
 		zil_lwb_write_issue(zilog, lwb);
 		zil_commit_writer_stall(zilog);
 		mutex_exit(&zilog->zl_issuer_lock);
 	} else {
 		mutex_exit(&zilog->zl_issuer_lock);
 		zil_lwb_write_issue(zilog, lwb);
 	}
 	mutex_enter(&zcw->zcw_lock);
 }
 
 /*
  * This function is responsible for performing the following two tasks:
  *
  * 1. its primary responsibility is to block until the given "commit
  *    waiter" is considered "done".
  *
  * 2. its secondary responsibility is to issue the zio for the lwb that
  *    the given "commit waiter" is waiting on, if this function has
  *    waited "long enough" and the lwb is still in the "open" state.
  *
  * Given a sufficient amount of itxs being generated and written using
  * the ZIL, the lwb's zio will be issued via the zil_lwb_assign()
  * function. If this does not occur, this secondary responsibility will
  * ensure the lwb is issued even if there is not other synchronous
  * activity on the system.
  *
  * For more details, see zil_process_commit_list(); more specifically,
  * the comment at the bottom of that function.
  */
 static void
 zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
 {
 	ASSERT(!MUTEX_HELD(&zilog->zl_lock));
 	ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT(spa_writeable(zilog->zl_spa));
 
 	mutex_enter(&zcw->zcw_lock);
 
 	/*
 	 * The timeout is scaled based on the lwb latency to avoid
 	 * significantly impacting the latency of each individual itx.
 	 * For more details, see the comment at the bottom of the
 	 * zil_process_commit_list() function.
 	 */
 	int pct = MAX(zfs_commit_timeout_pct, 1);
 	hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100;
 	hrtime_t wakeup = gethrtime() + sleep;
 	boolean_t timedout = B_FALSE;
 
 	while (!zcw->zcw_done) {
 		ASSERT(MUTEX_HELD(&zcw->zcw_lock));
 
 		lwb_t *lwb = zcw->zcw_lwb;
 
 		/*
 		 * Usually, the waiter will have a non-NULL lwb field here,
 		 * but it's possible for it to be NULL as a result of
 		 * zil_commit() racing with spa_sync().
 		 *
 		 * When zil_clean() is called, it's possible for the itxg
 		 * list (which may be cleaned via a taskq) to contain
 		 * commit itxs. When this occurs, the commit waiters linked
 		 * off of these commit itxs will not be committed to an
 		 * lwb.  Additionally, these commit waiters will not be
 		 * marked done until zil_commit_waiter_skip() is called via
 		 * zil_itxg_clean().
 		 *
 		 * Thus, it's possible for this commit waiter (i.e. the
 		 * "zcw" variable) to be found in this "in between" state;
 		 * where it's "zcw_lwb" field is NULL, and it hasn't yet
 		 * been skipped, so it's "zcw_done" field is still B_FALSE.
 		 */
 		IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_NEW);
 
 		if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) {
 			ASSERT3B(timedout, ==, B_FALSE);
 
 			/*
 			 * If the lwb hasn't been issued yet, then we
 			 * need to wait with a timeout, in case this
 			 * function needs to issue the lwb after the
 			 * timeout is reached; responsibility (2) from
 			 * the comment above this function.
 			 */
 			int rc = cv_timedwait_hires(&zcw->zcw_cv,
 			    &zcw->zcw_lock, wakeup, USEC2NSEC(1),
 			    CALLOUT_FLAG_ABSOLUTE);
 
 			if (rc != -1 || zcw->zcw_done)
 				continue;
 
 			timedout = B_TRUE;
 			zil_commit_waiter_timeout(zilog, zcw);
 
 			if (!zcw->zcw_done) {
 				/*
 				 * If the commit waiter has already been
 				 * marked "done", it's possible for the
 				 * waiter's lwb structure to have already
 				 * been freed.  Thus, we can only reliably
 				 * make these assertions if the waiter
 				 * isn't done.
 				 */
 				ASSERT3P(lwb, ==, zcw->zcw_lwb);
 				ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
 			}
 		} else {
 			/*
 			 * If the lwb isn't open, then it must have already
 			 * been issued. In that case, there's no need to
 			 * use a timeout when waiting for the lwb to
 			 * complete.
 			 *
 			 * Additionally, if the lwb is NULL, the waiter
 			 * will soon be signaled and marked done via
 			 * zil_clean() and zil_itxg_clean(), so no timeout
 			 * is required.
 			 */
 
 			IMPLY(lwb != NULL,
 			    lwb->lwb_state == LWB_STATE_CLOSED ||
 			    lwb->lwb_state == LWB_STATE_READY ||
 			    lwb->lwb_state == LWB_STATE_ISSUED ||
 			    lwb->lwb_state == LWB_STATE_WRITE_DONE ||
 			    lwb->lwb_state == LWB_STATE_FLUSH_DONE);
 			cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
 		}
 	}
 
 	mutex_exit(&zcw->zcw_lock);
 }
 
 static zil_commit_waiter_t *
 zil_alloc_commit_waiter(void)
 {
 	zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP);
 
 	cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_link_init(&zcw->zcw_node);
 	zcw->zcw_lwb = NULL;
 	zcw->zcw_done = B_FALSE;
 	zcw->zcw_zio_error = 0;
 
 	return (zcw);
 }
 
 static void
 zil_free_commit_waiter(zil_commit_waiter_t *zcw)
 {
 	ASSERT(!list_link_active(&zcw->zcw_node));
 	ASSERT3P(zcw->zcw_lwb, ==, NULL);
 	ASSERT3B(zcw->zcw_done, ==, B_TRUE);
 	mutex_destroy(&zcw->zcw_lock);
 	cv_destroy(&zcw->zcw_cv);
 	kmem_cache_free(zil_zcw_cache, zcw);
 }
 
 /*
  * This function is used to create a TX_COMMIT itx and assign it. This
  * way, it will be linked into the ZIL's list of synchronous itxs, and
  * then later committed to an lwb (or skipped) when
  * zil_process_commit_list() is called.
  */
 static void
 zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
 {
 	dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
 
 	/*
 	 * Since we are not going to create any new dirty data, and we
 	 * can even help with clearing the existing dirty data, we
 	 * should not be subject to the dirty data based delays. We
 	 * use TXG_NOTHROTTLE to bypass the delay mechanism.
 	 */
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
 
 	itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t));
 	itx->itx_sync = B_TRUE;
 	itx->itx_private = zcw;
 
 	zil_itx_assign(zilog, itx, tx);
 
 	dmu_tx_commit(tx);
 }
 
 /*
  * Commit ZFS Intent Log transactions (itxs) to stable storage.
  *
  * When writing ZIL transactions to the on-disk representation of the
  * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple
  * itxs can be committed to a single lwb. Once a lwb is written and
  * committed to stable storage (i.e. the lwb is written, and vdevs have
  * been flushed), each itx that was committed to that lwb is also
  * considered to be committed to stable storage.
  *
  * When an itx is committed to an lwb, the log record (lr_t) contained
  * by the itx is copied into the lwb's zio buffer, and once this buffer
  * is written to disk, it becomes an on-disk ZIL block.
  *
  * As itxs are generated, they're inserted into the ZIL's queue of
  * uncommitted itxs. The semantics of zil_commit() are such that it will
  * block until all itxs that were in the queue when it was called, are
  * committed to stable storage.
  *
  * If "foid" is zero, this means all "synchronous" and "asynchronous"
  * itxs, for all objects in the dataset, will be committed to stable
  * storage prior to zil_commit() returning. If "foid" is non-zero, all
  * "synchronous" itxs for all objects, but only "asynchronous" itxs
  * that correspond to the foid passed in, will be committed to stable
  * storage prior to zil_commit() returning.
  *
  * Generally speaking, when zil_commit() is called, the consumer doesn't
  * actually care about _all_ of the uncommitted itxs. Instead, they're
  * simply trying to waiting for a specific itx to be committed to disk,
  * but the interface(s) for interacting with the ZIL don't allow such
  * fine-grained communication. A better interface would allow a consumer
  * to create and assign an itx, and then pass a reference to this itx to
  * zil_commit(); such that zil_commit() would return as soon as that
  * specific itx was committed to disk (instead of waiting for _all_
  * itxs to be committed).
  *
  * When a thread calls zil_commit() a special "commit itx" will be
  * generated, along with a corresponding "waiter" for this commit itx.
  * zil_commit() will wait on this waiter's CV, such that when the waiter
  * is marked done, and signaled, zil_commit() will return.
  *
  * This commit itx is inserted into the queue of uncommitted itxs. This
  * provides an easy mechanism for determining which itxs were in the
  * queue prior to zil_commit() having been called, and which itxs were
  * added after zil_commit() was called.
  *
  * The commit itx is special; it doesn't have any on-disk representation.
  * When a commit itx is "committed" to an lwb, the waiter associated
  * with it is linked onto the lwb's list of waiters. Then, when that lwb
  * completes, each waiter on the lwb's list is marked done and signaled
  * -- allowing the thread waiting on the waiter to return from zil_commit().
  *
  * It's important to point out a few critical factors that allow us
  * to make use of the commit itxs, commit waiters, per-lwb lists of
  * commit waiters, and zio completion callbacks like we're doing:
  *
  *   1. The list of waiters for each lwb is traversed, and each commit
  *      waiter is marked "done" and signaled, in the zio completion
  *      callback of the lwb's zio[*].
  *
  *      * Actually, the waiters are signaled in the zio completion
  *        callback of the root zio for the DKIOCFLUSHWRITECACHE commands
  *        that are sent to the vdevs upon completion of the lwb zio.
  *
  *   2. When the itxs are inserted into the ZIL's queue of uncommitted
  *      itxs, the order in which they are inserted is preserved[*]; as
  *      itxs are added to the queue, they are added to the tail of
  *      in-memory linked lists.
  *
  *      When committing the itxs to lwbs (to be written to disk), they
  *      are committed in the same order in which the itxs were added to
  *      the uncommitted queue's linked list(s); i.e. the linked list of
  *      itxs to commit is traversed from head to tail, and each itx is
  *      committed to an lwb in that order.
  *
  *      * To clarify:
  *
  *        - the order of "sync" itxs is preserved w.r.t. other
  *          "sync" itxs, regardless of the corresponding objects.
  *        - the order of "async" itxs is preserved w.r.t. other
  *          "async" itxs corresponding to the same object.
  *        - the order of "async" itxs is *not* preserved w.r.t. other
  *          "async" itxs corresponding to different objects.
  *        - the order of "sync" itxs w.r.t. "async" itxs (or vice
  *          versa) is *not* preserved, even for itxs that correspond
  *          to the same object.
  *
  *      For more details, see: zil_itx_assign(), zil_async_to_sync(),
  *      zil_get_commit_list(), and zil_process_commit_list().
  *
  *   3. The lwbs represent a linked list of blocks on disk. Thus, any
  *      lwb cannot be considered committed to stable storage, until its
  *      "previous" lwb is also committed to stable storage. This fact,
  *      coupled with the fact described above, means that itxs are
  *      committed in (roughly) the order in which they were generated.
  *      This is essential because itxs are dependent on prior itxs.
  *      Thus, we *must not* deem an itx as being committed to stable
  *      storage, until *all* prior itxs have also been committed to
  *      stable storage.
  *
  *      To enforce this ordering of lwb zio's, while still leveraging as
  *      much of the underlying storage performance as possible, we rely
  *      on two fundamental concepts:
  *
  *          1. The creation and issuance of lwb zio's is protected by
  *             the zilog's "zl_issuer_lock", which ensures only a single
  *             thread is creating and/or issuing lwb's at a time
  *          2. The "previous" lwb is a child of the "current" lwb
  *             (leveraging the zio parent-child dependency graph)
  *
  *      By relying on this parent-child zio relationship, we can have
  *      many lwb zio's concurrently issued to the underlying storage,
  *      but the order in which they complete will be the same order in
  *      which they were created.
  */
 void
 zil_commit(zilog_t *zilog, uint64_t foid)
 {
 	/*
 	 * We should never attempt to call zil_commit on a snapshot for
 	 * a couple of reasons:
 	 *
 	 * 1. A snapshot may never be modified, thus it cannot have any
 	 *    in-flight itxs that would have modified the dataset.
 	 *
 	 * 2. By design, when zil_commit() is called, a commit itx will
 	 *    be assigned to this zilog; as a result, the zilog will be
 	 *    dirtied. We must not dirty the zilog of a snapshot; there's
 	 *    checks in the code that enforce this invariant, and will
 	 *    cause a panic if it's not upheld.
 	 */
 	ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE);
 
 	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
 		return;
 
 	if (!spa_writeable(zilog->zl_spa)) {
 		/*
 		 * If the SPA is not writable, there should never be any
 		 * pending itxs waiting to be committed to disk. If that
 		 * weren't true, we'd skip writing those itxs out, and
 		 * would break the semantics of zil_commit(); thus, we're
 		 * verifying that truth before we return to the caller.
 		 */
 		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 		ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
 		for (int i = 0; i < TXG_SIZE; i++)
 			ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL);
 		return;
 	}
 
 	/*
 	 * If the ZIL is suspended, we don't want to dirty it by calling
 	 * zil_commit_itx_assign() below, nor can we write out
 	 * lwbs like would be done in zil_commit_write(). Thus, we
 	 * simply rely on txg_wait_synced() to maintain the necessary
 	 * semantics, and avoid calling those functions altogether.
 	 */
 	if (zilog->zl_suspend > 0) {
 		txg_wait_synced(zilog->zl_dmu_pool, 0);
 		return;
 	}
 
 	zil_commit_impl(zilog, foid);
 }
 
 void
 zil_commit_impl(zilog_t *zilog, uint64_t foid)
 {
 	ZIL_STAT_BUMP(zilog, zil_commit_count);
 
 	/*
 	 * Move the "async" itxs for the specified foid to the "sync"
 	 * queues, such that they will be later committed (or skipped)
 	 * to an lwb when zil_process_commit_list() is called.
 	 *
 	 * Since these "async" itxs must be committed prior to this
 	 * call to zil_commit returning, we must perform this operation
 	 * before we call zil_commit_itx_assign().
 	 */
 	zil_async_to_sync(zilog, foid);
 
 	/*
 	 * We allocate a new "waiter" structure which will initially be
 	 * linked to the commit itx using the itx's "itx_private" field.
 	 * Since the commit itx doesn't represent any on-disk state,
 	 * when it's committed to an lwb, rather than copying the its
 	 * lr_t into the lwb's buffer, the commit itx's "waiter" will be
 	 * added to the lwb's list of waiters. Then, when the lwb is
 	 * committed to stable storage, each waiter in the lwb's list of
 	 * waiters will be marked "done", and signalled.
 	 *
 	 * We must create the waiter and assign the commit itx prior to
 	 * calling zil_commit_writer(), or else our specific commit itx
 	 * is not guaranteed to be committed to an lwb prior to calling
 	 * zil_commit_waiter().
 	 */
 	zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();
 	zil_commit_itx_assign(zilog, zcw);
 
 	uint64_t wtxg = zil_commit_writer(zilog, zcw);
 	zil_commit_waiter(zilog, zcw);
 
 	if (zcw->zcw_zio_error != 0) {
 		/*
 		 * If there was an error writing out the ZIL blocks that
 		 * this thread is waiting on, then we fallback to
 		 * relying on spa_sync() to write out the data this
 		 * thread is waiting on. Obviously this has performance
 		 * implications, but the expectation is for this to be
 		 * an exceptional case, and shouldn't occur often.
 		 */
 		DTRACE_PROBE2(zil__commit__io__error,
 		    zilog_t *, zilog, zil_commit_waiter_t *, zcw);
 		txg_wait_synced(zilog->zl_dmu_pool, 0);
 	} else if (wtxg != 0) {
 		txg_wait_synced(zilog->zl_dmu_pool, wtxg);
 	}
 
 	zil_free_commit_waiter(zcw);
 }
 
 /*
  * Called in syncing context to free committed log blocks and update log header.
  */
 void
 zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 {
 	zil_header_t *zh = zil_header_in_syncing_context(zilog);
 	uint64_t txg = dmu_tx_get_txg(tx);
 	spa_t *spa = zilog->zl_spa;
 	uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
 	lwb_t *lwb;
 
 	/*
 	 * We don't zero out zl_destroy_txg, so make sure we don't try
 	 * to destroy it twice.
 	 */
 	if (spa_sync_pass(spa) != 1)
 		return;
 
 	zil_lwb_flush_wait_all(zilog, txg);
 
 	mutex_enter(&zilog->zl_lock);
 
 	ASSERT(zilog->zl_stop_sync == 0);
 
 	if (*replayed_seq != 0) {
 		ASSERT(zh->zh_replay_seq < *replayed_seq);
 		zh->zh_replay_seq = *replayed_seq;
 		*replayed_seq = 0;
 	}
 
 	if (zilog->zl_destroy_txg == txg) {
 		blkptr_t blk = zh->zh_log;
 		dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 
 		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 
 		memset(zh, 0, sizeof (zil_header_t));
 		memset(zilog->zl_replayed_seq, 0,
 		    sizeof (zilog->zl_replayed_seq));
 
 		if (zilog->zl_keep_first) {
 			/*
 			 * If this block was part of log chain that couldn't
 			 * be claimed because a device was missing during
 			 * zil_claim(), but that device later returns,
 			 * then this block could erroneously appear valid.
 			 * To guard against this, assign a new GUID to the new
 			 * log chain so it doesn't matter what blk points to.
 			 */
 			zil_init_log_chain(zilog, &blk);
 			zh->zh_log = blk;
 		} else {
 			/*
 			 * A destroyed ZIL chain can't contain any TX_SETSAXATTR
 			 * records. So, deactivate the feature for this dataset.
 			 * We activate it again when we start a new ZIL chain.
 			 */
 			if (dsl_dataset_feature_is_active(ds,
 			    SPA_FEATURE_ZILSAXATTR))
 				dsl_dataset_deactivate_feature(ds,
 				    SPA_FEATURE_ZILSAXATTR, tx);
 		}
 	}
 
 	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
 		zh->zh_log = lwb->lwb_blk;
 		if (lwb->lwb_state != LWB_STATE_FLUSH_DONE ||
 		    lwb->lwb_alloc_txg > txg || lwb->lwb_max_txg > txg)
 			break;
 		list_remove(&zilog->zl_lwb_list, lwb);
 		if (!BP_IS_HOLE(&lwb->lwb_blk))
 			zio_free(spa, txg, &lwb->lwb_blk);
 		zil_free_lwb(zilog, lwb);
 
 		/*
 		 * If we don't have anything left in the lwb list then
 		 * we've had an allocation failure and we need to zero
 		 * out the zil_header blkptr so that we don't end
 		 * up freeing the same block twice.
 		 */
 		if (list_is_empty(&zilog->zl_lwb_list))
 			BP_ZERO(&zh->zh_log);
 	}
 
 	mutex_exit(&zilog->zl_lock);
 }
 
 static int
 zil_lwb_cons(void *vbuf, void *unused, int kmflag)
 {
 	(void) unused, (void) kmflag;
 	lwb_t *lwb = vbuf;
 	list_create(&lwb->lwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
 	list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t),
 	    offsetof(zil_commit_waiter_t, zcw_node));
 	avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
 	    sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
 	mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
 	return (0);
 }
 
 static void
 zil_lwb_dest(void *vbuf, void *unused)
 {
 	(void) unused;
 	lwb_t *lwb = vbuf;
 	mutex_destroy(&lwb->lwb_vdev_lock);
 	avl_destroy(&lwb->lwb_vdev_tree);
 	list_destroy(&lwb->lwb_waiters);
 	list_destroy(&lwb->lwb_itxs);
 }
 
 void
 zil_init(void)
 {
 	zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
 	    sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0);
 
 	zil_zcw_cache = kmem_cache_create("zil_zcw_cache",
 	    sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 
 	zil_sums_init(&zil_sums_global);
 	zil_kstats_global = kstat_create("zfs", 0, "zil", "misc",
 	    KSTAT_TYPE_NAMED, sizeof (zil_stats) / sizeof (kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
 
 	if (zil_kstats_global != NULL) {
 		zil_kstats_global->ks_data = &zil_stats;
 		zil_kstats_global->ks_update = zil_kstats_global_update;
 		zil_kstats_global->ks_private = NULL;
 		kstat_install(zil_kstats_global);
 	}
 }
 
 void
 zil_fini(void)
 {
 	kmem_cache_destroy(zil_zcw_cache);
 	kmem_cache_destroy(zil_lwb_cache);
 
 	if (zil_kstats_global != NULL) {
 		kstat_delete(zil_kstats_global);
 		zil_kstats_global = NULL;
 	}
 
 	zil_sums_fini(&zil_sums_global);
 }
 
 void
 zil_set_sync(zilog_t *zilog, uint64_t sync)
 {
 	zilog->zl_sync = sync;
 }
 
 void
 zil_set_logbias(zilog_t *zilog, uint64_t logbias)
 {
 	zilog->zl_logbias = logbias;
 }
 
 zilog_t *
 zil_alloc(objset_t *os, zil_header_t *zh_phys)
 {
 	zilog_t *zilog;
 
 	zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
 
 	zilog->zl_header = zh_phys;
 	zilog->zl_os = os;
 	zilog->zl_spa = dmu_objset_spa(os);
 	zilog->zl_dmu_pool = dmu_objset_pool(os);
 	zilog->zl_destroy_txg = TXG_INITIAL - 1;
 	zilog->zl_logbias = dmu_objset_logbias(os);
 	zilog->zl_sync = dmu_objset_syncprop(os);
 	zilog->zl_dirty_max_txg = 0;
 	zilog->zl_last_lwb_opened = NULL;
 	zilog->zl_last_lwb_latency = 0;
 	zilog->zl_max_block_size = MIN(MAX(P2ALIGN_TYPED(zil_maxblocksize,
 	    ZIL_MIN_BLKSZ, uint64_t), ZIL_MIN_BLKSZ),
 	    spa_maxblocksize(dmu_objset_spa(os)));
 
 	mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&zilog->zl_lwb_io_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	for (int i = 0; i < TXG_SIZE; i++) {
 		mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL,
 		    MUTEX_DEFAULT, NULL);
 	}
 
 	list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
 	    offsetof(lwb_t, lwb_node));
 
 	list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
 	    offsetof(itx_t, itx_node));
 
 	cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
 	cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL);
 
 	for (int i = 0; i < ZIL_BURSTS; i++) {
 		zilog->zl_prev_opt[i] = zilog->zl_max_block_size -
 		    sizeof (zil_chain_t);
 	}
 
 	return (zilog);
 }
 
 void
 zil_free(zilog_t *zilog)
 {
 	int i;
 
 	zilog->zl_stop_sync = 1;
 
 	ASSERT0(zilog->zl_suspend);
 	ASSERT0(zilog->zl_suspending);
 
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 	list_destroy(&zilog->zl_lwb_list);
 
 	ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
 	list_destroy(&zilog->zl_itx_commit_list);
 
 	for (i = 0; i < TXG_SIZE; i++) {
 		/*
 		 * It's possible for an itx to be generated that doesn't dirty
 		 * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean()
 		 * callback to remove the entry. We remove those here.
 		 *
 		 * Also free up the ziltest itxs.
 		 */
 		if (zilog->zl_itxg[i].itxg_itxs)
 			zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs);
 		mutex_destroy(&zilog->zl_itxg[i].itxg_lock);
 	}
 
 	mutex_destroy(&zilog->zl_issuer_lock);
 	mutex_destroy(&zilog->zl_lock);
 	mutex_destroy(&zilog->zl_lwb_io_lock);
 
 	cv_destroy(&zilog->zl_cv_suspend);
 	cv_destroy(&zilog->zl_lwb_io_cv);
 
 	kmem_free(zilog, sizeof (zilog_t));
 }
 
 /*
  * Open an intent log.
  */
 zilog_t *
 zil_open(objset_t *os, zil_get_data_t *get_data, zil_sums_t *zil_sums)
 {
 	zilog_t *zilog = dmu_objset_zil(os);
 
 	ASSERT3P(zilog->zl_get_data, ==, NULL);
 	ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 
 	zilog->zl_get_data = get_data;
 	zilog->zl_sums = zil_sums;
 
 	return (zilog);
 }
 
 /*
  * Close an intent log.
  */
 void
 zil_close(zilog_t *zilog)
 {
 	lwb_t *lwb;
 	uint64_t txg;
 
 	if (!dmu_objset_is_snapshot(zilog->zl_os)) {
 		zil_commit(zilog, 0);
 	} else {
 		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 		ASSERT0(zilog->zl_dirty_max_txg);
 		ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE);
 	}
 
 	mutex_enter(&zilog->zl_lock);
 	txg = zilog->zl_dirty_max_txg;
 	lwb = list_tail(&zilog->zl_lwb_list);
 	if (lwb != NULL) {
 		txg = MAX(txg, lwb->lwb_alloc_txg);
 		txg = MAX(txg, lwb->lwb_max_txg);
 	}
 	mutex_exit(&zilog->zl_lock);
 
 	/*
 	 * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends
 	 * on the time when the dmu_tx transaction is assigned in
 	 * zil_lwb_write_issue().
 	 */
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	txg = MAX(zilog->zl_lwb_max_issued_txg, txg);
 	mutex_exit(&zilog->zl_lwb_io_lock);
 
 	/*
 	 * We need to use txg_wait_synced() to wait until that txg is synced.
 	 * zil_sync() will guarantee all lwbs up to that txg have been
 	 * written out, flushed, and cleaned.
 	 */
 	if (txg != 0)
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
 
 	if (zilog_is_dirty(zilog))
 		zfs_dbgmsg("zil (%px) is dirty, txg %llu", zilog,
 		    (u_longlong_t)txg);
 	if (txg < spa_freeze_txg(zilog->zl_spa))
 		VERIFY(!zilog_is_dirty(zilog));
 
 	zilog->zl_get_data = NULL;
 
 	/*
 	 * We should have only one lwb left on the list; remove it now.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	lwb = list_remove_head(&zilog->zl_lwb_list);
 	if (lwb != NULL) {
 		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 		ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW);
 		zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 		zil_free_lwb(zilog, lwb);
 	}
 	mutex_exit(&zilog->zl_lock);
 }
 
 static const char *suspend_tag = "zil suspending";
 
 /*
  * Suspend an intent log.  While in suspended mode, we still honor
  * synchronous semantics, but we rely on txg_wait_synced() to do it.
  * On old version pools, we suspend the log briefly when taking a
  * snapshot so that it will have an empty intent log.
  *
  * Long holds are not really intended to be used the way we do here --
  * held for such a short time.  A concurrent caller of dsl_dataset_long_held()
  * could fail.  Therefore we take pains to only put a long hold if it is
  * actually necessary.  Fortunately, it will only be necessary if the
  * objset is currently mounted (or the ZVOL equivalent).  In that case it
  * will already have a long hold, so we are not really making things any worse.
  *
  * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or
  * zvol_state_t), and use their mechanism to prevent their hold from being
  * dropped (e.g. VFS_HOLD()).  However, that would be even more pain for
  * very little gain.
  *
  * if cookiep == NULL, this does both the suspend & resume.
  * Otherwise, it returns with the dataset "long held", and the cookie
  * should be passed into zil_resume().
  */
 int
 zil_suspend(const char *osname, void **cookiep)
 {
 	objset_t *os;
 	zilog_t *zilog;
 	const zil_header_t *zh;
 	int error;
 
 	error = dmu_objset_hold(osname, suspend_tag, &os);
 	if (error != 0)
 		return (error);
 	zilog = dmu_objset_zil(os);
 
 	mutex_enter(&zilog->zl_lock);
 	zh = zilog->zl_header;
 
 	if (zh->zh_flags & ZIL_REPLAY_NEEDED) {		/* unplayed log */
 		mutex_exit(&zilog->zl_lock);
 		dmu_objset_rele(os, suspend_tag);
 		return (SET_ERROR(EBUSY));
 	}
 
 	/*
 	 * Don't put a long hold in the cases where we can avoid it.  This
 	 * is when there is no cookie so we are doing a suspend & resume
 	 * (i.e. called from zil_vdev_offline()), and there's nothing to do
 	 * for the suspend because it's already suspended, or there's no ZIL.
 	 */
 	if (cookiep == NULL && !zilog->zl_suspending &&
 	    (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
 		mutex_exit(&zilog->zl_lock);
 		dmu_objset_rele(os, suspend_tag);
 		return (0);
 	}
 
 	dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag);
 	dsl_pool_rele(dmu_objset_pool(os), suspend_tag);
 
 	zilog->zl_suspend++;
 
 	if (zilog->zl_suspend > 1) {
 		/*
 		 * Someone else is already suspending it.
 		 * Just wait for them to finish.
 		 */
 
 		while (zilog->zl_suspending)
 			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
 		mutex_exit(&zilog->zl_lock);
 
 		if (cookiep == NULL)
 			zil_resume(os);
 		else
 			*cookiep = os;
 		return (0);
 	}
 
 	/*
 	 * If there is no pointer to an on-disk block, this ZIL must not
 	 * be active (e.g. filesystem not mounted), so there's nothing
 	 * to clean up.
 	 */
 	if (BP_IS_HOLE(&zh->zh_log)) {
 		ASSERT(cookiep != NULL); /* fast path already handled */
 
 		*cookiep = os;
 		mutex_exit(&zilog->zl_lock);
 		return (0);
 	}
 
 	/*
 	 * The ZIL has work to do. Ensure that the associated encryption
 	 * key will remain mapped while we are committing the log by
 	 * grabbing a reference to it. If the key isn't loaded we have no
 	 * choice but to return an error until the wrapping key is loaded.
 	 */
 	if (os->os_encrypted &&
 	    dsl_dataset_create_key_mapping(dmu_objset_ds(os)) != 0) {
 		zilog->zl_suspend--;
 		mutex_exit(&zilog->zl_lock);
 		dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
 		dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
 		return (SET_ERROR(EACCES));
 	}
 
 	zilog->zl_suspending = B_TRUE;
 	mutex_exit(&zilog->zl_lock);
 
 	/*
 	 * We need to use zil_commit_impl to ensure we wait for all
 	 * LWB_STATE_OPENED, _CLOSED and _READY lwbs to be committed
 	 * to disk before proceeding. If we used zil_commit instead, it
 	 * would just call txg_wait_synced(), because zl_suspend is set.
 	 * txg_wait_synced() doesn't wait for these lwb's to be
 	 * LWB_STATE_FLUSH_DONE before returning.
 	 */
 	zil_commit_impl(zilog, 0);
 
 	/*
 	 * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we
 	 * use txg_wait_synced() to ensure the data from the zilog has
 	 * migrated to the main pool before calling zil_destroy().
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 
 	zil_destroy(zilog, B_FALSE);
 
 	mutex_enter(&zilog->zl_lock);
 	zilog->zl_suspending = B_FALSE;
 	cv_broadcast(&zilog->zl_cv_suspend);
 	mutex_exit(&zilog->zl_lock);
 
 	if (os->os_encrypted)
 		dsl_dataset_remove_key_mapping(dmu_objset_ds(os));
 
 	if (cookiep == NULL)
 		zil_resume(os);
 	else
 		*cookiep = os;
 	return (0);
 }
 
 void
 zil_resume(void *cookie)
 {
 	objset_t *os = cookie;
 	zilog_t *zilog = dmu_objset_zil(os);
 
 	mutex_enter(&zilog->zl_lock);
 	ASSERT(zilog->zl_suspend != 0);
 	zilog->zl_suspend--;
 	mutex_exit(&zilog->zl_lock);
 	dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
 	dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
 }
 
 typedef struct zil_replay_arg {
 	zil_replay_func_t *const *zr_replay;
 	void		*zr_arg;
 	boolean_t	zr_byteswap;
 	char		*zr_lr;
 } zil_replay_arg_t;
 
 static int
 zil_replay_error(zilog_t *zilog, const lr_t *lr, int error)
 {
 	char name[ZFS_MAX_DATASET_NAME_LEN];
 
 	zilog->zl_replaying_seq--;	/* didn't actually replay this one */
 
 	dmu_objset_name(zilog->zl_os, name);
 
 	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
 	    "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
 	    (u_longlong_t)lr->lrc_seq,
 	    (u_longlong_t)(lr->lrc_txtype & ~TX_CI),
 	    (lr->lrc_txtype & TX_CI) ? "CI" : "");
 
 	return (error);
 }
 
 static int
 zil_replay_log_record(zilog_t *zilog, const lr_t *lr, void *zra,
     uint64_t claim_txg)
 {
 	zil_replay_arg_t *zr = zra;
 	const zil_header_t *zh = zilog->zl_header;
 	uint64_t reclen = lr->lrc_reclen;
 	uint64_t txtype = lr->lrc_txtype;
 	int error = 0;
 
 	zilog->zl_replaying_seq = lr->lrc_seq;
 
 	if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
 		return (0);
 
 	if (lr->lrc_txg < claim_txg)		/* already committed */
 		return (0);
 
 	/* Strip case-insensitive bit, still present in log record */
 	txtype &= ~TX_CI;
 
 	if (txtype == 0 || txtype >= TX_MAX_TYPE)
 		return (zil_replay_error(zilog, lr, EINVAL));
 
 	/*
 	 * If this record type can be logged out of order, the object
 	 * (lr_foid) may no longer exist.  That's legitimate, not an error.
 	 */
 	if (TX_OOO(txtype)) {
 		error = dmu_object_info(zilog->zl_os,
 		    LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL);
 		if (error == ENOENT || error == EEXIST)
 			return (0);
 	}
 
 	/*
 	 * Make a copy of the data so we can revise and extend it.
 	 */
 	memcpy(zr->zr_lr, lr, reclen);
 
 	/*
 	 * If this is a TX_WRITE with a blkptr, suck in the data.
 	 */
 	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
 		error = zil_read_log_data(zilog, (lr_write_t *)lr,
 		    zr->zr_lr + reclen);
 		if (error != 0)
 			return (zil_replay_error(zilog, lr, error));
 	}
 
 	/*
 	 * The log block containing this lr may have been byteswapped
 	 * so that we can easily examine common fields like lrc_txtype.
 	 * However, the log is a mix of different record types, and only the
 	 * replay vectors know how to byteswap their records.  Therefore, if
 	 * the lr was byteswapped, undo it before invoking the replay vector.
 	 */
 	if (zr->zr_byteswap)
 		byteswap_uint64_array(zr->zr_lr, reclen);
 
 	/*
 	 * We must now do two things atomically: replay this log record,
 	 * and update the log header sequence number to reflect the fact that
 	 * we did so. At the end of each replay function the sequence number
 	 * is updated if we are in replay mode.
 	 */
 	error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
 	if (error != 0) {
 		/*
 		 * The DMU's dnode layer doesn't see removes until the txg
 		 * commits, so a subsequent claim can spuriously fail with
 		 * EEXIST. So if we receive any error we try syncing out
 		 * any removes then retry the transaction.  Note that we
 		 * specify B_FALSE for byteswap now, so we don't do it twice.
 		 */
 		txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
 		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
 		if (error != 0)
 			return (zil_replay_error(zilog, lr, error));
 	}
 	return (0);
 }
 
 static int
 zil_incr_blks(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
 	(void) bp, (void) arg, (void) claim_txg;
 
 	zilog->zl_replay_blks++;
 
 	return (0);
 }
 
 /*
  * If this dataset has a non-empty intent log, replay it and destroy it.
  * Return B_TRUE if there were any entries to replay.
  */
 boolean_t
 zil_replay(objset_t *os, void *arg,
     zil_replay_func_t *const replay_func[TX_MAX_TYPE])
 {
 	zilog_t *zilog = dmu_objset_zil(os);
 	const zil_header_t *zh = zilog->zl_header;
 	zil_replay_arg_t zr;
 
 	if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
 		return (zil_destroy(zilog, B_TRUE));
 	}
 
 	zr.zr_replay = replay_func;
 	zr.zr_arg = arg;
 	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
 	zr.zr_lr = vmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
 
 	/*
 	 * Wait for in-progress removes to sync before starting replay.
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 
 	zilog->zl_replay = B_TRUE;
 	zilog->zl_replay_time = ddi_get_lbolt();
 	ASSERT(zilog->zl_replay_blks == 0);
 	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
 	    zh->zh_claim_txg, B_TRUE);
 	vmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
 
 	zil_destroy(zilog, B_FALSE);
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 	zilog->zl_replay = B_FALSE;
 
 	return (B_TRUE);
 }
 
 boolean_t
 zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
 {
 	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
 		return (B_TRUE);
 
 	if (zilog->zl_replay) {
 		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 		zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
 		    zilog->zl_replaying_seq;
 		return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 int
 zil_reset(const char *osname, void *arg)
 {
 	(void) arg;
 
 	int error = zil_suspend(osname, NULL);
 	/* EACCES means crypto key not loaded */
 	if ((error == EACCES) || (error == EBUSY))
 		return (SET_ERROR(error));
 	if (error != 0)
 		return (SET_ERROR(EEXIST));
 	return (0);
 }
 
 EXPORT_SYMBOL(zil_alloc);
 EXPORT_SYMBOL(zil_free);
 EXPORT_SYMBOL(zil_open);
 EXPORT_SYMBOL(zil_close);
 EXPORT_SYMBOL(zil_replay);
 EXPORT_SYMBOL(zil_replaying);
 EXPORT_SYMBOL(zil_destroy);
 EXPORT_SYMBOL(zil_destroy_sync);
 EXPORT_SYMBOL(zil_itx_create);
 EXPORT_SYMBOL(zil_itx_destroy);
 EXPORT_SYMBOL(zil_itx_assign);
 EXPORT_SYMBOL(zil_commit);
 EXPORT_SYMBOL(zil_claim);
 EXPORT_SYMBOL(zil_check_log_chain);
 EXPORT_SYMBOL(zil_sync);
 EXPORT_SYMBOL(zil_clean);
 EXPORT_SYMBOL(zil_suspend);
 EXPORT_SYMBOL(zil_resume);
 EXPORT_SYMBOL(zil_lwb_add_block);
 EXPORT_SYMBOL(zil_bp_tree_add);
 EXPORT_SYMBOL(zil_set_sync);
 EXPORT_SYMBOL(zil_set_logbias);
 EXPORT_SYMBOL(zil_sums_init);
 EXPORT_SYMBOL(zil_sums_fini);
 EXPORT_SYMBOL(zil_kstat_values_update);
 
 ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW,
 	"ZIL block open timeout percentage");
 
 ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW,
 	"Disable intent logging replay");
 
 ZFS_MODULE_PARAM(zfs_zil, zil_, nocacheflush, INT, ZMOD_RW,
 	"Disable ZIL cache flushes");
 
 ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, U64, ZMOD_RW,
 	"Limit in bytes slog sync writes per commit");
 
 ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW,
 	"Limit in bytes of ZIL log block size");
 
 ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW,
 	"Limit in bytes WR_COPIED size");
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 213fe5c483f2..e96bbda35a04 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1,5253 +1,5256 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2022 by Delphix. All rights reserved.
  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2019, Klara Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2021, Datto, Inc.
  */
 
 #include <sys/sysmacros.h>
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_trim.h>
 #include <sys/zio_impl.h>
 #include <sys/zio_compress.h>
 #include <sys/zio_checksum.h>
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
 #include <sys/brt.h>
 #include <sys/ddt.h>
 #include <sys/blkptr.h>
 #include <sys/zfeature.h>
 #include <sys/dsl_scan.h>
 #include <sys/metaslab_impl.h>
 #include <sys/time.h>
 #include <sys/trace_zfs.h>
 #include <sys/abd.h>
 #include <sys/dsl_crypt.h>
 #include <cityhash.h>
 
 /*
  * ==========================================================================
  * I/O type descriptions
  * ==========================================================================
  */
 const char *const zio_type_name[ZIO_TYPES] = {
 	/*
 	 * Note: Linux kernel thread name length is limited
 	 * so these names will differ from upstream open zfs.
 	 */
 	"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim"
 };
 
 int zio_dva_throttle_enabled = B_TRUE;
 static int zio_deadman_log_all = B_FALSE;
 
 /*
  * ==========================================================================
  * I/O kmem caches
  * ==========================================================================
  */
 static kmem_cache_t *zio_cache;
 static kmem_cache_t *zio_link_cache;
 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 static uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 static uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 #endif
 
 /* Mark IOs as "slow" if they take longer than 30 seconds */
 static uint_t zio_slow_io_ms = (30 * MILLISEC);
 
 #define	BP_SPANB(indblkshift, level) \
 	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
 #define	COMPARE_META_LEVEL	0x80000000ul
 /*
  * The following actions directly effect the spa's sync-to-convergence logic.
  * The values below define the sync pass when we start performing the action.
  * Care should be taken when changing these values as they directly impact
  * spa_sync() performance. Tuning these values may introduce subtle performance
  * pathologies and should only be done in the context of performance analysis.
  * These tunables will eventually be removed and replaced with #defines once
  * enough analysis has been done to determine optimal values.
  *
  * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
  * regular blocks are not deferred.
  *
  * Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable
  * compression (including of metadata).  In practice, we don't have this
  * many sync passes, so this has no effect.
  *
  * The original intent was that disabling compression would help the sync
  * passes to converge. However, in practice disabling compression increases
  * the average number of sync passes, because when we turn compression off, a
  * lot of block's size will change and thus we have to re-allocate (not
  * overwrite) them. It also increases the number of 128KB allocations (e.g.
  * for indirect blocks and spacemaps) because these will not be compressed.
  * The 128K allocations are especially detrimental to performance on highly
  * fragmented systems, which may have very few free segments of this size,
  * and may need to load new metaslabs to satisfy 128K allocations.
  */
 
 /* defer frees starting in this pass */
 uint_t zfs_sync_pass_deferred_free = 2;
 
 /* don't compress starting in this pass */
 static uint_t zfs_sync_pass_dont_compress = 8;
 
 /* rewrite new bps starting in this pass */
 static uint_t zfs_sync_pass_rewrite = 2;
 
 /*
  * An allocating zio is one that either currently has the DVA allocate
  * stage set or will have it later in its lifetime.
  */
 #define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
 
 /*
  * Enable smaller cores by excluding metadata
  * allocations as well.
  */
 int zio_exclude_metadata = 0;
 static int zio_requeue_io_start_cut_in_line = 1;
 
 #ifdef ZFS_DEBUG
 static const int zio_buf_debug_limit = 16384;
 #else
 static const int zio_buf_debug_limit = 0;
 #endif
 
 static inline void __zio_execute(zio_t *zio);
 
 static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
 
 void
 zio_init(void)
 {
 	size_t c;
 
 	zio_cache = kmem_cache_create("zio_cache",
 	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 	zio_link_cache = kmem_cache_create("zio_link_cache",
 	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 
 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
 		size_t align, cflags, data_cflags;
 		char name[32];
 
 		/*
 		 * Create cache for each half-power of 2 size, starting from
 		 * SPA_MINBLOCKSIZE.  It should give us memory space efficiency
 		 * of ~7/8, sufficient for transient allocations mostly using
 		 * these caches.
 		 */
 		size_t p2 = size;
 		while (!ISP2(p2))
 			p2 &= p2 - 1;
 		if (!IS_P2ALIGNED(size, p2 / 2))
 			continue;
 
 #ifndef _KERNEL
 		/*
 		 * If we are using watchpoints, put each buffer on its own page,
 		 * to eliminate the performance overhead of trapping to the
 		 * kernel when modifying a non-watched buffer that shares the
 		 * page with a watched buffer.
 		 */
 		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
 			continue;
 #endif
 
 		if (IS_P2ALIGNED(size, PAGESIZE))
 			align = PAGESIZE;
 		else
 			align = 1 << (highbit64(size ^ (size - 1)) - 1);
 
 		cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ?
 		    KMC_NODEBUG : 0;
 		data_cflags = KMC_NODEBUG;
 		if (cflags == data_cflags) {
 			/*
 			 * Resulting kmem caches would be identical.
 			 * Save memory by creating only one.
 			 */
 			(void) snprintf(name, sizeof (name),
 			    "zio_buf_comb_%lu", (ulong_t)size);
 			zio_buf_cache[c] = kmem_cache_create(name, size, align,
 			    NULL, NULL, NULL, NULL, NULL, cflags);
 			zio_data_buf_cache[c] = zio_buf_cache[c];
 			continue;
 		}
 		(void) snprintf(name, sizeof (name), "zio_buf_%lu",
 		    (ulong_t)size);
 		zio_buf_cache[c] = kmem_cache_create(name, size, align,
 		    NULL, NULL, NULL, NULL, NULL, cflags);
 
 		(void) snprintf(name, sizeof (name), "zio_data_buf_%lu",
 		    (ulong_t)size);
 		zio_data_buf_cache[c] = kmem_cache_create(name, size, align,
 		    NULL, NULL, NULL, NULL, NULL, data_cflags);
 	}
 
 	while (--c != 0) {
 		ASSERT(zio_buf_cache[c] != NULL);
 		if (zio_buf_cache[c - 1] == NULL)
 			zio_buf_cache[c - 1] = zio_buf_cache[c];
 
 		ASSERT(zio_data_buf_cache[c] != NULL);
 		if (zio_data_buf_cache[c - 1] == NULL)
 			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 	}
 
 	zio_inject_init();
 
 	lz4_init();
 }
 
 void
 zio_fini(void)
 {
 	size_t n = SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT;
 
 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 	for (size_t i = 0; i < n; i++) {
 		if (zio_buf_cache_allocs[i] != zio_buf_cache_frees[i])
 			(void) printf("zio_fini: [%d] %llu != %llu\n",
 			    (int)((i + 1) << SPA_MINBLOCKSHIFT),
 			    (long long unsigned)zio_buf_cache_allocs[i],
 			    (long long unsigned)zio_buf_cache_frees[i]);
 	}
 #endif
 
 	/*
 	 * The same kmem cache can show up multiple times in both zio_buf_cache
 	 * and zio_data_buf_cache. Do a wasteful but trivially correct scan to
 	 * sort it out.
 	 */
 	for (size_t i = 0; i < n; i++) {
 		kmem_cache_t *cache = zio_buf_cache[i];
 		if (cache == NULL)
 			continue;
 		for (size_t j = i; j < n; j++) {
 			if (cache == zio_buf_cache[j])
 				zio_buf_cache[j] = NULL;
 			if (cache == zio_data_buf_cache[j])
 				zio_data_buf_cache[j] = NULL;
 		}
 		kmem_cache_destroy(cache);
 	}
 
 	for (size_t i = 0; i < n; i++) {
 		kmem_cache_t *cache = zio_data_buf_cache[i];
 		if (cache == NULL)
 			continue;
 		for (size_t j = i; j < n; j++) {
 			if (cache == zio_data_buf_cache[j])
 				zio_data_buf_cache[j] = NULL;
 		}
 		kmem_cache_destroy(cache);
 	}
 
 	for (size_t i = 0; i < n; i++) {
 		VERIFY3P(zio_buf_cache[i], ==, NULL);
 		VERIFY3P(zio_data_buf_cache[i], ==, NULL);
 	}
 
 	kmem_cache_destroy(zio_link_cache);
 	kmem_cache_destroy(zio_cache);
 
 	zio_inject_fini();
 
 	lz4_fini();
 }
 
 /*
  * ==========================================================================
  * Allocate and free I/O buffers
  * ==========================================================================
  */
 
 #ifdef ZFS_DEBUG
 static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b;
 #endif
 
 /*
  * Use empty space after the buffer to detect overflows.
  *
  * Since zio_init() creates kmem caches only for certain set of buffer sizes,
  * allocations of different sizes may have some unused space after the data.
  * Filling part of that space with a known pattern on allocation and checking
  * it on free should allow us to detect some buffer overflows.
  */
 static void
 zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
 {
 #ifdef ZFS_DEBUG
 	size_t off = P2ROUNDUP(size, sizeof (ulong_t));
 	ulong_t *canary = p + off / sizeof (ulong_t);
 	size_t asize = (c + 1) << SPA_MINBLOCKSHIFT;
 	if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT &&
 	    cache[c] == cache[c + 1])
 		asize = (c + 2) << SPA_MINBLOCKSHIFT;
 	for (; off < asize; canary++, off += sizeof (ulong_t))
 		*canary = zio_buf_canary;
 #endif
 }
 
 static void
 zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
 {
 #ifdef ZFS_DEBUG
 	size_t off = P2ROUNDUP(size, sizeof (ulong_t));
 	ulong_t *canary = p + off / sizeof (ulong_t);
 	size_t asize = (c + 1) << SPA_MINBLOCKSHIFT;
 	if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT &&
 	    cache[c] == cache[c + 1])
 		asize = (c + 2) << SPA_MINBLOCKSHIFT;
 	for (; off < asize; canary++, off += sizeof (ulong_t)) {
 		if (unlikely(*canary != zio_buf_canary)) {
 			PANIC("ZIO buffer overflow %p (%zu) + %zu %#lx != %#lx",
 			    p, size, (canary - p) * sizeof (ulong_t),
 			    *canary, zio_buf_canary);
 		}
 	}
 #endif
 }
 
 /*
  * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
  * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
  * useful to inspect ZFS metadata, but if possible, we should avoid keeping
  * excess / transient data in-core during a crashdump.
  */
 void *
 zio_buf_alloc(size_t size)
 {
 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 	atomic_add_64(&zio_buf_cache_allocs[c], 1);
 #endif
 
 	void *p = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE);
 	zio_buf_put_canary(p, size, zio_buf_cache, c);
 	return (p);
 }
 
 /*
  * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
  * crashdump if the kernel panics.  This exists so that we will limit the amount
  * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
  * of kernel heap dumped to disk when the kernel panics)
  */
 void *
 zio_data_buf_alloc(size_t size)
 {
 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
 	void *p = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE);
 	zio_buf_put_canary(p, size, zio_data_buf_cache, c);
 	return (p);
 }
 
 void
 zio_buf_free(void *buf, size_t size)
 {
 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 	atomic_add_64(&zio_buf_cache_frees[c], 1);
 #endif
 
 	zio_buf_check_canary(buf, size, zio_buf_cache, c);
 	kmem_cache_free(zio_buf_cache[c], buf);
 }
 
 void
 zio_data_buf_free(void *buf, size_t size)
 {
 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
 	zio_buf_check_canary(buf, size, zio_data_buf_cache, c);
 	kmem_cache_free(zio_data_buf_cache[c], buf);
 }
 
 static void
 zio_abd_free(void *abd, size_t size)
 {
 	(void) size;
 	abd_free((abd_t *)abd);
 }
 
 /*
  * ==========================================================================
  * Push and pop I/O transform buffers
  * ==========================================================================
  */
 void
 zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
     zio_transform_func_t *transform)
 {
 	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
 
 	zt->zt_orig_abd = zio->io_abd;
 	zt->zt_orig_size = zio->io_size;
 	zt->zt_bufsize = bufsize;
 	zt->zt_transform = transform;
 
 	zt->zt_next = zio->io_transform_stack;
 	zio->io_transform_stack = zt;
 
 	zio->io_abd = data;
 	zio->io_size = size;
 }
 
 void
 zio_pop_transforms(zio_t *zio)
 {
 	zio_transform_t *zt;
 
 	while ((zt = zio->io_transform_stack) != NULL) {
 		if (zt->zt_transform != NULL)
 			zt->zt_transform(zio,
 			    zt->zt_orig_abd, zt->zt_orig_size);
 
 		if (zt->zt_bufsize != 0)
 			abd_free(zio->io_abd);
 
 		zio->io_abd = zt->zt_orig_abd;
 		zio->io_size = zt->zt_orig_size;
 		zio->io_transform_stack = zt->zt_next;
 
 		kmem_free(zt, sizeof (zio_transform_t));
 	}
 }
 
 /*
  * ==========================================================================
  * I/O transform callbacks for subblocks, decompression, and decryption
  * ==========================================================================
  */
 static void
 zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
 {
 	ASSERT(zio->io_size > size);
 
 	if (zio->io_type == ZIO_TYPE_READ)
 		abd_copy(data, zio->io_abd, size);
 }
 
 static void
 zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
 {
 	if (zio->io_error == 0) {
 		void *tmp = abd_borrow_buf(data, size);
 		int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
 		    zio->io_abd, tmp, zio->io_size, size,
 		    &zio->io_prop.zp_complevel);
 		abd_return_buf_copy(data, tmp, size);
 
 		if (zio_injection_enabled && ret == 0)
 			ret = zio_handle_fault_injection(zio, EINVAL);
 
 		if (ret != 0)
 			zio->io_error = SET_ERROR(EIO);
 	}
 }
 
 static void
 zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
 {
 	int ret;
 	void *tmp;
 	blkptr_t *bp = zio->io_bp;
 	spa_t *spa = zio->io_spa;
 	uint64_t dsobj = zio->io_bookmark.zb_objset;
 	uint64_t lsize = BP_GET_LSIZE(bp);
 	dmu_object_type_t ot = BP_GET_TYPE(bp);
 	uint8_t salt[ZIO_DATA_SALT_LEN];
 	uint8_t iv[ZIO_DATA_IV_LEN];
 	uint8_t mac[ZIO_DATA_MAC_LEN];
 	boolean_t no_crypt = B_FALSE;
 
 	ASSERT(BP_USES_CRYPT(bp));
 	ASSERT3U(size, !=, 0);
 
 	if (zio->io_error != 0)
 		return;
 
 	/*
 	 * Verify the cksum of MACs stored in an indirect bp. It will always
 	 * be possible to verify this since it does not require an encryption
 	 * key.
 	 */
 	if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {
 		zio_crypt_decode_mac_bp(bp, mac);
 
 		if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
 			/*
 			 * We haven't decompressed the data yet, but
 			 * zio_crypt_do_indirect_mac_checksum() requires
 			 * decompressed data to be able to parse out the MACs
 			 * from the indirect block. We decompress it now and
 			 * throw away the result after we are finished.
 			 */
 			tmp = zio_buf_alloc(lsize);
 			ret = zio_decompress_data(BP_GET_COMPRESS(bp),
 			    zio->io_abd, tmp, zio->io_size, lsize,
 			    &zio->io_prop.zp_complevel);
 			if (ret != 0) {
 				ret = SET_ERROR(EIO);
 				goto error;
 			}
 			ret = zio_crypt_do_indirect_mac_checksum(B_FALSE,
 			    tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac);
 			zio_buf_free(tmp, lsize);
 		} else {
 			ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
 			    zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac);
 		}
 		abd_copy(data, zio->io_abd, size);
 
 		if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) {
 			ret = zio_handle_decrypt_injection(spa,
 			    &zio->io_bookmark, ot, ECKSUM);
 		}
 		if (ret != 0)
 			goto error;
 
 		return;
 	}
 
 	/*
 	 * If this is an authenticated block, just check the MAC. It would be
 	 * nice to separate this out into its own flag, but when this was done,
 	 * we had run out of bits in what is now zio_flag_t. Future cleanup
 	 * could make this a flag bit.
 	 */
 	if (BP_IS_AUTHENTICATED(bp)) {
 		if (ot == DMU_OT_OBJSET) {
 			ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa,
 			    dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp));
 		} else {
 			zio_crypt_decode_mac_bp(bp, mac);
 			ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj,
 			    zio->io_abd, size, mac);
 			if (zio_injection_enabled && ret == 0) {
 				ret = zio_handle_decrypt_injection(spa,
 				    &zio->io_bookmark, ot, ECKSUM);
 			}
 		}
 		abd_copy(data, zio->io_abd, size);
 
 		if (ret != 0)
 			goto error;
 
 		return;
 	}
 
 	zio_crypt_decode_params_bp(bp, salt, iv);
 
 	if (ot == DMU_OT_INTENT_LOG) {
 		tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t));
 		zio_crypt_decode_mac_zil(tmp, mac);
 		abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t));
 	} else {
 		zio_crypt_decode_mac_bp(bp, mac);
 	}
 
 	ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp),
 	    BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data,
 	    zio->io_abd, &no_crypt);
 	if (no_crypt)
 		abd_copy(data, zio->io_abd, size);
 
 	if (ret != 0)
 		goto error;
 
 	return;
 
 error:
 	/* assert that the key was found unless this was speculative */
 	ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE));
 
 	/*
 	 * If there was a decryption / authentication error return EIO as
 	 * the io_error. If this was not a speculative zio, create an ereport.
 	 */
 	if (ret == ECKSUM) {
 		zio->io_error = SET_ERROR(EIO);
 		if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
 			spa_log_error(spa, &zio->io_bookmark,
-			    &zio->io_bp->blk_birth);
+			    BP_GET_LOGICAL_BIRTH(zio->io_bp));
 			(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
 			    spa, NULL, &zio->io_bookmark, zio, 0);
 		}
 	} else {
 		zio->io_error = ret;
 	}
 }
 
 /*
  * ==========================================================================
  * I/O parent/child relationships and pipeline interlocks
  * ==========================================================================
  */
 zio_t *
 zio_walk_parents(zio_t *cio, zio_link_t **zl)
 {
 	list_t *pl = &cio->io_parent_list;
 
 	*zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
 	if (*zl == NULL)
 		return (NULL);
 
 	ASSERT((*zl)->zl_child == cio);
 	return ((*zl)->zl_parent);
 }
 
 zio_t *
 zio_walk_children(zio_t *pio, zio_link_t **zl)
 {
 	list_t *cl = &pio->io_child_list;
 
 	ASSERT(MUTEX_HELD(&pio->io_lock));
 
 	*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
 	if (*zl == NULL)
 		return (NULL);
 
 	ASSERT((*zl)->zl_parent == pio);
 	return ((*zl)->zl_child);
 }
 
 zio_t *
 zio_unique_parent(zio_t *cio)
 {
 	zio_link_t *zl = NULL;
 	zio_t *pio = zio_walk_parents(cio, &zl);
 
 	VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
 	return (pio);
 }
 
 void
 zio_add_child(zio_t *pio, zio_t *cio)
 {
 	/*
 	 * Logical I/Os can have logical, gang, or vdev children.
 	 * Gang I/Os can have gang or vdev children.
 	 * Vdev I/Os can only have vdev children.
 	 * The following ASSERT captures all of these constraints.
 	 */
 	ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
 
 	/* Parent should not have READY stage if child doesn't have it. */
 	IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 &&
 	    (cio->io_child_type != ZIO_CHILD_VDEV),
 	    (pio->io_pipeline & ZIO_STAGE_READY) == 0);
 
 	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
 	zl->zl_parent = pio;
 	zl->zl_child = cio;
 
 	mutex_enter(&pio->io_lock);
 	mutex_enter(&cio->io_lock);
 
 	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 
 	uint64_t *countp = pio->io_children[cio->io_child_type];
 	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 		countp[w] += !cio->io_state[w];
 
 	list_insert_head(&pio->io_child_list, zl);
 	list_insert_head(&cio->io_parent_list, zl);
 
 	mutex_exit(&cio->io_lock);
 	mutex_exit(&pio->io_lock);
 }
 
 void
 zio_add_child_first(zio_t *pio, zio_t *cio)
 {
 	/*
 	 * Logical I/Os can have logical, gang, or vdev children.
 	 * Gang I/Os can have gang or vdev children.
 	 * Vdev I/Os can only have vdev children.
 	 * The following ASSERT captures all of these constraints.
 	 */
 	ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
 
 	/* Parent should not have READY stage if child doesn't have it. */
 	IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 &&
 	    (cio->io_child_type != ZIO_CHILD_VDEV),
 	    (pio->io_pipeline & ZIO_STAGE_READY) == 0);
 
 	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
 	zl->zl_parent = pio;
 	zl->zl_child = cio;
 
 	ASSERT(list_is_empty(&cio->io_parent_list));
 	list_insert_head(&cio->io_parent_list, zl);
 
 	mutex_enter(&pio->io_lock);
 
 	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 
 	uint64_t *countp = pio->io_children[cio->io_child_type];
 	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 		countp[w] += !cio->io_state[w];
 
 	list_insert_head(&pio->io_child_list, zl);
 
 	mutex_exit(&pio->io_lock);
 }
 
 static void
 zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 {
 	ASSERT(zl->zl_parent == pio);
 	ASSERT(zl->zl_child == cio);
 
 	mutex_enter(&pio->io_lock);
 	mutex_enter(&cio->io_lock);
 
 	list_remove(&pio->io_child_list, zl);
 	list_remove(&cio->io_parent_list, zl);
 
 	mutex_exit(&cio->io_lock);
 	mutex_exit(&pio->io_lock);
 	kmem_cache_free(zio_link_cache, zl);
 }
 
 static boolean_t
 zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
 {
 	boolean_t waiting = B_FALSE;
 
 	mutex_enter(&zio->io_lock);
 	ASSERT(zio->io_stall == NULL);
 	for (int c = 0; c < ZIO_CHILD_TYPES; c++) {
 		if (!(ZIO_CHILD_BIT_IS_SET(childbits, c)))
 			continue;
 
 		uint64_t *countp = &zio->io_children[c][wait];
 		if (*countp != 0) {
 			zio->io_stage >>= 1;
 			ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
 			zio->io_stall = countp;
 			waiting = B_TRUE;
 			break;
 		}
 	}
 	mutex_exit(&zio->io_lock);
 	return (waiting);
 }
 
 __attribute__((always_inline))
 static inline void
 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
     zio_t **next_to_executep)
 {
 	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
 	int *errorp = &pio->io_child_error[zio->io_child_type];
 
 	mutex_enter(&pio->io_lock);
 	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 		*errorp = zio_worst_error(*errorp, zio->io_error);
 	pio->io_reexecute |= zio->io_reexecute;
 	ASSERT3U(*countp, >, 0);
 
 	(*countp)--;
 
 	if (*countp == 0 && pio->io_stall == countp) {
 		zio_taskq_type_t type =
 		    pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
 		    ZIO_TASKQ_INTERRUPT;
 		pio->io_stall = NULL;
 		mutex_exit(&pio->io_lock);
 
 		/*
 		 * If we can tell the caller to execute this parent next, do
 		 * so. We only do this if the parent's zio type matches the
 		 * child's type. Otherwise dispatch the parent zio in its
 		 * own taskq.
 		 *
 		 * Having the caller execute the parent when possible reduces
 		 * locking on the zio taskq's, reduces context switch
 		 * overhead, and has no recursion penalty.  Note that one
 		 * read from disk typically causes at least 3 zio's: a
 		 * zio_null(), the logical zio_read(), and then a physical
 		 * zio.  When the physical ZIO completes, we are able to call
 		 * zio_done() on all 3 of these zio's from one invocation of
 		 * zio_execute() by returning the parent back to
 		 * zio_execute().  Since the parent isn't executed until this
 		 * thread returns back to zio_execute(), the caller should do
 		 * so promptly.
 		 *
 		 * In other cases, dispatching the parent prevents
 		 * overflowing the stack when we have deeply nested
 		 * parent-child relationships, as we do with the "mega zio"
 		 * of writes for spa_sync(), and the chain of ZIL blocks.
 		 */
 		if (next_to_executep != NULL && *next_to_executep == NULL &&
 		    pio->io_type == zio->io_type) {
 			*next_to_executep = pio;
 		} else {
 			zio_taskq_dispatch(pio, type, B_FALSE);
 		}
 	} else {
 		mutex_exit(&pio->io_lock);
 	}
 }
 
 static void
 zio_inherit_child_errors(zio_t *zio, enum zio_child c)
 {
 	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
 		zio->io_error = zio->io_child_error[c];
 }
 
 int
 zio_bookmark_compare(const void *x1, const void *x2)
 {
 	const zio_t *z1 = x1;
 	const zio_t *z2 = x2;
 
 	if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset)
 		return (-1);
 	if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset)
 		return (1);
 
 	if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object)
 		return (-1);
 	if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object)
 		return (1);
 
 	if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level)
 		return (-1);
 	if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level)
 		return (1);
 
 	if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid)
 		return (-1);
 	if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid)
 		return (1);
 
 	if (z1 < z2)
 		return (-1);
 	if (z1 > z2)
 		return (1);
 
 	return (0);
 }
 
 /*
  * ==========================================================================
  * Create the various types of I/O (read, write, free, etc)
  * ==========================================================================
  */
 static zio_t *
 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
     void *private, zio_type_t type, zio_priority_t priority,
     zio_flag_t flags, vdev_t *vd, uint64_t offset,
     const zbookmark_phys_t *zb, enum zio_stage stage,
     enum zio_stage pipeline)
 {
 	zio_t *zio;
 
 	IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE);
 	ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
 	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
 
 	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
 	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
 	ASSERT(vd || stage == ZIO_STAGE_OPEN);
 
 	IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0);
 
 	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
 	memset(zio, 0, sizeof (zio_t));
 
 	mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL);
 	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
 
 	list_create(&zio->io_parent_list, sizeof (zio_link_t),
 	    offsetof(zio_link_t, zl_parent_node));
 	list_create(&zio->io_child_list, sizeof (zio_link_t),
 	    offsetof(zio_link_t, zl_child_node));
 	metaslab_trace_init(&zio->io_alloc_list);
 
 	if (vd != NULL)
 		zio->io_child_type = ZIO_CHILD_VDEV;
 	else if (flags & ZIO_FLAG_GANG_CHILD)
 		zio->io_child_type = ZIO_CHILD_GANG;
 	else if (flags & ZIO_FLAG_DDT_CHILD)
 		zio->io_child_type = ZIO_CHILD_DDT;
 	else
 		zio->io_child_type = ZIO_CHILD_LOGICAL;
 
 	if (bp != NULL) {
 		if (type != ZIO_TYPE_WRITE ||
 		    zio->io_child_type == ZIO_CHILD_DDT) {
 			zio->io_bp_copy = *bp;
 			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
 		} else {
 			zio->io_bp = (blkptr_t *)bp;
 		}
 		zio->io_bp_orig = *bp;
 		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
 			zio->io_logical = zio;
 		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
 			pipeline |= ZIO_GANG_STAGES;
 	}
 
 	zio->io_spa = spa;
 	zio->io_txg = txg;
 	zio->io_done = done;
 	zio->io_private = private;
 	zio->io_type = type;
 	zio->io_priority = priority;
 	zio->io_vd = vd;
 	zio->io_offset = offset;
 	zio->io_orig_abd = zio->io_abd = data;
 	zio->io_orig_size = zio->io_size = psize;
 	zio->io_lsize = lsize;
 	zio->io_orig_flags = zio->io_flags = flags;
 	zio->io_orig_stage = zio->io_stage = stage;
 	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
 	zio->io_pipeline_trace = ZIO_STAGE_OPEN;
 	zio->io_allocator = ZIO_ALLOCATOR_NONE;
 
 	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY) ||
 	    (pipeline & ZIO_STAGE_READY) == 0;
 	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
 
 	if (zb != NULL)
 		zio->io_bookmark = *zb;
 
 	if (pio != NULL) {
 		zio->io_metaslab_class = pio->io_metaslab_class;
 		if (zio->io_logical == NULL)
 			zio->io_logical = pio->io_logical;
 		if (zio->io_child_type == ZIO_CHILD_GANG)
 			zio->io_gang_leader = pio->io_gang_leader;
 		zio_add_child_first(pio, zio);
 	}
 
 	taskq_init_ent(&zio->io_tqent);
 
 	return (zio);
 }
 
 void
 zio_destroy(zio_t *zio)
 {
 	metaslab_trace_fini(&zio->io_alloc_list);
 	list_destroy(&zio->io_parent_list);
 	list_destroy(&zio->io_child_list);
 	mutex_destroy(&zio->io_lock);
 	cv_destroy(&zio->io_cv);
 	kmem_cache_free(zio_cache, zio);
 }
 
 /*
  * ZIO intended to be between others.  Provides synchronization at READY
  * and DONE pipeline stages and calls the respective callbacks.
  */
 zio_t *
 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
     void *private, zio_flag_t flags)
 {
 	zio_t *zio;
 
 	zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
 	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
 
 	return (zio);
 }
 
 /*
  * ZIO intended to be a root of a tree.  Unlike null ZIO does not have a
  * READY pipeline stage (is ready on creation), so it should not be used
  * as child of any ZIO that may need waiting for grandchildren READY stage
  * (any other ZIO type).
  */
 zio_t *
 zio_root(spa_t *spa, zio_done_func_t *done, void *private, zio_flag_t flags)
 {
 	zio_t *zio;
 
 	zio = zio_create(NULL, spa, 0, NULL, NULL, 0, 0, done, private,
 	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL,
 	    ZIO_STAGE_OPEN, ZIO_ROOT_PIPELINE);
 
 	return (zio);
 }
 
 static int
 zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
     enum blk_verify_flag blk_verify, const char *fmt, ...)
 {
 	va_list adx;
 	char buf[256];
 
 	va_start(adx, fmt);
 	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
 	va_end(adx);
 
 	zfs_dbgmsg("bad blkptr at %px: "
 	    "DVA[0]=%#llx/%#llx "
 	    "DVA[1]=%#llx/%#llx "
 	    "DVA[2]=%#llx/%#llx "
 	    "prop=%#llx "
 	    "pad=%#llx,%#llx "
 	    "phys_birth=%#llx "
 	    "birth=%#llx "
 	    "fill=%#llx "
 	    "cksum=%#llx/%#llx/%#llx/%#llx",
 	    bp,
 	    (long long)bp->blk_dva[0].dva_word[0],
 	    (long long)bp->blk_dva[0].dva_word[1],
 	    (long long)bp->blk_dva[1].dva_word[0],
 	    (long long)bp->blk_dva[1].dva_word[1],
 	    (long long)bp->blk_dva[2].dva_word[0],
 	    (long long)bp->blk_dva[2].dva_word[1],
 	    (long long)bp->blk_prop,
 	    (long long)bp->blk_pad[0],
 	    (long long)bp->blk_pad[1],
-	    (long long)bp->blk_phys_birth,
-	    (long long)bp->blk_birth,
+	    (long long)BP_GET_PHYSICAL_BIRTH(bp),
+	    (long long)BP_GET_LOGICAL_BIRTH(bp),
 	    (long long)bp->blk_fill,
 	    (long long)bp->blk_cksum.zc_word[0],
 	    (long long)bp->blk_cksum.zc_word[1],
 	    (long long)bp->blk_cksum.zc_word[2],
 	    (long long)bp->blk_cksum.zc_word[3]);
 	switch (blk_verify) {
 	case BLK_VERIFY_HALT:
 		zfs_panic_recover("%s: %s", spa_name(spa), buf);
 		break;
 	case BLK_VERIFY_LOG:
 		zfs_dbgmsg("%s: %s", spa_name(spa), buf);
 		break;
 	case BLK_VERIFY_ONLY:
 		break;
 	}
 
 	return (1);
 }
 
 /*
  * Verify the block pointer fields contain reasonable values.  This means
  * it only contains known object types, checksum/compression identifiers,
  * block sizes within the maximum allowed limits, valid DVAs, etc.
  *
  * If everything checks out B_TRUE is returned.  The zfs_blkptr_verify
  * argument controls the behavior when an invalid field is detected.
  *
  * Values for blk_verify_flag:
  *   BLK_VERIFY_ONLY: evaluate the block
  *   BLK_VERIFY_LOG: evaluate the block and log problems
  *   BLK_VERIFY_HALT: call zfs_panic_recover on error
  *
  * Values for blk_config_flag:
  *   BLK_CONFIG_HELD: caller holds SCL_VDEV for writer
  *   BLK_CONFIG_NEEDED: caller holds no config lock, SCL_VDEV will be
  *   obtained for reader
  *   BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better
  *   performance
  */
 boolean_t
 zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
     enum blk_config_flag blk_config, enum blk_verify_flag blk_verify)
 {
 	int errors = 0;
 
 	if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has invalid TYPE %llu",
 		    bp, (longlong_t)BP_GET_TYPE(bp));
 	}
 	if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS) {
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has invalid CHECKSUM %llu",
 		    bp, (longlong_t)BP_GET_CHECKSUM(bp));
 	}
 	if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS) {
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has invalid COMPRESS %llu",
 		    bp, (longlong_t)BP_GET_COMPRESS(bp));
 	}
 	if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has invalid LSIZE %llu",
 		    bp, (longlong_t)BP_GET_LSIZE(bp));
 	}
 	if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has invalid PSIZE %llu",
 		    bp, (longlong_t)BP_GET_PSIZE(bp));
 	}
 
 	if (BP_IS_EMBEDDED(bp)) {
 		if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px has invalid ETYPE %llu",
 			    bp, (longlong_t)BPE_GET_ETYPE(bp));
 		}
 	}
 
 	/*
 	 * Do not verify individual DVAs if the config is not trusted. This
 	 * will be done once the zio is executed in vdev_mirror_map_alloc.
 	 */
 	if (!spa->spa_trust_config)
 		return (errors == 0);
 
 	switch (blk_config) {
 	case BLK_CONFIG_HELD:
 		ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER));
 		break;
 	case BLK_CONFIG_NEEDED:
 		spa_config_enter(spa, SCL_VDEV, bp, RW_READER);
 		break;
 	case BLK_CONFIG_SKIP:
 		return (errors == 0);
 	default:
 		panic("invalid blk_config %u", blk_config);
 	}
 
 	/*
 	 * Pool-specific checks.
 	 *
-	 * Note: it would be nice to verify that the blk_birth and
-	 * BP_PHYSICAL_BIRTH() are not too large.  However, spa_freeze()
-	 * allows the birth time of log blocks (and dmu_sync()-ed blocks
-	 * that are in the log) to be arbitrarily large.
+	 * Note: it would be nice to verify that the logical birth
+	 * and physical birth are not too large.  However,
+	 * spa_freeze() allows the birth time of log blocks (and
+	 * dmu_sync()-ed blocks that are in the log) to be arbitrarily
+	 * large.
 	 */
 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
 		const dva_t *dva = &bp->blk_dva[i];
 		uint64_t vdevid = DVA_GET_VDEV(dva);
 
 		if (vdevid >= spa->spa_root_vdev->vdev_children) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px DVA %u has invalid VDEV %llu",
 			    bp, i, (longlong_t)vdevid);
 			continue;
 		}
 		vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
 		if (vd == NULL) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px DVA %u has invalid VDEV %llu",
 			    bp, i, (longlong_t)vdevid);
 			continue;
 		}
 		if (vd->vdev_ops == &vdev_hole_ops) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px DVA %u has hole VDEV %llu",
 			    bp, i, (longlong_t)vdevid);
 			continue;
 		}
 		if (vd->vdev_ops == &vdev_missing_ops) {
 			/*
 			 * "missing" vdevs are valid during import, but we
 			 * don't have their detailed info (e.g. asize), so
 			 * we can't perform any more checks on them.
 			 */
 			continue;
 		}
 		uint64_t offset = DVA_GET_OFFSET(dva);
 		uint64_t asize = DVA_GET_ASIZE(dva);
 		if (DVA_GET_GANG(dva))
 			asize = vdev_gang_header_asize(vd);
 		if (offset + asize > vd->vdev_asize) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px DVA %u has invalid OFFSET %llu",
 			    bp, i, (longlong_t)offset);
 		}
 	}
 	if (blk_config == BLK_CONFIG_NEEDED)
 		spa_config_exit(spa, SCL_VDEV, bp);
 
 	return (errors == 0);
 }
 
 boolean_t
 zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp)
 {
 	(void) bp;
 	uint64_t vdevid = DVA_GET_VDEV(dva);
 
 	if (vdevid >= spa->spa_root_vdev->vdev_children)
 		return (B_FALSE);
 
 	vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
 	if (vd == NULL)
 		return (B_FALSE);
 
 	if (vd->vdev_ops == &vdev_hole_ops)
 		return (B_FALSE);
 
 	if (vd->vdev_ops == &vdev_missing_ops) {
 		return (B_FALSE);
 	}
 
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t asize = DVA_GET_ASIZE(dva);
 
 	if (DVA_GET_GANG(dva))
 		asize = vdev_gang_header_asize(vd);
 	if (offset + asize > vd->vdev_asize)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 zio_t *
 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
     zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb)
 {
 	zio_t *zio;
 
-	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
+	zio = zio_create(pio, spa, BP_GET_BIRTH(bp), bp,
 	    data, size, size, done, private,
 	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
 
 	return (zio);
 }
 
 zio_t *
 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
     zio_done_func_t *ready, zio_done_func_t *children_ready,
     zio_done_func_t *done, void *private, zio_priority_t priority,
     zio_flag_t flags, const zbookmark_phys_t *zb)
 {
 	zio_t *zio;
 
 	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
 	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
 	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
 	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
 	    DMU_OT_IS_VALID(zp->zp_type) &&
 	    zp->zp_level < 32 &&
 	    zp->zp_copies > 0 &&
 	    zp->zp_copies <= spa_max_replication(spa));
 
 	zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
 	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
 
 	zio->io_ready = ready;
 	zio->io_children_ready = children_ready;
 	zio->io_prop = *zp;
 
 	/*
 	 * Data can be NULL if we are going to call zio_write_override() to
 	 * provide the already-allocated BP.  But we may need the data to
 	 * verify a dedup hit (if requested).  In this case, don't try to
 	 * dedup (just take the already-allocated BP verbatim). Encrypted
 	 * dedup blocks need data as well so we also disable dedup in this
 	 * case.
 	 */
 	if (data == NULL &&
 	    (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) {
 		zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
 	}
 
 	return (zio);
 }
 
 zio_t *
 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
     uint64_t size, zio_done_func_t *done, void *private,
     zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb)
 {
 	zio_t *zio;
 
 	zio = zio_create(pio, spa, txg, bp, data, size, size, done, private,
 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
 
 	return (zio);
 }
 
 void
 zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite,
     boolean_t brtwrite)
 {
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
 	ASSERT(!brtwrite || !nopwrite);
 
 	/*
 	 * We must reset the io_prop to match the values that existed
 	 * when the bp was first written by dmu_sync() keeping in mind
 	 * that nopwrite and dedup are mutually exclusive.
 	 */
 	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
 	zio->io_prop.zp_nopwrite = nopwrite;
 	zio->io_prop.zp_brtwrite = brtwrite;
 	zio->io_prop.zp_copies = copies;
 	zio->io_bp_override = bp;
 }
 
 void
 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 {
 
 	(void) zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_HALT);
 
 	/*
 	 * The check for EMBEDDED is a performance optimization.  We
 	 * process the free here (by ignoring it) rather than
 	 * putting it on the list and then processing it in zio_free_sync().
 	 */
 	if (BP_IS_EMBEDDED(bp))
 		return;
 
 	/*
 	 * Frees that are for the currently-syncing txg, are not going to be
 	 * deferred, and which will not need to do a read (i.e. not GANG or
 	 * DEDUP), can be processed immediately.  Otherwise, put them on the
 	 * in-memory list for later processing.
 	 *
 	 * Note that we only defer frees after zfs_sync_pass_deferred_free
 	 * when the log space map feature is disabled. [see relevant comment
 	 * in spa_sync_iterate_to_convergence()]
 	 */
 	if (BP_IS_GANG(bp) ||
 	    BP_GET_DEDUP(bp) ||
 	    txg != spa->spa_syncing_txg ||
 	    (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free &&
 	    !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) ||
 	    brt_maybe_exists(spa, bp)) {
 		metaslab_check_free(spa, bp);
 		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
 	} else {
 		VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL);
 	}
 }
 
 /*
  * To improve performance, this function may return NULL if we were able
  * to do the free immediately.  This avoids the cost of creating a zio
  * (and linking it to the parent, etc).
  */
 zio_t *
 zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     zio_flag_t flags)
 {
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(spa_syncing_txg(spa) == txg);
 
 	if (BP_IS_EMBEDDED(bp))
 		return (NULL);
 
 	metaslab_check_free(spa, bp);
 	arc_freed(spa, bp);
 	dsl_scan_freed(spa, bp);
 
 	if (BP_IS_GANG(bp) ||
 	    BP_GET_DEDUP(bp) ||
 	    brt_maybe_exists(spa, bp)) {
 		/*
 		 * GANG, DEDUP and BRT blocks can induce a read (for the gang
 		 * block header, the DDT or the BRT), so issue them
 		 * asynchronously so that this thread is not tied up.
 		 */
 		enum zio_stage stage =
 		    ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC;
 
 		return (zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 		    BP_GET_PSIZE(bp), NULL, NULL,
 		    ZIO_TYPE_FREE, ZIO_PRIORITY_NOW,
 		    flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage));
 	} else {
 		metaslab_free(spa, bp, txg, B_FALSE);
 		return (NULL);
 	}
 }
 
 zio_t *
 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     zio_done_func_t *done, void *private, zio_flag_t flags)
 {
 	zio_t *zio;
 
 	(void) zfs_blkptr_verify(spa, bp, (flags & ZIO_FLAG_CONFIG_WRITER) ?
 	    BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_HALT);
 
 	if (BP_IS_EMBEDDED(bp))
 		return (zio_null(pio, spa, NULL, NULL, NULL, 0));
 
 	/*
 	 * A claim is an allocation of a specific block.  Claims are needed
 	 * to support immediate writes in the intent log.  The issue is that
 	 * immediate writes contain committed data, but in a txg that was
 	 * *not* committed.  Upon opening the pool after an unclean shutdown,
 	 * the intent log claims all blocks that contain immediate write data
 	 * so that the SPA knows they're in use.
 	 *
 	 * All claims *must* be resolved in the first txg -- before the SPA
 	 * starts allocating blocks -- so that nothing is allocated twice.
 	 * If txg == 0 we just verify that the block is claimable.
 	 */
-	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <,
+	ASSERT3U(BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp), <,
 	    spa_min_claim_txg(spa));
 	ASSERT(txg == spa_min_claim_txg(spa) || txg == 0);
 	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(8) */
 
 	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 	    BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
 	    flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
 	ASSERT0(zio->io_queued_timestamp);
 
 	return (zio);
 }
 
 zio_t *
 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
     zio_done_func_t *done, void *private, zio_flag_t flags)
 {
 	zio_t *zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
 	    ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 	    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
 	zio->io_cmd = cmd;
 	return (zio);
 }
 
 zio_t *
 zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     zio_done_func_t *done, void *private, zio_priority_t priority,
     zio_flag_t flags, enum trim_flag trim_flags)
 {
 	zio_t *zio;
 
 	ASSERT0(vd->vdev_children);
 	ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 	ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 	ASSERT3U(size, !=, 0);
 
 	zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size, size, done,
 	    private, ZIO_TYPE_TRIM, priority, flags | ZIO_FLAG_PHYSICAL,
 	    vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE);
 	zio->io_trim_flags = trim_flags;
 
 	return (zio);
 }
 
 zio_t *
 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     abd_t *data, int checksum, zio_done_func_t *done, void *private,
     zio_priority_t priority, zio_flag_t flags, boolean_t labels)
 {
 	zio_t *zio;
 
 	ASSERT(vd->vdev_children == 0);
 	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 	ASSERT3U(offset + size, <=, vd->vdev_psize);
 
 	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
 	    private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd,
 	    offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
 
 	zio->io_prop.zp_checksum = checksum;
 
 	return (zio);
 }
 
 zio_t *
 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     abd_t *data, int checksum, zio_done_func_t *done, void *private,
     zio_priority_t priority, zio_flag_t flags, boolean_t labels)
 {
 	zio_t *zio;
 
 	ASSERT(vd->vdev_children == 0);
 	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 	ASSERT3U(offset + size, <=, vd->vdev_psize);
 
 	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
 	    private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd,
 	    offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
 
 	zio->io_prop.zp_checksum = checksum;
 
 	if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
 		/*
 		 * zec checksums are necessarily destructive -- they modify
 		 * the end of the write buffer to hold the verifier/checksum.
 		 * Therefore, we must make a local copy in case the data is
 		 * being written to multiple places in parallel.
 		 */
 		abd_t *wbuf = abd_alloc_sametype(data, size);
 		abd_copy(wbuf, data, size);
 
 		zio_push_transform(zio, wbuf, size, size, NULL);
 	}
 
 	return (zio);
 }
 
 /*
  * Create a child I/O to do some work for us.
  */
 zio_t *
 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
     abd_t *data, uint64_t size, int type, zio_priority_t priority,
     zio_flag_t flags, zio_done_func_t *done, void *private)
 {
 	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
 	zio_t *zio;
 
 	/*
 	 * vdev child I/Os do not propagate their error to the parent.
 	 * Therefore, for correct operation the caller *must* check for
 	 * and handle the error in the child i/o's done callback.
 	 * The only exceptions are i/os that we don't care about
 	 * (OPTIONAL or REPAIR).
 	 */
 	ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) ||
 	    done != NULL);
 
 	if (type == ZIO_TYPE_READ && bp != NULL) {
 		/*
 		 * If we have the bp, then the child should perform the
 		 * checksum and the parent need not.  This pushes error
 		 * detection as close to the leaves as possible and
 		 * eliminates redundant checksums in the interior nodes.
 		 */
 		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
 		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 	}
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		ASSERT0(vd->vdev_children);
 		offset += VDEV_LABEL_START_SIZE;
 	}
 
 	flags |= ZIO_VDEV_CHILD_FLAGS(pio);
 
 	/*
 	 * If we've decided to do a repair, the write is not speculative --
 	 * even if the original read was.
 	 */
 	if (flags & ZIO_FLAG_IO_REPAIR)
 		flags &= ~ZIO_FLAG_SPECULATIVE;
 
 	/*
 	 * If we're creating a child I/O that is not associated with a
 	 * top-level vdev, then the child zio is not an allocating I/O.
 	 * If this is a retried I/O then we ignore it since we will
 	 * have already processed the original allocating I/O.
 	 */
 	if (flags & ZIO_FLAG_IO_ALLOCATING &&
 	    (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
 		ASSERT(pio->io_metaslab_class != NULL);
 		ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled);
 		ASSERT(type == ZIO_TYPE_WRITE);
 		ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
 		ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
 		ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
 		    pio->io_child_type == ZIO_CHILD_GANG);
 
 		flags &= ~ZIO_FLAG_IO_ALLOCATING;
 	}
 
 	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
 	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
 	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
 	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
 
 	return (zio);
 }
 
 zio_t *
 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
     zio_type_t type, zio_priority_t priority, zio_flag_t flags,
     zio_done_func_t *done, void *private)
 {
 	zio_t *zio;
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
 	    data, size, size, done, private, type, priority,
 	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
 	    vd, offset, NULL,
 	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
 
 	return (zio);
 }
 
 void
 zio_flush(zio_t *pio, vdev_t *vd)
 {
 	if (vd->vdev_nowritecache)
 		return;
 	if (vd->vdev_children == 0) {
 		zio_nowait(zio_ioctl(pio, vd->vdev_spa, vd,
 		    DKIOCFLUSHWRITECACHE, NULL, NULL, ZIO_FLAG_CANFAIL |
 		    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
 	} else {
 		for (uint64_t c = 0; c < vd->vdev_children; c++)
 			zio_flush(pio, vd->vdev_child[c]);
 	}
 }
 
 void
 zio_shrink(zio_t *zio, uint64_t size)
 {
 	ASSERT3P(zio->io_executor, ==, NULL);
 	ASSERT3U(zio->io_orig_size, ==, zio->io_size);
 	ASSERT3U(size, <=, zio->io_size);
 
 	/*
 	 * We don't shrink for raidz because of problems with the
 	 * reconstruction when reading back less than the block size.
 	 * Note, BP_IS_RAIDZ() assumes no compression.
 	 */
 	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
 	if (!BP_IS_RAIDZ(zio->io_bp)) {
 		/* we are not doing a raw write */
 		ASSERT3U(zio->io_size, ==, zio->io_lsize);
 		zio->io_orig_size = zio->io_size = zio->io_lsize = size;
 	}
 }
 
 /*
  * Round provided allocation size up to a value that can be allocated
  * by at least some vdev(s) in the pool with minimum or no additional
  * padding and without extra space usage on others
  */
 static uint64_t
 zio_roundup_alloc_size(spa_t *spa, uint64_t size)
 {
 	if (size > spa->spa_min_alloc)
 		return (roundup(size, spa->spa_gcd_alloc));
 	return (spa->spa_min_alloc);
 }
 
 /*
  * ==========================================================================
  * Prepare to read and write logical blocks
  * ==========================================================================
  */
 
 static zio_t *
 zio_read_bp_init(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	uint64_t psize =
 	    BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
 
 	ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
 
 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
 	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
 	    !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
 		zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
 		    psize, psize, zio_decompress);
 	}
 
 	if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) ||
 	    BP_HAS_INDIRECT_MAC_CKSUM(bp)) &&
 	    zio->io_child_type == ZIO_CHILD_LOGICAL) {
 		zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
 		    psize, psize, zio_decrypt);
 	}
 
 	if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
 		int psize = BPE_GET_PSIZE(bp);
 		void *data = abd_borrow_buf(zio->io_abd, psize);
 
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 		decode_embedded_bp_compressed(bp, data);
 		abd_return_buf_copy(zio->io_abd, data, psize);
 	} else {
 		ASSERT(!BP_IS_EMBEDDED(bp));
 	}
 
 	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
 		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
 
 	return (zio);
 }
 
 static zio_t *
 zio_write_bp_init(zio_t *zio)
 {
 	if (!IO_IS_ALLOCATING(zio))
 		return (zio);
 
 	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
 
 	if (zio->io_bp_override) {
 		blkptr_t *bp = zio->io_bp;
 		zio_prop_t *zp = &zio->io_prop;
 
-		ASSERT(bp->blk_birth != zio->io_txg);
+		ASSERT(BP_GET_LOGICAL_BIRTH(bp) != zio->io_txg);
 
 		*bp = *zio->io_bp_override;
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 		if (zp->zp_brtwrite)
 			return (zio);
 
 		ASSERT(!BP_GET_DEDUP(zio->io_bp_override));
 
 		if (BP_IS_EMBEDDED(bp))
 			return (zio);
 
 		/*
 		 * If we've been overridden and nopwrite is set then
 		 * set the flag accordingly to indicate that a nopwrite
 		 * has already occurred.
 		 */
 		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
 			ASSERT(!zp->zp_dedup);
 			ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
 			zio->io_flags |= ZIO_FLAG_NOPWRITE;
 			return (zio);
 		}
 
 		ASSERT(!zp->zp_nopwrite);
 
 		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
 			return (zio);
 
 		ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
 		    ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
 
 		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum &&
 		    !zp->zp_encrypt) {
 			BP_SET_DEDUP(bp, 1);
 			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
 			return (zio);
 		}
 
 		/*
 		 * We were unable to handle this as an override bp, treat
 		 * it as a regular write I/O.
 		 */
 		zio->io_bp_override = NULL;
 		*bp = zio->io_bp_orig;
 		zio->io_pipeline = zio->io_orig_pipeline;
 	}
 
 	return (zio);
 }
 
 static zio_t *
 zio_write_compress(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	zio_prop_t *zp = &zio->io_prop;
 	enum zio_compress compress = zp->zp_compress;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t lsize = zio->io_lsize;
 	uint64_t psize = zio->io_size;
 	uint32_t pass = 1;
 
 	/*
 	 * If our children haven't all reached the ready stage,
 	 * wait for them and then repeat this pipeline stage.
 	 */
 	if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
 	    ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) {
 		return (NULL);
 	}
 
 	if (!IO_IS_ALLOCATING(zio))
 		return (zio);
 
 	if (zio->io_children_ready != NULL) {
 		/*
 		 * Now that all our children are ready, run the callback
 		 * associated with this zio in case it wants to modify the
 		 * data to be written.
 		 */
 		ASSERT3U(zp->zp_level, >, 0);
 		zio->io_children_ready(zio);
 	}
 
 	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
 	ASSERT(zio->io_bp_override == NULL);
 
-	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
+	if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg) {
 		/*
 		 * We're rewriting an existing block, which means we're
 		 * working on behalf of spa_sync().  For spa_sync() to
 		 * converge, it must eventually be the case that we don't
 		 * have to allocate new blocks.  But compression changes
 		 * the blocksize, which forces a reallocate, and makes
 		 * convergence take longer.  Therefore, after the first
 		 * few passes, stop compressing to ensure convergence.
 		 */
 		pass = spa_sync_pass(spa);
 
 		ASSERT(zio->io_txg == spa_syncing_txg(spa));
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 		ASSERT(!BP_GET_DEDUP(bp));
 
 		if (pass >= zfs_sync_pass_dont_compress)
 			compress = ZIO_COMPRESS_OFF;
 
 		/* Make sure someone doesn't change their mind on overwrites */
 		ASSERT(BP_IS_EMBEDDED(bp) || BP_IS_GANG(bp) ||
 		    MIN(zp->zp_copies, spa_max_replication(spa))
 		    == BP_GET_NDVAS(bp));
 	}
 
 	/* If it's a compressed write that is not raw, compress the buffer. */
 	if (compress != ZIO_COMPRESS_OFF &&
 	    !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
 		void *cbuf = NULL;
 		psize = zio_compress_data(compress, zio->io_abd, &cbuf, lsize,
 		    zp->zp_complevel);
 		if (psize == 0) {
 			compress = ZIO_COMPRESS_OFF;
 		} else if (psize >= lsize) {
 			compress = ZIO_COMPRESS_OFF;
 			if (cbuf != NULL)
 				zio_buf_free(cbuf, lsize);
 		} else if (!zp->zp_dedup && !zp->zp_encrypt &&
 		    psize <= BPE_PAYLOAD_SIZE &&
 		    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
 		    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
 			encode_embedded_bp_compressed(bp,
 			    cbuf, compress, lsize, psize);
 			BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
 			BP_SET_TYPE(bp, zio->io_prop.zp_type);
 			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
 			zio_buf_free(cbuf, lsize);
-			bp->blk_birth = zio->io_txg;
+			BP_SET_LOGICAL_BIRTH(bp, zio->io_txg);
 			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 			ASSERT(spa_feature_is_active(spa,
 			    SPA_FEATURE_EMBEDDED_DATA));
 			return (zio);
 		} else {
 			/*
 			 * Round compressed size up to the minimum allocation
 			 * size of the smallest-ashift device, and zero the
 			 * tail. This ensures that the compressed size of the
 			 * BP (and thus compressratio property) are correct,
 			 * in that we charge for the padding used to fill out
 			 * the last sector.
 			 */
 			size_t rounded = (size_t)zio_roundup_alloc_size(spa,
 			    psize);
 			if (rounded >= lsize) {
 				compress = ZIO_COMPRESS_OFF;
 				zio_buf_free(cbuf, lsize);
 				psize = lsize;
 			} else {
 				abd_t *cdata = abd_get_from_buf(cbuf, lsize);
 				abd_take_ownership_of_buf(cdata, B_TRUE);
 				abd_zero_off(cdata, psize, rounded - psize);
 				psize = rounded;
 				zio_push_transform(zio, cdata,
 				    psize, lsize, NULL);
 			}
 		}
 
 		/*
 		 * We were unable to handle this as an override bp, treat
 		 * it as a regular write I/O.
 		 */
 		zio->io_bp_override = NULL;
 		*bp = zio->io_bp_orig;
 		zio->io_pipeline = zio->io_orig_pipeline;
 
 	} else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 &&
 	    zp->zp_type == DMU_OT_DNODE) {
 		/*
 		 * The DMU actually relies on the zio layer's compression
 		 * to free metadnode blocks that have had all contained
 		 * dnodes freed. As a result, even when doing a raw
 		 * receive, we must check whether the block can be compressed
 		 * to a hole.
 		 */
 		psize = zio_compress_data(ZIO_COMPRESS_EMPTY,
 		    zio->io_abd, NULL, lsize, zp->zp_complevel);
 		if (psize == 0 || psize >= lsize)
 			compress = ZIO_COMPRESS_OFF;
 	} else if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS &&
 	    !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) {
 		/*
 		 * If we are raw receiving an encrypted dataset we should not
 		 * take this codepath because it will change the on-disk block
 		 * and decryption will fail.
 		 */
 		size_t rounded = MIN((size_t)zio_roundup_alloc_size(spa, psize),
 		    lsize);
 
 		if (rounded != psize) {
 			abd_t *cdata = abd_alloc_linear(rounded, B_TRUE);
 			abd_zero_off(cdata, psize, rounded - psize);
 			abd_copy_off(cdata, zio->io_abd, 0, 0, psize);
 			psize = rounded;
 			zio_push_transform(zio, cdata,
 			    psize, rounded, NULL);
 		}
 	} else {
 		ASSERT3U(psize, !=, 0);
 	}
 
 	/*
 	 * The final pass of spa_sync() must be all rewrites, but the first
 	 * few passes offer a trade-off: allocating blocks defers convergence,
 	 * but newly allocated blocks are sequential, so they can be written
 	 * to disk faster.  Therefore, we allow the first few passes of
 	 * spa_sync() to allocate new blocks, but force rewrites after that.
 	 * There should only be a handful of blocks after pass 1 in any case.
 	 */
-	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
+	if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg &&
 	    BP_GET_PSIZE(bp) == psize &&
 	    pass >= zfs_sync_pass_rewrite) {
 		VERIFY3U(psize, !=, 0);
 		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
 
 		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
 		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
 	} else {
 		BP_ZERO(bp);
 		zio->io_pipeline = ZIO_WRITE_PIPELINE;
 	}
 
 	if (psize == 0) {
-		if (zio->io_bp_orig.blk_birth != 0 &&
+		if (BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig) != 0 &&
 		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
 			BP_SET_LSIZE(bp, lsize);
 			BP_SET_TYPE(bp, zp->zp_type);
 			BP_SET_LEVEL(bp, zp->zp_level);
 			BP_SET_BIRTH(bp, zio->io_txg, 0);
 		}
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 	} else {
 		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
 		BP_SET_LSIZE(bp, lsize);
 		BP_SET_TYPE(bp, zp->zp_type);
 		BP_SET_LEVEL(bp, zp->zp_level);
 		BP_SET_PSIZE(bp, psize);
 		BP_SET_COMPRESS(bp, compress);
 		BP_SET_CHECKSUM(bp, zp->zp_checksum);
 		BP_SET_DEDUP(bp, zp->zp_dedup);
 		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 		if (zp->zp_dedup) {
 			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
 			ASSERT(!zp->zp_encrypt ||
 			    DMU_OT_IS_ENCRYPTED(zp->zp_type));
 			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
 		}
 		if (zp->zp_nopwrite) {
 			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
 			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
 		}
 	}
 	return (zio);
 }
 
 static zio_t *
 zio_free_bp_init(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
 		if (BP_GET_DEDUP(bp))
 			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
 	}
 
 	ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
 
 	return (zio);
 }
 
 /*
  * ==========================================================================
  * Execute the I/O pipeline
  * ==========================================================================
  */
 
 static void
 zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
 {
 	spa_t *spa = zio->io_spa;
 	zio_type_t t = zio->io_type;
 	int flags = (cutinline ? TQ_FRONT : 0);
 
 	/*
 	 * If we're a config writer or a probe, the normal issue and
 	 * interrupt threads may all be blocked waiting for the config lock.
 	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
 	 */
 	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
 		t = ZIO_TYPE_NULL;
 
 	/*
 	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
 	 */
 	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
 		t = ZIO_TYPE_NULL;
 
 	/*
 	 * If this is a high priority I/O, then use the high priority taskq if
 	 * available.
 	 */
 	if ((zio->io_priority == ZIO_PRIORITY_NOW ||
 	    zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) &&
 	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
 		q++;
 
 	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
 
 	/*
 	 * NB: We are assuming that the zio can only be dispatched
 	 * to a single taskq at a time.  It would be a grievous error
 	 * to dispatch the zio to another taskq at the same time.
 	 */
 	ASSERT(taskq_empty_ent(&zio->io_tqent));
 	spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags,
 	    &zio->io_tqent, zio);
 }
 
 static boolean_t
 zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
 {
 	spa_t *spa = zio->io_spa;
 
 	taskq_t *tq = taskq_of_curthread();
 
 	for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
 		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 		uint_t i;
 		for (i = 0; i < tqs->stqs_count; i++) {
 			if (tqs->stqs_taskq[i] == tq)
 				return (B_TRUE);
 		}
 	}
 
 	return (B_FALSE);
 }
 
 static zio_t *
 zio_issue_async(zio_t *zio)
 {
 	ASSERT((zio->io_type != ZIO_TYPE_WRITE) || ZIO_HAS_ALLOCATOR(zio));
 	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
 	return (NULL);
 }
 
 void
 zio_interrupt(void *zio)
 {
 	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
 }
 
 void
 zio_delay_interrupt(zio_t *zio)
 {
 	/*
 	 * The timeout_generic() function isn't defined in userspace, so
 	 * rather than trying to implement the function, the zio delay
 	 * functionality has been disabled for userspace builds.
 	 */
 
 #ifdef _KERNEL
 	/*
 	 * If io_target_timestamp is zero, then no delay has been registered
 	 * for this IO, thus jump to the end of this function and "skip" the
 	 * delay; issuing it directly to the zio layer.
 	 */
 	if (zio->io_target_timestamp != 0) {
 		hrtime_t now = gethrtime();
 
 		if (now >= zio->io_target_timestamp) {
 			/*
 			 * This IO has already taken longer than the target
 			 * delay to complete, so we don't want to delay it
 			 * any longer; we "miss" the delay and issue it
 			 * directly to the zio layer. This is likely due to
 			 * the target latency being set to a value less than
 			 * the underlying hardware can satisfy (e.g. delay
 			 * set to 1ms, but the disks take 10ms to complete an
 			 * IO request).
 			 */
 
 			DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
 			    hrtime_t, now);
 
 			zio_interrupt(zio);
 		} else {
 			taskqid_t tid;
 			hrtime_t diff = zio->io_target_timestamp - now;
 			clock_t expire_at_tick = ddi_get_lbolt() +
 			    NSEC_TO_TICK(diff);
 
 			DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
 			    hrtime_t, now, hrtime_t, diff);
 
 			if (NSEC_TO_TICK(diff) == 0) {
 				/* Our delay is less than a jiffy - just spin */
 				zfs_sleep_until(zio->io_target_timestamp);
 				zio_interrupt(zio);
 			} else {
 				/*
 				 * Use taskq_dispatch_delay() in the place of
 				 * OpenZFS's timeout_generic().
 				 */
 				tid = taskq_dispatch_delay(system_taskq,
 				    zio_interrupt, zio, TQ_NOSLEEP,
 				    expire_at_tick);
 				if (tid == TASKQID_INVALID) {
 					/*
 					 * Couldn't allocate a task.  Just
 					 * finish the zio without a delay.
 					 */
 					zio_interrupt(zio);
 				}
 			}
 		}
 		return;
 	}
 #endif
 	DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
 	zio_interrupt(zio);
 }
 
 static void
 zio_deadman_impl(zio_t *pio, int ziodepth)
 {
 	zio_t *cio, *cio_next;
 	zio_link_t *zl = NULL;
 	vdev_t *vd = pio->io_vd;
 
 	if (zio_deadman_log_all || (vd != NULL && vd->vdev_ops->vdev_op_leaf)) {
 		vdev_queue_t *vq = vd ? &vd->vdev_queue : NULL;
 		zbookmark_phys_t *zb = &pio->io_bookmark;
 		uint64_t delta = gethrtime() - pio->io_timestamp;
 		uint64_t failmode = spa_get_deadman_failmode(pio->io_spa);
 
 		zfs_dbgmsg("slow zio[%d]: zio=%px timestamp=%llu "
 		    "delta=%llu queued=%llu io=%llu "
 		    "path=%s "
 		    "last=%llu type=%d "
 		    "priority=%d flags=0x%llx stage=0x%x "
 		    "pipeline=0x%x pipeline-trace=0x%x "
 		    "objset=%llu object=%llu "
 		    "level=%llu blkid=%llu "
 		    "offset=%llu size=%llu "
 		    "error=%d",
 		    ziodepth, pio, pio->io_timestamp,
 		    (u_longlong_t)delta, pio->io_delta, pio->io_delay,
 		    vd ? vd->vdev_path : "NULL",
 		    vq ? vq->vq_io_complete_ts : 0, pio->io_type,
 		    pio->io_priority, (u_longlong_t)pio->io_flags,
 		    pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace,
 		    (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
 		    (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid,
 		    (u_longlong_t)pio->io_offset, (u_longlong_t)pio->io_size,
 		    pio->io_error);
 		(void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN,
 		    pio->io_spa, vd, zb, pio, 0);
 
 		if (failmode == ZIO_FAILURE_MODE_CONTINUE &&
 		    taskq_empty_ent(&pio->io_tqent)) {
 			zio_interrupt(pio);
 		}
 	}
 
 	mutex_enter(&pio->io_lock);
 	for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
 		cio_next = zio_walk_children(pio, &zl);
 		zio_deadman_impl(cio, ziodepth + 1);
 	}
 	mutex_exit(&pio->io_lock);
 }
 
 /*
  * Log the critical information describing this zio and all of its children
  * using the zfs_dbgmsg() interface then post deadman event for the ZED.
  */
 void
 zio_deadman(zio_t *pio, const char *tag)
 {
 	spa_t *spa = pio->io_spa;
 	char *name = spa_name(spa);
 
 	if (!zfs_deadman_enabled || spa_suspended(spa))
 		return;
 
 	zio_deadman_impl(pio, 0);
 
 	switch (spa_get_deadman_failmode(spa)) {
 	case ZIO_FAILURE_MODE_WAIT:
 		zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name);
 		break;
 
 	case ZIO_FAILURE_MODE_CONTINUE:
 		zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name);
 		break;
 
 	case ZIO_FAILURE_MODE_PANIC:
 		fm_panic("%s determined I/O to pool '%s' is hung.", tag, name);
 		break;
 	}
 }
 
 /*
  * Execute the I/O pipeline until one of the following occurs:
  * (1) the I/O completes; (2) the pipeline stalls waiting for
  * dependent child I/Os; (3) the I/O issues, so we're waiting
  * for an I/O completion interrupt; (4) the I/O is delegated by
  * vdev-level caching or aggregation; (5) the I/O is deferred
  * due to vdev-level queueing; (6) the I/O is handed off to
  * another thread.  In all cases, the pipeline stops whenever
  * there's no CPU work; it never burns a thread in cv_wait_io().
  *
  * There's no locking on io_stage because there's no legitimate way
  * for multiple threads to be attempting to process the same I/O.
  */
 static zio_pipe_stage_t *zio_pipeline[];
 
 /*
  * zio_execute() is a wrapper around the static function
  * __zio_execute() so that we can force  __zio_execute() to be
  * inlined.  This reduces stack overhead which is important
  * because __zio_execute() is called recursively in several zio
  * code paths.  zio_execute() itself cannot be inlined because
  * it is externally visible.
  */
 void
 zio_execute(void *zio)
 {
 	fstrans_cookie_t cookie;
 
 	cookie = spl_fstrans_mark();
 	__zio_execute(zio);
 	spl_fstrans_unmark(cookie);
 }
 
 /*
  * Used to determine if in the current context the stack is sized large
  * enough to allow zio_execute() to be called recursively.  A minimum
  * stack size of 16K is required to avoid needing to re-dispatch the zio.
  */
 static boolean_t
 zio_execute_stack_check(zio_t *zio)
 {
 #if !defined(HAVE_LARGE_STACKS)
 	dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
 
 	/* Executing in txg_sync_thread() context. */
 	if (dp && curthread == dp->dp_tx.tx_sync_thread)
 		return (B_TRUE);
 
 	/* Pool initialization outside of zio_taskq context. */
 	if (dp && spa_is_initializing(dp->dp_spa) &&
 	    !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) &&
 	    !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH))
 		return (B_TRUE);
 #else
 	(void) zio;
 #endif /* HAVE_LARGE_STACKS */
 
 	return (B_FALSE);
 }
 
 __attribute__((always_inline))
 static inline void
 __zio_execute(zio_t *zio)
 {
 	ASSERT3U(zio->io_queued_timestamp, >, 0);
 
 	while (zio->io_stage < ZIO_STAGE_DONE) {
 		enum zio_stage pipeline = zio->io_pipeline;
 		enum zio_stage stage = zio->io_stage;
 
 		zio->io_executor = curthread;
 
 		ASSERT(!MUTEX_HELD(&zio->io_lock));
 		ASSERT(ISP2(stage));
 		ASSERT(zio->io_stall == NULL);
 
 		do {
 			stage <<= 1;
 		} while ((stage & pipeline) == 0);
 
 		ASSERT(stage <= ZIO_STAGE_DONE);
 
 		/*
 		 * If we are in interrupt context and this pipeline stage
 		 * will grab a config lock that is held across I/O,
 		 * or may wait for an I/O that needs an interrupt thread
 		 * to complete, issue async to avoid deadlock.
 		 *
 		 * For VDEV_IO_START, we cut in line so that the io will
 		 * be sent to disk promptly.
 		 */
 		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
 		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
 			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
 			    zio_requeue_io_start_cut_in_line : B_FALSE;
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
 			return;
 		}
 
 		/*
 		 * If the current context doesn't have large enough stacks
 		 * the zio must be issued asynchronously to prevent overflow.
 		 */
 		if (zio_execute_stack_check(zio)) {
 			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
 			    zio_requeue_io_start_cut_in_line : B_FALSE;
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
 			return;
 		}
 
 		zio->io_stage = stage;
 		zio->io_pipeline_trace |= zio->io_stage;
 
 		/*
 		 * The zio pipeline stage returns the next zio to execute
 		 * (typically the same as this one), or NULL if we should
 		 * stop.
 		 */
 		zio = zio_pipeline[highbit64(stage) - 1](zio);
 
 		if (zio == NULL)
 			return;
 	}
 }
 
 
 /*
  * ==========================================================================
  * Initiate I/O, either sync or async
  * ==========================================================================
  */
 int
 zio_wait(zio_t *zio)
 {
 	/*
 	 * Some routines, like zio_free_sync(), may return a NULL zio
 	 * to avoid the performance overhead of creating and then destroying
 	 * an unneeded zio.  For the callers' simplicity, we accept a NULL
 	 * zio and ignore it.
 	 */
 	if (zio == NULL)
 		return (0);
 
 	long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms);
 	int error;
 
 	ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN);
 	ASSERT3P(zio->io_executor, ==, NULL);
 
 	zio->io_waiter = curthread;
 	ASSERT0(zio->io_queued_timestamp);
 	zio->io_queued_timestamp = gethrtime();
 
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		spa_select_allocator(zio);
 	}
 	__zio_execute(zio);
 
 	mutex_enter(&zio->io_lock);
 	while (zio->io_executor != NULL) {
 		error = cv_timedwait_io(&zio->io_cv, &zio->io_lock,
 		    ddi_get_lbolt() + timeout);
 
 		if (zfs_deadman_enabled && error == -1 &&
 		    gethrtime() - zio->io_queued_timestamp >
 		    spa_deadman_ziotime(zio->io_spa)) {
 			mutex_exit(&zio->io_lock);
 			timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms);
 			zio_deadman(zio, FTAG);
 			mutex_enter(&zio->io_lock);
 		}
 	}
 	mutex_exit(&zio->io_lock);
 
 	error = zio->io_error;
 	zio_destroy(zio);
 
 	return (error);
 }
 
 void
 zio_nowait(zio_t *zio)
 {
 	/*
 	 * See comment in zio_wait().
 	 */
 	if (zio == NULL)
 		return;
 
 	ASSERT3P(zio->io_executor, ==, NULL);
 
 	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
 	    list_is_empty(&zio->io_parent_list)) {
 		zio_t *pio;
 
 		/*
 		 * This is a logical async I/O with no parent to wait for it.
 		 * We add it to the spa_async_root_zio "Godfather" I/O which
 		 * will ensure they complete prior to unloading the pool.
 		 */
 		spa_t *spa = zio->io_spa;
 		pio = spa->spa_async_zio_root[CPU_SEQID_UNSTABLE];
 
 		zio_add_child(pio, zio);
 	}
 
 	ASSERT0(zio->io_queued_timestamp);
 	zio->io_queued_timestamp = gethrtime();
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		spa_select_allocator(zio);
 	}
 	__zio_execute(zio);
 }
 
 /*
  * ==========================================================================
  * Reexecute, cancel, or suspend/resume failed I/O
  * ==========================================================================
  */
 
 static void
 zio_reexecute(void *arg)
 {
 	zio_t *pio = arg;
 	zio_t *cio, *cio_next, *gio;
 
 	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
 	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
 	ASSERT(pio->io_gang_leader == NULL);
 	ASSERT(pio->io_gang_tree == NULL);
 
 	mutex_enter(&pio->io_lock);
 	pio->io_flags = pio->io_orig_flags;
 	pio->io_stage = pio->io_orig_stage;
 	pio->io_pipeline = pio->io_orig_pipeline;
 	pio->io_reexecute = 0;
 	pio->io_flags |= ZIO_FLAG_REEXECUTED;
 	pio->io_pipeline_trace = 0;
 	pio->io_error = 0;
 	pio->io_state[ZIO_WAIT_READY] = (pio->io_stage >= ZIO_STAGE_READY) ||
 	    (pio->io_pipeline & ZIO_STAGE_READY) == 0;
 	pio->io_state[ZIO_WAIT_DONE] = (pio->io_stage >= ZIO_STAGE_DONE);
 	zio_link_t *zl = NULL;
 	while ((gio = zio_walk_parents(pio, &zl)) != NULL) {
 		for (int w = 0; w < ZIO_WAIT_TYPES; w++) {
 			gio->io_children[pio->io_child_type][w] +=
 			    !pio->io_state[w];
 		}
 	}
 	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
 		pio->io_child_error[c] = 0;
 
 	if (IO_IS_ALLOCATING(pio))
 		BP_ZERO(pio->io_bp);
 
 	/*
 	 * As we reexecute pio's children, new children could be created.
 	 * New children go to the head of pio's io_child_list, however,
 	 * so we will (correctly) not reexecute them.  The key is that
 	 * the remainder of pio's io_child_list, from 'cio_next' onward,
 	 * cannot be affected by any side effects of reexecuting 'cio'.
 	 */
 	zl = NULL;
 	for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
 		cio_next = zio_walk_children(pio, &zl);
 		mutex_exit(&pio->io_lock);
 		zio_reexecute(cio);
 		mutex_enter(&pio->io_lock);
 	}
 	mutex_exit(&pio->io_lock);
 
 	/*
 	 * Now that all children have been reexecuted, execute the parent.
 	 * We don't reexecute "The Godfather" I/O here as it's the
 	 * responsibility of the caller to wait on it.
 	 */
 	if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
 		pio->io_queued_timestamp = gethrtime();
 		__zio_execute(pio);
 	}
 }
 
 void
 zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
 {
 	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
 		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
 		    "failure and the failure mode property for this pool "
 		    "is set to panic.", spa_name(spa));
 
 	cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
 	    "failure and has been suspended.\n", spa_name(spa));
 
 	(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
 	    NULL, NULL, 0);
 
 	mutex_enter(&spa->spa_suspend_lock);
 
 	if (spa->spa_suspend_zio_root == NULL)
 		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 		    ZIO_FLAG_GODFATHER);
 
 	spa->spa_suspended = reason;
 
 	if (zio != NULL) {
 		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
 		ASSERT(zio != spa->spa_suspend_zio_root);
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 		ASSERT(zio_unique_parent(zio) == NULL);
 		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
 		zio_add_child(spa->spa_suspend_zio_root, zio);
 	}
 
 	mutex_exit(&spa->spa_suspend_lock);
 }
 
 int
 zio_resume(spa_t *spa)
 {
 	zio_t *pio;
 
 	/*
 	 * Reexecute all previously suspended i/o.
 	 */
 	mutex_enter(&spa->spa_suspend_lock);
 	spa->spa_suspended = ZIO_SUSPEND_NONE;
 	cv_broadcast(&spa->spa_suspend_cv);
 	pio = spa->spa_suspend_zio_root;
 	spa->spa_suspend_zio_root = NULL;
 	mutex_exit(&spa->spa_suspend_lock);
 
 	if (pio == NULL)
 		return (0);
 
 	zio_reexecute(pio);
 	return (zio_wait(pio));
 }
 
 void
 zio_resume_wait(spa_t *spa)
 {
 	mutex_enter(&spa->spa_suspend_lock);
 	while (spa_suspended(spa))
 		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
 	mutex_exit(&spa->spa_suspend_lock);
 }
 
 /*
  * ==========================================================================
  * Gang blocks.
  *
  * A gang block is a collection of small blocks that looks to the DMU
  * like one large block.  When zio_dva_allocate() cannot find a block
  * of the requested size, due to either severe fragmentation or the pool
  * being nearly full, it calls zio_write_gang_block() to construct the
  * block from smaller fragments.
  *
  * A gang block consists of a gang header (zio_gbh_phys_t) and up to
  * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
  * an indirect block: it's an array of block pointers.  It consumes
  * only one sector and hence is allocatable regardless of fragmentation.
  * The gang header's bps point to its gang members, which hold the data.
  *
  * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
  * as the verifier to ensure uniqueness of the SHA256 checksum.
  * Critically, the gang block bp's blk_cksum is the checksum of the data,
  * not the gang header.  This ensures that data block signatures (needed for
  * deduplication) are independent of how the block is physically stored.
  *
  * Gang blocks can be nested: a gang member may itself be a gang block.
  * Thus every gang block is a tree in which root and all interior nodes are
  * gang headers, and the leaves are normal blocks that contain user data.
  * The root of the gang tree is called the gang leader.
  *
  * To perform any operation (read, rewrite, free, claim) on a gang block,
  * zio_gang_assemble() first assembles the gang tree (minus data leaves)
  * in the io_gang_tree field of the original logical i/o by recursively
  * reading the gang leader and all gang headers below it.  This yields
  * an in-core tree containing the contents of every gang header and the
  * bps for every constituent of the gang block.
  *
  * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
  * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
  * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
  * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
  * zio_read_gang() is a wrapper around zio_read() that omits reading gang
  * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
  * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
  * of the gang header plus zio_checksum_compute() of the data to update the
  * gang header's blk_cksum as described above.
  *
  * The two-phase assemble/issue model solves the problem of partial failure --
  * what if you'd freed part of a gang block but then couldn't read the
  * gang header for another part?  Assembling the entire gang tree first
  * ensures that all the necessary gang header I/O has succeeded before
  * starting the actual work of free, claim, or write.  Once the gang tree
  * is assembled, free and claim are in-memory operations that cannot fail.
  *
  * In the event that a gang write fails, zio_dva_unallocate() walks the
  * gang tree to immediately free (i.e. insert back into the space map)
  * everything we've allocated.  This ensures that we don't get ENOSPC
  * errors during repeated suspend/resume cycles due to a flaky device.
  *
  * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
  * the gang tree, we won't modify the block, so we can safely defer the free
  * (knowing that the block is still intact).  If we *can* assemble the gang
  * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
  * each constituent bp and we can allocate a new block on the next sync pass.
  *
  * In all cases, the gang tree allows complete recovery from partial failure.
  * ==========================================================================
  */
 
 static void
 zio_gang_issue_func_done(zio_t *zio)
 {
 	abd_free(zio->io_abd);
 }
 
 static zio_t *
 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
     uint64_t offset)
 {
 	if (gn != NULL)
 		return (pio);
 
 	return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset),
 	    BP_GET_PSIZE(bp), zio_gang_issue_func_done,
 	    NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
 	    &pio->io_bookmark));
 }
 
 static zio_t *
 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
     uint64_t offset)
 {
 	zio_t *zio;
 
 	if (gn != NULL) {
 		abd_t *gbh_abd =
 		    abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
 		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
 		    gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
 		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
 		    &pio->io_bookmark);
 		/*
 		 * As we rewrite each gang header, the pipeline will compute
 		 * a new gang block header checksum for it; but no one will
 		 * compute a new data checksum, so we do that here.  The one
 		 * exception is the gang leader: the pipeline already computed
 		 * its data checksum because that stage precedes gang assembly.
 		 * (Presently, nothing actually uses interior data checksums;
 		 * this is just good hygiene.)
 		 */
 		if (gn != pio->io_gang_leader->io_gang_tree) {
 			abd_t *buf = abd_get_offset(data, offset);
 
 			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
 			    buf, BP_GET_PSIZE(bp));
 
 			abd_free(buf);
 		}
 		/*
 		 * If we are here to damage data for testing purposes,
 		 * leave the GBH alone so that we can detect the damage.
 		 */
 		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
 			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
 	} else {
 		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
 		    abd_get_offset(data, offset), BP_GET_PSIZE(bp),
 		    zio_gang_issue_func_done, NULL, pio->io_priority,
 		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 	}
 
 	return (zio);
 }
 
 static zio_t *
 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
     uint64_t offset)
 {
 	(void) gn, (void) data, (void) offset;
 
 	zio_t *zio = zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
 	    ZIO_GANG_CHILD_FLAGS(pio));
 	if (zio == NULL) {
 		zio = zio_null(pio, pio->io_spa,
 		    NULL, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio));
 	}
 	return (zio);
 }
 
 static zio_t *
 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
     uint64_t offset)
 {
 	(void) gn, (void) data, (void) offset;
 	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
 	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
 }
 
 static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
 	NULL,
 	zio_read_gang,
 	zio_rewrite_gang,
 	zio_free_gang,
 	zio_claim_gang,
 	NULL
 };
 
 static void zio_gang_tree_assemble_done(zio_t *zio);
 
 static zio_gang_node_t *
 zio_gang_node_alloc(zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *gn;
 
 	ASSERT(*gnpp == NULL);
 
 	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
 	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
 	*gnpp = gn;
 
 	return (gn);
 }
 
 static void
 zio_gang_node_free(zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *gn = *gnpp;
 
 	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
 		ASSERT(gn->gn_child[g] == NULL);
 
 	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
 	kmem_free(gn, sizeof (*gn));
 	*gnpp = NULL;
 }
 
 static void
 zio_gang_tree_free(zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *gn = *gnpp;
 
 	if (gn == NULL)
 		return;
 
 	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
 		zio_gang_tree_free(&gn->gn_child[g]);
 
 	zio_gang_node_free(gnpp);
 }
 
 static void
 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
 	abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
 
 	ASSERT(gio->io_gang_leader == gio);
 	ASSERT(BP_IS_GANG(bp));
 
 	zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
 	    zio_gang_tree_assemble_done, gn, gio->io_priority,
 	    ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
 }
 
 static void
 zio_gang_tree_assemble_done(zio_t *zio)
 {
 	zio_t *gio = zio->io_gang_leader;
 	zio_gang_node_t *gn = zio->io_private;
 	blkptr_t *bp = zio->io_bp;
 
 	ASSERT(gio == zio_unique_parent(zio));
 	ASSERT(list_is_empty(&zio->io_child_list));
 
 	if (zio->io_error)
 		return;
 
 	/* this ABD was created from a linear buf in zio_gang_tree_assemble */
 	if (BP_SHOULD_BYTESWAP(bp))
 		byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
 
 	ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
 	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
 	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
 
 	abd_free(zio->io_abd);
 
 	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
 		if (!BP_IS_GANG(gbp))
 			continue;
 		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
 	}
 }
 
 static void
 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
     uint64_t offset)
 {
 	zio_t *gio = pio->io_gang_leader;
 	zio_t *zio;
 
 	ASSERT(BP_IS_GANG(bp) == !!gn);
 	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
 	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
 
 	/*
 	 * If you're a gang header, your data is in gn->gn_gbh.
 	 * If you're a gang member, your data is in 'data' and gn == NULL.
 	 */
 	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
 
 	if (gn != NULL) {
 		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
 
 		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
 			if (BP_IS_HOLE(gbp))
 				continue;
 			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
 			    offset);
 			offset += BP_GET_PSIZE(gbp);
 		}
 	}
 
 	if (gn == gio->io_gang_tree)
 		ASSERT3U(gio->io_size, ==, offset);
 
 	if (zio != pio)
 		zio_nowait(zio);
 }
 
 static zio_t *
 zio_gang_assemble(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 
 	zio->io_gang_leader = zio;
 
 	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
 
 	return (zio);
 }
 
 static zio_t *
 zio_gang_issue(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) {
 		return (NULL);
 	}
 
 	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 
 	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
 		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd,
 		    0);
 	else
 		zio_gang_tree_free(&zio->io_gang_tree);
 
 	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	return (zio);
 }
 
 static void
 zio_gang_inherit_allocator(zio_t *pio, zio_t *cio)
 {
 	cio->io_allocator = pio->io_allocator;
 	cio->io_wr_iss_tq = pio->io_wr_iss_tq;
 }
 
 static void
 zio_write_gang_member_ready(zio_t *zio)
 {
 	zio_t *pio = zio_unique_parent(zio);
 	dva_t *cdva = zio->io_bp->blk_dva;
 	dva_t *pdva = pio->io_bp->blk_dva;
 	uint64_t asize;
 	zio_t *gio __maybe_unused = zio->io_gang_leader;
 
 	if (BP_IS_HOLE(zio->io_bp))
 		return;
 
 	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
 
 	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
 	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
 	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
 	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
 	VERIFY3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
 
 	mutex_enter(&pio->io_lock);
 	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
 		ASSERT(DVA_GET_GANG(&pdva[d]));
 		asize = DVA_GET_ASIZE(&pdva[d]);
 		asize += DVA_GET_ASIZE(&cdva[d]);
 		DVA_SET_ASIZE(&pdva[d], asize);
 	}
 	mutex_exit(&pio->io_lock);
 }
 
 static void
 zio_write_gang_done(zio_t *zio)
 {
 	/*
 	 * The io_abd field will be NULL for a zio with no data.  The io_flags
 	 * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't
 	 * check for it here as it is cleared in zio_ready.
 	 */
 	if (zio->io_abd != NULL)
 		abd_free(zio->io_abd);
 }
 
 static zio_t *
 zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 {
 	spa_t *spa = pio->io_spa;
 	blkptr_t *bp = pio->io_bp;
 	zio_t *gio = pio->io_gang_leader;
 	zio_t *zio;
 	zio_gang_node_t *gn, **gnpp;
 	zio_gbh_phys_t *gbh;
 	abd_t *gbh_abd;
 	uint64_t txg = pio->io_txg;
 	uint64_t resid = pio->io_size;
 	uint64_t lsize;
 	int copies = gio->io_prop.zp_copies;
 	zio_prop_t zp;
 	int error;
 	boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA);
 
 	/*
 	 * If one copy was requested, store 2 copies of the GBH, so that we
 	 * can still traverse all the data (e.g. to free or scrub) even if a
 	 * block is damaged.  Note that we can't store 3 copies of the GBH in
 	 * all cases, e.g. with encryption, which uses DVA[2] for the IV+salt.
 	 */
 	int gbh_copies = copies;
 	if (gbh_copies == 1) {
 		gbh_copies = MIN(2, spa_max_replication(spa));
 	}
 
 	ASSERT(ZIO_HAS_ALLOCATOR(pio));
 	int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
 	if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 		ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 		ASSERT(has_data);
 
 		flags |= METASLAB_ASYNC_ALLOC;
 		VERIFY(zfs_refcount_held(&mc->mc_allocator[pio->io_allocator].
 		    mca_alloc_slots, pio));
 
 		/*
 		 * The logical zio has already placed a reservation for
 		 * 'copies' allocation slots but gang blocks may require
 		 * additional copies. These additional copies
 		 * (i.e. gbh_copies - copies) are guaranteed to succeed
 		 * since metaslab_class_throttle_reserve() always allows
 		 * additional reservations for gang blocks.
 		 */
 		VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
 		    pio->io_allocator, pio, flags));
 	}
 
 	error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
 	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
 	    &pio->io_alloc_list, pio, pio->io_allocator);
 	if (error) {
 		if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 			ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 			ASSERT(has_data);
 
 			/*
 			 * If we failed to allocate the gang block header then
 			 * we remove any additional allocation reservations that
 			 * we placed here. The original reservation will
 			 * be removed when the logical I/O goes to the ready
 			 * stage.
 			 */
 			metaslab_class_throttle_unreserve(mc,
 			    gbh_copies - copies, pio->io_allocator, pio);
 		}
 
 		pio->io_error = error;
 		return (pio);
 	}
 
 	if (pio == gio) {
 		gnpp = &gio->io_gang_tree;
 	} else {
 		gnpp = pio->io_private;
 		ASSERT(pio->io_ready == zio_write_gang_member_ready);
 	}
 
 	gn = zio_gang_node_alloc(gnpp);
 	gbh = gn->gn_gbh;
 	memset(gbh, 0, SPA_GANGBLOCKSIZE);
 	gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
 
 	/*
 	 * Create the gang header.
 	 */
 	zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
 	    zio_write_gang_done, NULL, pio->io_priority,
 	    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 
 	zio_gang_inherit_allocator(pio, zio);
 
 	/*
 	 * Create and nowait the gang children.
 	 */
 	for (int g = 0; resid != 0; resid -= lsize, g++) {
 		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
 		    SPA_MINBLOCKSIZE);
 		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
 
 		zp.zp_checksum = gio->io_prop.zp_checksum;
 		zp.zp_compress = ZIO_COMPRESS_OFF;
 		zp.zp_complevel = gio->io_prop.zp_complevel;
 		zp.zp_type = DMU_OT_NONE;
 		zp.zp_level = 0;
 		zp.zp_copies = gio->io_prop.zp_copies;
 		zp.zp_dedup = B_FALSE;
 		zp.zp_dedup_verify = B_FALSE;
 		zp.zp_nopwrite = B_FALSE;
 		zp.zp_encrypt = gio->io_prop.zp_encrypt;
 		zp.zp_byteorder = gio->io_prop.zp_byteorder;
 		memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN);
 		memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN);
 		memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN);
 
 		zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
 		    has_data ? abd_get_offset(pio->io_abd, pio->io_size -
 		    resid) : NULL, lsize, lsize, &zp,
 		    zio_write_gang_member_ready, NULL,
 		    zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
 		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 
 		zio_gang_inherit_allocator(zio, cio);
 
 		if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 			ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 			ASSERT(has_data);
 
 			/*
 			 * Gang children won't throttle but we should
 			 * account for their work, so reserve an allocation
 			 * slot for them here.
 			 */
 			VERIFY(metaslab_class_throttle_reserve(mc,
 			    zp.zp_copies, cio->io_allocator, cio, flags));
 		}
 		zio_nowait(cio);
 	}
 
 	/*
 	 * Set pio's pipeline to just wait for zio to finish.
 	 */
 	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	zio_nowait(zio);
 
 	return (pio);
 }
 
 /*
  * The zio_nop_write stage in the pipeline determines if allocating a
  * new bp is necessary.  The nopwrite feature can handle writes in
  * either syncing or open context (i.e. zil writes) and as a result is
  * mutually exclusive with dedup.
  *
  * By leveraging a cryptographically secure checksum, such as SHA256, we
  * can compare the checksums of the new data and the old to determine if
  * allocating a new block is required.  Note that our requirements for
  * cryptographic strength are fairly weak: there can't be any accidental
  * hash collisions, but we don't need to be secure against intentional
  * (malicious) collisions.  To trigger a nopwrite, you have to be able
  * to write the file to begin with, and triggering an incorrect (hash
  * collision) nopwrite is no worse than simply writing to the file.
  * That said, there are no known attacks against the checksum algorithms
  * used for nopwrite, assuming that the salt and the checksums
  * themselves remain secret.
  */
 static zio_t *
 zio_nop_write(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	zio_prop_t *zp = &zio->io_prop;
 
 	ASSERT(BP_IS_HOLE(bp));
 	ASSERT(BP_GET_LEVEL(bp) == 0);
 	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
 	ASSERT(zp->zp_nopwrite);
 	ASSERT(!zp->zp_dedup);
 	ASSERT(zio->io_bp_override == NULL);
 	ASSERT(IO_IS_ALLOCATING(zio));
 
 	/*
 	 * Check to see if the original bp and the new bp have matching
 	 * characteristics (i.e. same checksum, compression algorithms, etc).
 	 * If they don't then just continue with the pipeline which will
 	 * allocate a new bp.
 	 */
 	if (BP_IS_HOLE(bp_orig) ||
 	    !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
 	    ZCHECKSUM_FLAG_NOPWRITE) ||
 	    BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) ||
 	    BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
 	    BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
 	    BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
 	    zp->zp_copies != BP_GET_NDVAS(bp_orig))
 		return (zio);
 
 	/*
 	 * If the checksums match then reset the pipeline so that we
 	 * avoid allocating a new bp and issuing any I/O.
 	 */
 	if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
 		ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
 		    ZCHECKSUM_FLAG_NOPWRITE);
 		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
 		ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
 		ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
 		ASSERT3U(bp->blk_prop, ==, bp_orig->blk_prop);
 
 		/*
 		 * If we're overwriting a block that is currently on an
 		 * indirect vdev, then ignore the nopwrite request and
 		 * allow a new block to be allocated on a concrete vdev.
 		 */
 		spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER);
 		for (int d = 0; d < BP_GET_NDVAS(bp_orig); d++) {
 			vdev_t *tvd = vdev_lookup_top(zio->io_spa,
 			    DVA_GET_VDEV(&bp_orig->blk_dva[d]));
 			if (tvd->vdev_ops == &vdev_indirect_ops) {
 				spa_config_exit(zio->io_spa, SCL_VDEV, FTAG);
 				return (zio);
 			}
 		}
 		spa_config_exit(zio->io_spa, SCL_VDEV, FTAG);
 
 		*bp = *bp_orig;
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 		zio->io_flags |= ZIO_FLAG_NOPWRITE;
 	}
 
 	return (zio);
 }
 
 /*
  * ==========================================================================
  * Block Reference Table
  * ==========================================================================
  */
 static zio_t *
 zio_brt_free(zio_t *zio)
 {
 	blkptr_t *bp;
 
 	bp = zio->io_bp;
 
 	if (BP_GET_LEVEL(bp) > 0 ||
 	    BP_IS_METADATA(bp) ||
 	    !brt_maybe_exists(zio->io_spa, bp)) {
 		return (zio);
 	}
 
 	if (!brt_entry_decref(zio->io_spa, bp)) {
 		/*
 		 * This isn't the last reference, so we cannot free
 		 * the data yet.
 		 */
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 	}
 
 	return (zio);
 }
 
 /*
  * ==========================================================================
  * Dedup
  * ==========================================================================
  */
 static void
 zio_ddt_child_read_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	ddt_entry_t *dde = zio->io_private;
 	ddt_phys_t *ddp;
 	zio_t *pio = zio_unique_parent(zio);
 
 	mutex_enter(&pio->io_lock);
 	ddp = ddt_phys_select(dde, bp);
 	if (zio->io_error == 0)
 		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
 
 	if (zio->io_error == 0 && dde->dde_repair_abd == NULL)
 		dde->dde_repair_abd = zio->io_abd;
 	else
 		abd_free(zio->io_abd);
 	mutex_exit(&pio->io_lock);
 }
 
 static zio_t *
 zio_ddt_read_start(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 	if (zio->io_child_error[ZIO_CHILD_DDT]) {
 		ddt_t *ddt = ddt_select(zio->io_spa, bp);
 		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
 		ddt_phys_t *ddp = dde->dde_phys;
 		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
 		blkptr_t blk;
 
 		ASSERT(zio->io_vsd == NULL);
 		zio->io_vsd = dde;
 
 		if (ddp_self == NULL)
 			return (zio);
 
 		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
 				continue;
 			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
 			    &blk);
 			zio_nowait(zio_read(zio, zio->io_spa, &blk,
 			    abd_alloc_for_io(zio->io_size, B_TRUE),
 			    zio->io_size, zio_ddt_child_read_done, dde,
 			    zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) |
 			    ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark));
 		}
 		return (zio);
 	}
 
 	zio_nowait(zio_read(zio, zio->io_spa, bp,
 	    zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority,
 	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
 
 	return (zio);
 }
 
 static zio_t *
 zio_ddt_read_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) {
 		return (NULL);
 	}
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 	if (zio->io_child_error[ZIO_CHILD_DDT]) {
 		ddt_t *ddt = ddt_select(zio->io_spa, bp);
 		ddt_entry_t *dde = zio->io_vsd;
 		if (ddt == NULL) {
 			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
 			return (zio);
 		}
 		if (dde == NULL) {
 			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
 			return (NULL);
 		}
 		if (dde->dde_repair_abd != NULL) {
 			abd_copy(zio->io_abd, dde->dde_repair_abd,
 			    zio->io_size);
 			zio->io_child_error[ZIO_CHILD_DDT] = 0;
 		}
 		ddt_repair_done(ddt, dde);
 		zio->io_vsd = NULL;
 	}
 
 	ASSERT(zio->io_vsd == NULL);
 
 	return (zio);
 }
 
 static boolean_t
 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
 {
 	spa_t *spa = zio->io_spa;
 	boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW);
 
 	ASSERT(!(zio->io_bp_override && do_raw));
 
 	/*
 	 * Note: we compare the original data, not the transformed data,
 	 * because when zio->io_bp is an override bp, we will not have
 	 * pushed the I/O transforms.  That's an important optimization
 	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
 	 * However, we should never get a raw, override zio so in these
 	 * cases we can compare the io_abd directly. This is useful because
 	 * it allows us to do dedup verification even if we don't have access
 	 * to the original data (for instance, if the encryption keys aren't
 	 * loaded).
 	 */
 
 	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
 		zio_t *lio = dde->dde_lead_zio[p];
 
 		if (lio != NULL && do_raw) {
 			return (lio->io_size != zio->io_size ||
 			    abd_cmp(zio->io_abd, lio->io_abd) != 0);
 		} else if (lio != NULL) {
 			return (lio->io_orig_size != zio->io_orig_size ||
 			    abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
 		}
 	}
 
 	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
 		ddt_phys_t *ddp = &dde->dde_phys[p];
 
 		if (ddp->ddp_phys_birth != 0 && do_raw) {
 			blkptr_t blk = *zio->io_bp;
 			uint64_t psize;
 			abd_t *tmpabd;
 			int error;
 
 			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
 			psize = BP_GET_PSIZE(&blk);
 
 			if (psize != zio->io_size)
 				return (B_TRUE);
 
 			ddt_exit(ddt);
 
 			tmpabd = abd_alloc_for_io(psize, B_TRUE);
 
 			error = zio_wait(zio_read(NULL, spa, &blk, tmpabd,
 			    psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 			    ZIO_FLAG_RAW, &zio->io_bookmark));
 
 			if (error == 0) {
 				if (abd_cmp(tmpabd, zio->io_abd) != 0)
 					error = SET_ERROR(ENOENT);
 			}
 
 			abd_free(tmpabd);
 			ddt_enter(ddt);
 			return (error != 0);
 		} else if (ddp->ddp_phys_birth != 0) {
 			arc_buf_t *abuf = NULL;
 			arc_flags_t aflags = ARC_FLAG_WAIT;
 			blkptr_t blk = *zio->io_bp;
 			int error;
 
 			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
 
 			if (BP_GET_LSIZE(&blk) != zio->io_orig_size)
 				return (B_TRUE);
 
 			ddt_exit(ddt);
 
 			error = arc_read(NULL, spa, &blk,
 			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 			    &aflags, &zio->io_bookmark);
 
 			if (error == 0) {
 				if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data,
 				    zio->io_orig_size) != 0)
 					error = SET_ERROR(ENOENT);
 				arc_buf_destroy(abuf, &abuf);
 			}
 
 			ddt_enter(ddt);
 			return (error != 0);
 		}
 	}
 
 	return (B_FALSE);
 }
 
 static void
 zio_ddt_child_write_ready(zio_t *zio)
 {
 	int p = zio->io_prop.zp_copies;
 	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
 	ddt_entry_t *dde = zio->io_private;
 	ddt_phys_t *ddp = &dde->dde_phys[p];
 	zio_t *pio;
 
 	if (zio->io_error)
 		return;
 
 	ddt_enter(ddt);
 
 	ASSERT(dde->dde_lead_zio[p] == zio);
 
 	ddt_phys_fill(ddp, zio->io_bp);
 
 	zio_link_t *zl = NULL;
 	while ((pio = zio_walk_parents(zio, &zl)) != NULL)
 		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
 
 	ddt_exit(ddt);
 }
 
 static void
 zio_ddt_child_write_done(zio_t *zio)
 {
 	int p = zio->io_prop.zp_copies;
 	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
 	ddt_entry_t *dde = zio->io_private;
 	ddt_phys_t *ddp = &dde->dde_phys[p];
 
 	ddt_enter(ddt);
 
 	ASSERT(ddp->ddp_refcnt == 0);
 	ASSERT(dde->dde_lead_zio[p] == zio);
 	dde->dde_lead_zio[p] = NULL;
 
 	if (zio->io_error == 0) {
 		zio_link_t *zl = NULL;
 		while (zio_walk_parents(zio, &zl) != NULL)
 			ddt_phys_addref(ddp);
 	} else {
 		ddt_phys_clear(ddp);
 	}
 
 	ddt_exit(ddt);
 }
 
 static zio_t *
 zio_ddt_write(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t txg = zio->io_txg;
 	zio_prop_t *zp = &zio->io_prop;
 	int p = zp->zp_copies;
 	zio_t *cio = NULL;
 	ddt_t *ddt = ddt_select(spa, bp);
 	ddt_entry_t *dde;
 	ddt_phys_t *ddp;
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
 	ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
 	ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
 
 	ddt_enter(ddt);
 	dde = ddt_lookup(ddt, bp, B_TRUE);
 	ddp = &dde->dde_phys[p];
 
 	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
 		/*
 		 * If we're using a weak checksum, upgrade to a strong checksum
 		 * and try again.  If we're already using a strong checksum,
 		 * we can't resolve it, so just convert to an ordinary write.
 		 * (And automatically e-mail a paper to Nature?)
 		 */
 		if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
 		    ZCHECKSUM_FLAG_DEDUP)) {
 			zp->zp_checksum = spa_dedup_checksum(spa);
 			zio_pop_transforms(zio);
 			zio->io_stage = ZIO_STAGE_OPEN;
 			BP_ZERO(bp);
 		} else {
 			zp->zp_dedup = B_FALSE;
 			BP_SET_DEDUP(bp, B_FALSE);
 		}
 		ASSERT(!BP_GET_DEDUP(bp));
 		zio->io_pipeline = ZIO_WRITE_PIPELINE;
 		ddt_exit(ddt);
 		return (zio);
 	}
 
 	if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
 		if (ddp->ddp_phys_birth != 0)
 			ddt_bp_fill(ddp, bp, txg);
 		if (dde->dde_lead_zio[p] != NULL)
 			zio_add_child(zio, dde->dde_lead_zio[p]);
 		else
 			ddt_phys_addref(ddp);
 	} else if (zio->io_bp_override) {
-		ASSERT(bp->blk_birth == txg);
+		ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg);
 		ASSERT(BP_EQUAL(bp, zio->io_bp_override));
 		ddt_phys_fill(ddp, bp);
 		ddt_phys_addref(ddp);
 	} else {
 		cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
 		    zio->io_orig_size, zio->io_orig_size, zp,
 		    zio_ddt_child_write_ready, NULL,
 		    zio_ddt_child_write_done, dde, zio->io_priority,
 		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
 
 		zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
 		dde->dde_lead_zio[p] = cio;
 	}
 
 	ddt_exit(ddt);
 
 	zio_nowait(cio);
 
 	return (zio);
 }
 
 static ddt_entry_t *freedde; /* for debugging */
 
 static zio_t *
 zio_ddt_free(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	ddt_t *ddt = ddt_select(spa, bp);
 	ddt_entry_t *dde;
 	ddt_phys_t *ddp;
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 	ddt_enter(ddt);
 	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
 	if (dde) {
 		ddp = ddt_phys_select(dde, bp);
 		if (ddp)
 			ddt_phys_decref(ddp);
 	}
 	ddt_exit(ddt);
 
 	return (zio);
 }
 
 /*
  * ==========================================================================
  * Allocate and free blocks
  * ==========================================================================
  */
 
 static zio_t *
 zio_io_to_allocate(spa_t *spa, int allocator)
 {
 	zio_t *zio;
 
 	ASSERT(MUTEX_HELD(&spa->spa_allocs[allocator].spaa_lock));
 
 	zio = avl_first(&spa->spa_allocs[allocator].spaa_tree);
 	if (zio == NULL)
 		return (NULL);
 
 	ASSERT(IO_IS_ALLOCATING(zio));
 	ASSERT(ZIO_HAS_ALLOCATOR(zio));
 
 	/*
 	 * Try to place a reservation for this zio. If we're unable to
 	 * reserve then we throttle.
 	 */
 	ASSERT3U(zio->io_allocator, ==, allocator);
 	if (!metaslab_class_throttle_reserve(zio->io_metaslab_class,
 	    zio->io_prop.zp_copies, allocator, zio, 0)) {
 		return (NULL);
 	}
 
 	avl_remove(&spa->spa_allocs[allocator].spaa_tree, zio);
 	ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
 
 	return (zio);
 }
 
 static zio_t *
 zio_dva_throttle(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	zio_t *nio;
 	metaslab_class_t *mc;
 
 	/* locate an appropriate allocation class */
 	mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type,
 	    zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk);
 
 	if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
 	    !mc->mc_alloc_throttle_enabled ||
 	    zio->io_child_type == ZIO_CHILD_GANG ||
 	    zio->io_flags & ZIO_FLAG_NODATA) {
 		return (zio);
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 	ASSERT(ZIO_HAS_ALLOCATOR(zio));
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 	ASSERT3U(zio->io_queued_timestamp, >, 0);
 	ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
 
 	int allocator = zio->io_allocator;
 	zio->io_metaslab_class = mc;
 	mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
 	avl_add(&spa->spa_allocs[allocator].spaa_tree, zio);
 	nio = zio_io_to_allocate(spa, allocator);
 	mutex_exit(&spa->spa_allocs[allocator].spaa_lock);
 	return (nio);
 }
 
 static void
 zio_allocate_dispatch(spa_t *spa, int allocator)
 {
 	zio_t *zio;
 
 	mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
 	zio = zio_io_to_allocate(spa, allocator);
 	mutex_exit(&spa->spa_allocs[allocator].spaa_lock);
 	if (zio == NULL)
 		return;
 
 	ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
 	ASSERT0(zio->io_error);
 	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
 }
 
 static zio_t *
 zio_dva_allocate(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	metaslab_class_t *mc;
 	blkptr_t *bp = zio->io_bp;
 	int error;
 	int flags = 0;
 
 	if (zio->io_gang_leader == NULL) {
 		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 		zio->io_gang_leader = zio;
 	}
 
 	ASSERT(BP_IS_HOLE(bp));
 	ASSERT0(BP_GET_NDVAS(bp));
 	ASSERT3U(zio->io_prop.zp_copies, >, 0);
 	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
 
 	if (zio->io_flags & ZIO_FLAG_NODATA)
 		flags |= METASLAB_DONT_THROTTLE;
 	if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
 		flags |= METASLAB_GANG_CHILD;
 	if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE)
 		flags |= METASLAB_ASYNC_ALLOC;
 
 	/*
 	 * if not already chosen, locate an appropriate allocation class
 	 */
 	mc = zio->io_metaslab_class;
 	if (mc == NULL) {
 		mc = spa_preferred_class(spa, zio->io_size,
 		    zio->io_prop.zp_type, zio->io_prop.zp_level,
 		    zio->io_prop.zp_zpl_smallblk);
 		zio->io_metaslab_class = mc;
 	}
 
 	/*
 	 * Try allocating the block in the usual metaslab class.
 	 * If that's full, allocate it in the normal class.
 	 * If that's full, allocate as a gang block,
 	 * and if all are full, the allocation fails (which shouldn't happen).
 	 *
 	 * Note that we do not fall back on embedded slog (ZIL) space, to
 	 * preserve unfragmented slog space, which is critical for decent
 	 * sync write performance.  If a log allocation fails, we will fall
 	 * back to spa_sync() which is abysmal for performance.
 	 */
 	ASSERT(ZIO_HAS_ALLOCATOR(zio));
 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
 	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
 	    &zio->io_alloc_list, zio, zio->io_allocator);
 
 	/*
 	 * Fallback to normal class when an alloc class is full
 	 */
 	if (error == ENOSPC && mc != spa_normal_class(spa)) {
 		/*
 		 * If throttling, transfer reservation over to normal class.
 		 * The io_allocator slot can remain the same even though we
 		 * are switching classes.
 		 */
 		if (mc->mc_alloc_throttle_enabled &&
 		    (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) {
 			metaslab_class_throttle_unreserve(mc,
 			    zio->io_prop.zp_copies, zio->io_allocator, zio);
 			zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING;
 
 			VERIFY(metaslab_class_throttle_reserve(
 			    spa_normal_class(spa),
 			    zio->io_prop.zp_copies, zio->io_allocator, zio,
 			    flags | METASLAB_MUST_RESERVE));
 		}
 		zio->io_metaslab_class = mc = spa_normal_class(spa);
 		if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
 			zfs_dbgmsg("%s: metaslab allocation failure, "
 			    "trying normal class: zio %px, size %llu, error %d",
 			    spa_name(spa), zio, (u_longlong_t)zio->io_size,
 			    error);
 		}
 
 		error = metaslab_alloc(spa, mc, zio->io_size, bp,
 		    zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
 		    &zio->io_alloc_list, zio, zio->io_allocator);
 	}
 
 	if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
 		if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
 			zfs_dbgmsg("%s: metaslab allocation failure, "
 			    "trying ganging: zio %px, size %llu, error %d",
 			    spa_name(spa), zio, (u_longlong_t)zio->io_size,
 			    error);
 		}
 		return (zio_write_gang_block(zio, mc));
 	}
 	if (error != 0) {
 		if (error != ENOSPC ||
 		    (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC)) {
 			zfs_dbgmsg("%s: metaslab allocation failure: zio %px, "
 			    "size %llu, error %d",
 			    spa_name(spa), zio, (u_longlong_t)zio->io_size,
 			    error);
 		}
 		zio->io_error = error;
 	}
 
 	return (zio);
 }
 
 static zio_t *
 zio_dva_free(zio_t *zio)
 {
 	metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
 
 	return (zio);
 }
 
 static zio_t *
 zio_dva_claim(zio_t *zio)
 {
 	int error;
 
 	error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
 	if (error)
 		zio->io_error = error;
 
 	return (zio);
 }
 
 /*
  * Undo an allocation.  This is used by zio_done() when an I/O fails
  * and we want to give back the block we just allocated.
  * This handles both normal blocks and gang blocks.
  */
 static void
 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
 {
-	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
+	ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp));
 	ASSERT(zio->io_bp_override == NULL);
 
-	if (!BP_IS_HOLE(bp))
-		metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
+	if (!BP_IS_HOLE(bp)) {
+		metaslab_free(zio->io_spa, bp, BP_GET_LOGICAL_BIRTH(bp),
+		    B_TRUE);
+	}
 
 	if (gn != NULL) {
 		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 			zio_dva_unallocate(zio, gn->gn_child[g],
 			    &gn->gn_gbh->zg_blkptr[g]);
 		}
 	}
 }
 
 /*
  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
  */
 int
 zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
     uint64_t size, boolean_t *slog)
 {
 	int error = 1;
 	zio_alloc_list_t io_alloc_list;
 
 	ASSERT(txg > spa_syncing_txg(spa));
 
 	metaslab_trace_init(&io_alloc_list);
 
 	/*
 	 * Block pointer fields are useful to metaslabs for stats and debugging.
 	 * Fill in the obvious ones before calling into metaslab_alloc().
 	 */
 	BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
 	BP_SET_PSIZE(new_bp, size);
 	BP_SET_LEVEL(new_bp, 0);
 
 	/*
 	 * When allocating a zil block, we don't have information about
 	 * the final destination of the block except the objset it's part
 	 * of, so we just hash the objset ID to pick the allocator to get
 	 * some parallelism.
 	 */
 	int flags = METASLAB_ZIL;
 	int allocator = (uint_t)cityhash4(0, 0, 0,
 	    os->os_dsl_dataset->ds_object) % spa->spa_alloc_count;
 	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
 	    txg, NULL, flags, &io_alloc_list, NULL, allocator);
 	*slog = (error == 0);
 	if (error != 0) {
 		error = metaslab_alloc(spa, spa_embedded_log_class(spa), size,
 		    new_bp, 1, txg, NULL, flags,
 		    &io_alloc_list, NULL, allocator);
 	}
 	if (error != 0) {
 		error = metaslab_alloc(spa, spa_normal_class(spa), size,
 		    new_bp, 1, txg, NULL, flags,
 		    &io_alloc_list, NULL, allocator);
 	}
 	metaslab_trace_fini(&io_alloc_list);
 
 	if (error == 0) {
 		BP_SET_LSIZE(new_bp, size);
 		BP_SET_PSIZE(new_bp, size);
 		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
 		BP_SET_CHECKSUM(new_bp,
 		    spa_version(spa) >= SPA_VERSION_SLIM_ZIL
 		    ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
 		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
 		BP_SET_LEVEL(new_bp, 0);
 		BP_SET_DEDUP(new_bp, 0);
 		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
 
 		/*
 		 * encrypted blocks will require an IV and salt. We generate
 		 * these now since we will not be rewriting the bp at
 		 * rewrite time.
 		 */
 		if (os->os_encrypted) {
 			uint8_t iv[ZIO_DATA_IV_LEN];
 			uint8_t salt[ZIO_DATA_SALT_LEN];
 
 			BP_SET_CRYPT(new_bp, B_TRUE);
 			VERIFY0(spa_crypt_get_salt(spa,
 			    dmu_objset_id(os), salt));
 			VERIFY0(zio_crypt_generate_iv(iv));
 
 			zio_crypt_encode_params_bp(new_bp, salt, iv);
 		}
 	} else {
 		zfs_dbgmsg("%s: zil block allocation failure: "
 		    "size %llu, error %d", spa_name(spa), (u_longlong_t)size,
 		    error);
 	}
 
 	return (error);
 }
 
 /*
  * ==========================================================================
  * Read and write to physical devices
  * ==========================================================================
  */
 
 /*
  * Issue an I/O to the underlying vdev. Typically the issue pipeline
  * stops after this stage and will resume upon I/O completion.
  * However, there are instances where the vdev layer may need to
  * continue the pipeline when an I/O was not issued. Since the I/O
  * that was sent to the vdev layer might be different than the one
  * currently active in the pipeline (see vdev_queue_io()), we explicitly
  * force the underlying vdev layers to call either zio_execute() or
  * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
  */
 static zio_t *
 zio_vdev_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	uint64_t align;
 	spa_t *spa = zio->io_spa;
 
 	zio->io_delay = 0;
 
 	ASSERT(zio->io_error == 0);
 	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
 
 	if (vd == NULL) {
 		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
 			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
 
 		/*
 		 * The mirror_ops handle multiple DVAs in a single BP.
 		 */
 		vdev_mirror_ops.vdev_op_io_start(zio);
 		return (NULL);
 	}
 
 	ASSERT3P(zio->io_logical, !=, zio);
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		ASSERT(spa->spa_trust_config);
 
 		/*
 		 * Note: the code can handle other kinds of writes,
 		 * but we don't expect them.
 		 */
 		if (zio->io_vd->vdev_noalloc) {
 			ASSERT(zio->io_flags &
 			    (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
 			    ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE));
 		}
 	}
 
 	align = 1ULL << vd->vdev_top->vdev_ashift;
 
 	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
 	    P2PHASE(zio->io_size, align) != 0) {
 		/* Transform logical writes to be a full physical block size. */
 		uint64_t asize = P2ROUNDUP(zio->io_size, align);
 		abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize);
 		ASSERT(vd == vd->vdev_top);
 		if (zio->io_type == ZIO_TYPE_WRITE) {
 			abd_copy(abuf, zio->io_abd, zio->io_size);
 			abd_zero_off(abuf, zio->io_size, asize - zio->io_size);
 		}
 		zio_push_transform(zio, abuf, asize, asize, zio_subblock);
 	}
 
 	/*
 	 * If this is not a physical io, make sure that it is properly aligned
 	 * before proceeding.
 	 */
 	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
 		ASSERT0(P2PHASE(zio->io_offset, align));
 		ASSERT0(P2PHASE(zio->io_size, align));
 	} else {
 		/*
 		 * For physical writes, we allow 512b aligned writes and assume
 		 * the device will perform a read-modify-write as necessary.
 		 */
 		ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
 		ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
 	}
 
 	VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
 
 	/*
 	 * If this is a repair I/O, and there's no self-healing involved --
 	 * that is, we're just resilvering what we expect to resilver --
 	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
 	 * This prevents spurious resilvering.
 	 *
 	 * There are a few ways that we can end up creating these spurious
 	 * resilver i/os:
 	 *
 	 * 1. A resilver i/o will be issued if any DVA in the BP has a
 	 * dirty DTL.  The mirror code will issue resilver writes to
 	 * each DVA, including the one(s) that are not on vdevs with dirty
 	 * DTLs.
 	 *
 	 * 2. With nested replication, which happens when we have a
 	 * "replacing" or "spare" vdev that's a child of a mirror or raidz.
 	 * For example, given mirror(replacing(A+B), C), it's likely that
 	 * only A is out of date (it's the new device). In this case, we'll
 	 * read from C, then use the data to resilver A+B -- but we don't
 	 * actually want to resilver B, just A. The top-level mirror has no
 	 * way to know this, so instead we just discard unnecessary repairs
 	 * as we work our way down the vdev tree.
 	 *
 	 * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc.
 	 * The same logic applies to any form of nested replication: ditto
 	 * + mirror, RAID-Z + replacing, etc.
 	 *
 	 * However, indirect vdevs point off to other vdevs which may have
 	 * DTL's, so we never bypass them.  The child i/os on concrete vdevs
 	 * will be properly bypassed instead.
 	 *
 	 * Leaf DTL_PARTIAL can be empty when a legitimate write comes from
 	 * a dRAID spare vdev. For example, when a dRAID spare is first
 	 * used, its spare blocks need to be written to but the leaf vdev's
 	 * of such blocks can have empty DTL_PARTIAL.
 	 *
 	 * There seemed no clean way to allow such writes while bypassing
 	 * spurious ones. At this point, just avoid all bypassing for dRAID
 	 * for correctness.
 	 */
 	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
 	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
 	    zio->io_txg != 0 &&	/* not a delegated i/o */
 	    vd->vdev_ops != &vdev_indirect_ops &&
 	    vd->vdev_top->vdev_ops != &vdev_draid_ops &&
 	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 		zio_vdev_io_bypass(zio);
 		return (zio);
 	}
 
 	/*
 	 * Select the next best leaf I/O to process.  Distributed spares are
 	 * excluded since they dispatch the I/O directly to a leaf vdev after
 	 * applying the dRAID mapping.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    vd->vdev_ops != &vdev_draid_spare_ops &&
 	    (zio->io_type == ZIO_TYPE_READ ||
 	    zio->io_type == ZIO_TYPE_WRITE ||
 	    zio->io_type == ZIO_TYPE_TRIM)) {
 
 		if ((zio = vdev_queue_io(zio)) == NULL)
 			return (NULL);
 
 		if (!vdev_accessible(vd, zio)) {
 			zio->io_error = SET_ERROR(ENXIO);
 			zio_interrupt(zio);
 			return (NULL);
 		}
 		zio->io_delay = gethrtime();
 	}
 
 	vd->vdev_ops->vdev_op_io_start(zio);
 	return (NULL);
 }
 
 static zio_t *
 zio_vdev_io_done(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
 	boolean_t unexpected_error = B_FALSE;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
 		return (NULL);
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ ||
 	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM);
 
 	if (zio->io_delay)
 		zio->io_delay = gethrtime() - zio->io_delay;
 
 	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
 	    vd->vdev_ops != &vdev_draid_spare_ops) {
 		vdev_queue_io_done(zio);
 
 		if (zio_injection_enabled && zio->io_error == 0)
 			zio->io_error = zio_handle_device_injections(vd, zio,
 			    EIO, EILSEQ);
 
 		if (zio_injection_enabled && zio->io_error == 0)
 			zio->io_error = zio_handle_label_injection(zio, EIO);
 
 		if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) {
 			if (!vdev_accessible(vd, zio)) {
 				zio->io_error = SET_ERROR(ENXIO);
 			} else {
 				unexpected_error = B_TRUE;
 			}
 		}
 	}
 
 	ops->vdev_op_io_done(zio);
 
 	if (unexpected_error && vd->vdev_remove_wanted == B_FALSE)
 		VERIFY(vdev_probe(vd, zio) == NULL);
 
 	return (zio);
 }
 
 /*
  * This function is used to change the priority of an existing zio that is
  * currently in-flight. This is used by the arc to upgrade priority in the
  * event that a demand read is made for a block that is currently queued
  * as a scrub or async read IO. Otherwise, the high priority read request
  * would end up having to wait for the lower priority IO.
  */
 void
 zio_change_priority(zio_t *pio, zio_priority_t priority)
 {
 	zio_t *cio, *cio_next;
 	zio_link_t *zl = NULL;
 
 	ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 
 	if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) {
 		vdev_queue_change_io_priority(pio, priority);
 	} else {
 		pio->io_priority = priority;
 	}
 
 	mutex_enter(&pio->io_lock);
 	for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
 		cio_next = zio_walk_children(pio, &zl);
 		zio_change_priority(cio, priority);
 	}
 	mutex_exit(&pio->io_lock);
 }
 
 /*
  * For non-raidz ZIOs, we can just copy aside the bad data read from the
  * disk, and use that to finish the checksum ereport later.
  */
 static void
 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
     const abd_t *good_buf)
 {
 	/* no processing needed */
 	zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
 }
 
 void
 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr)
 {
 	void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size);
 
 	abd_copy(abd, zio->io_abd, zio->io_size);
 
 	zcr->zcr_cbinfo = zio->io_size;
 	zcr->zcr_cbdata = abd;
 	zcr->zcr_finish = zio_vsd_default_cksum_finish;
 	zcr->zcr_free = zio_abd_free;
 }
 
 static zio_t *
 zio_vdev_io_assess(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
 		return (NULL);
 	}
 
 	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
 		spa_config_exit(zio->io_spa, SCL_ZIO, zio);
 
 	if (zio->io_vsd != NULL) {
 		zio->io_vsd_ops->vsd_free(zio);
 		zio->io_vsd = NULL;
 	}
 
 	if (zio_injection_enabled && zio->io_error == 0)
 		zio->io_error = zio_handle_fault_injection(zio, EIO);
 
 	/*
 	 * If the I/O failed, determine whether we should attempt to retry it.
 	 *
 	 * On retry, we cut in line in the issue queue, since we don't want
 	 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
 	 */
 	if (zio->io_error && vd == NULL &&
 	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
 		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE));	/* not a leaf */
 		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));	/* not a leaf */
 		zio->io_error = 0;
 		zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_AGGREGATE;
 		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
 		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
 		    zio_requeue_io_start_cut_in_line);
 		return (NULL);
 	}
 
 	/*
 	 * If we got an error on a leaf device, convert it to ENXIO
 	 * if the device is not accessible at all.
 	 */
 	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
 	    !vdev_accessible(vd, zio))
 		zio->io_error = SET_ERROR(ENXIO);
 
 	/*
 	 * If we can't write to an interior vdev (mirror or RAID-Z),
 	 * set vdev_cant_write so that we stop trying to allocate from it.
 	 */
 	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
 	    vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
 		vdev_dbgmsg(vd, "zio_vdev_io_assess(zio=%px) setting "
 		    "cant_write=TRUE due to write failure with ENXIO",
 		    zio);
 		vd->vdev_cant_write = B_TRUE;
 	}
 
 	/*
 	 * If a cache flush returns ENOTSUP or ENOTTY, we know that no future
 	 * attempts will ever succeed. In this case we set a persistent
 	 * boolean flag so that we don't bother with it in the future.
 	 */
 	if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
 	    zio->io_type == ZIO_TYPE_IOCTL &&
 	    zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL)
 		vd->vdev_nowritecache = B_TRUE;
 
 	if (zio->io_error)
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	return (zio);
 }
 
 void
 zio_vdev_io_reissue(zio_t *zio)
 {
 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
 	ASSERT(zio->io_error == 0);
 
 	zio->io_stage >>= 1;
 }
 
 void
 zio_vdev_io_redone(zio_t *zio)
 {
 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
 
 	zio->io_stage >>= 1;
 }
 
 void
 zio_vdev_io_bypass(zio_t *zio)
 {
 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
 	ASSERT(zio->io_error == 0);
 
 	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
 	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
 }
 
 /*
  * ==========================================================================
  * Encrypt and store encryption parameters
  * ==========================================================================
  */
 
 
 /*
  * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for
  * managing the storage of encryption parameters and passing them to the
  * lower-level encryption functions.
  */
 static zio_t *
 zio_encrypt(zio_t *zio)
 {
 	zio_prop_t *zp = &zio->io_prop;
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t psize = BP_GET_PSIZE(bp);
 	uint64_t dsobj = zio->io_bookmark.zb_objset;
 	dmu_object_type_t ot = BP_GET_TYPE(bp);
 	void *enc_buf = NULL;
 	abd_t *eabd = NULL;
 	uint8_t salt[ZIO_DATA_SALT_LEN];
 	uint8_t iv[ZIO_DATA_IV_LEN];
 	uint8_t mac[ZIO_DATA_MAC_LEN];
 	boolean_t no_crypt = B_FALSE;
 
 	/* the root zio already encrypted the data */
 	if (zio->io_child_type == ZIO_CHILD_GANG)
 		return (zio);
 
 	/* only ZIL blocks are re-encrypted on rewrite */
 	if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG)
 		return (zio);
 
 	if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) {
 		BP_SET_CRYPT(bp, B_FALSE);
 		return (zio);
 	}
 
 	/* if we are doing raw encryption set the provided encryption params */
 	if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) {
 		ASSERT0(BP_GET_LEVEL(bp));
 		BP_SET_CRYPT(bp, B_TRUE);
 		BP_SET_BYTEORDER(bp, zp->zp_byteorder);
 		if (ot != DMU_OT_OBJSET)
 			zio_crypt_encode_mac_bp(bp, zp->zp_mac);
 
 		/* dnode blocks must be written out in the provided byteorder */
 		if (zp->zp_byteorder != ZFS_HOST_BYTEORDER &&
 		    ot == DMU_OT_DNODE) {
 			void *bswap_buf = zio_buf_alloc(psize);
 			abd_t *babd = abd_get_from_buf(bswap_buf, psize);
 
 			ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
 			abd_copy_to_buf(bswap_buf, zio->io_abd, psize);
 			dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf,
 			    psize);
 
 			abd_take_ownership_of_buf(babd, B_TRUE);
 			zio_push_transform(zio, babd, psize, psize, NULL);
 		}
 
 		if (DMU_OT_IS_ENCRYPTED(ot))
 			zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv);
 		return (zio);
 	}
 
 	/* indirect blocks only maintain a cksum of the lower level MACs */
 	if (BP_GET_LEVEL(bp) > 0) {
 		BP_SET_CRYPT(bp, B_TRUE);
 		VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE,
 		    zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp),
 		    mac));
 		zio_crypt_encode_mac_bp(bp, mac);
 		return (zio);
 	}
 
 	/*
 	 * Objset blocks are a special case since they have 2 256-bit MACs
 	 * embedded within them.
 	 */
 	if (ot == DMU_OT_OBJSET) {
 		ASSERT0(DMU_OT_IS_ENCRYPTED(ot));
 		ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
 		BP_SET_CRYPT(bp, B_TRUE);
 		VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj,
 		    zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp)));
 		return (zio);
 	}
 
 	/* unencrypted object types are only authenticated with a MAC */
 	if (!DMU_OT_IS_ENCRYPTED(ot)) {
 		BP_SET_CRYPT(bp, B_TRUE);
 		VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj,
 		    zio->io_abd, psize, mac));
 		zio_crypt_encode_mac_bp(bp, mac);
 		return (zio);
 	}
 
 	/*
 	 * Later passes of sync-to-convergence may decide to rewrite data
 	 * in place to avoid more disk reallocations. This presents a problem
 	 * for encryption because this constitutes rewriting the new data with
 	 * the same encryption key and IV. However, this only applies to blocks
 	 * in the MOS (particularly the spacemaps) and we do not encrypt the
 	 * MOS. We assert that the zio is allocating or an intent log write
 	 * to enforce this.
 	 */
 	ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG);
 	ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG);
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION));
 	ASSERT3U(psize, !=, 0);
 
 	enc_buf = zio_buf_alloc(psize);
 	eabd = abd_get_from_buf(enc_buf, psize);
 	abd_take_ownership_of_buf(eabd, B_TRUE);
 
 	/*
 	 * For an explanation of what encryption parameters are stored
 	 * where, see the block comment in zio_crypt.c.
 	 */
 	if (ot == DMU_OT_INTENT_LOG) {
 		zio_crypt_decode_params_bp(bp, salt, iv);
 	} else {
 		BP_SET_CRYPT(bp, B_TRUE);
 	}
 
 	/* Perform the encryption. This should not fail */
 	VERIFY0(spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark,
 	    BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp),
 	    salt, iv, mac, psize, zio->io_abd, eabd, &no_crypt));
 
 	/* encode encryption metadata into the bp */
 	if (ot == DMU_OT_INTENT_LOG) {
 		/*
 		 * ZIL blocks store the MAC in the embedded checksum, so the
 		 * transform must always be applied.
 		 */
 		zio_crypt_encode_mac_zil(enc_buf, mac);
 		zio_push_transform(zio, eabd, psize, psize, NULL);
 	} else {
 		BP_SET_CRYPT(bp, B_TRUE);
 		zio_crypt_encode_params_bp(bp, salt, iv);
 		zio_crypt_encode_mac_bp(bp, mac);
 
 		if (no_crypt) {
 			ASSERT3U(ot, ==, DMU_OT_DNODE);
 			abd_free(eabd);
 		} else {
 			zio_push_transform(zio, eabd, psize, psize, NULL);
 		}
 	}
 
 	return (zio);
 }
 
 /*
  * ==========================================================================
  * Generate and verify checksums
  * ==========================================================================
  */
 static zio_t *
 zio_checksum_generate(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	enum zio_checksum checksum;
 
 	if (bp == NULL) {
 		/*
 		 * This is zio_write_phys().
 		 * We're either generating a label checksum, or none at all.
 		 */
 		checksum = zio->io_prop.zp_checksum;
 
 		if (checksum == ZIO_CHECKSUM_OFF)
 			return (zio);
 
 		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
 	} else {
 		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
 			ASSERT(!IO_IS_ALLOCATING(zio));
 			checksum = ZIO_CHECKSUM_GANG_HEADER;
 		} else {
 			checksum = BP_GET_CHECKSUM(bp);
 		}
 	}
 
 	zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);
 
 	return (zio);
 }
 
 static zio_t *
 zio_checksum_verify(zio_t *zio)
 {
 	zio_bad_cksum_t info;
 	blkptr_t *bp = zio->io_bp;
 	int error;
 
 	ASSERT(zio->io_vd != NULL);
 
 	if (bp == NULL) {
 		/*
 		 * This is zio_read_phys().
 		 * We're either verifying a label checksum, or nothing at all.
 		 */
 		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
 			return (zio);
 
 		ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL);
 	}
 
 	if ((error = zio_checksum_error(zio, &info)) != 0) {
 		zio->io_error = error;
 		if (error == ECKSUM &&
 		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
 			mutex_enter(&zio->io_vd->vdev_stat_lock);
 			zio->io_vd->vdev_stat.vs_checksum_errors++;
 			mutex_exit(&zio->io_vd->vdev_stat_lock);
 			(void) zfs_ereport_start_checksum(zio->io_spa,
 			    zio->io_vd, &zio->io_bookmark, zio,
 			    zio->io_offset, zio->io_size, &info);
 		}
 	}
 
 	return (zio);
 }
 
 /*
  * Called by RAID-Z to ensure we don't compute the checksum twice.
  */
 void
 zio_checksum_verified(zio_t *zio)
 {
 	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 }
 
 /*
  * ==========================================================================
  * Error rank.  Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
  * An error of 0 indicates success.  ENXIO indicates whole-device failure,
  * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
  * indicate errors that are specific to one I/O, and most likely permanent.
  * Any other error is presumed to be worse because we weren't expecting it.
  * ==========================================================================
  */
 int
 zio_worst_error(int e1, int e2)
 {
 	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
 	int r1, r2;
 
 	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
 		if (e1 == zio_error_rank[r1])
 			break;
 
 	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
 		if (e2 == zio_error_rank[r2])
 			break;
 
 	return (r1 > r2 ? e1 : e2);
 }
 
 /*
  * ==========================================================================
  * I/O completion
  * ==========================================================================
  */
 static zio_t *
 zio_ready(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	zio_t *pio, *pio_next;
 	zio_link_t *zl = NULL;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
 	    ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) {
 		return (NULL);
 	}
 
 	if (zio->io_ready) {
 		ASSERT(IO_IS_ALLOCATING(zio));
-		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
-		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
+		ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg ||
+		    BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE));
 		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
 
 		zio->io_ready(zio);
 	}
 
 #ifdef ZFS_DEBUG
 	if (bp != NULL && bp != &zio->io_bp_copy)
 		zio->io_bp_copy = *bp;
 #endif
 
 	if (zio->io_error != 0) {
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 		if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 			ASSERT(IO_IS_ALLOCATING(zio));
 			ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 			ASSERT(zio->io_metaslab_class != NULL);
 			ASSERT(ZIO_HAS_ALLOCATOR(zio));
 
 			/*
 			 * We were unable to allocate anything, unreserve and
 			 * issue the next I/O to allocate.
 			 */
 			metaslab_class_throttle_unreserve(
 			    zio->io_metaslab_class, zio->io_prop.zp_copies,
 			    zio->io_allocator, zio);
 			zio_allocate_dispatch(zio->io_spa, zio->io_allocator);
 		}
 	}
 
 	mutex_enter(&zio->io_lock);
 	zio->io_state[ZIO_WAIT_READY] = 1;
 	pio = zio_walk_parents(zio, &zl);
 	mutex_exit(&zio->io_lock);
 
 	/*
 	 * As we notify zio's parents, new parents could be added.
 	 * New parents go to the head of zio's io_parent_list, however,
 	 * so we will (correctly) not notify them.  The remainder of zio's
 	 * io_parent_list, from 'pio_next' onward, cannot change because
 	 * all parents must wait for us to be done before they can be done.
 	 */
 	for (; pio != NULL; pio = pio_next) {
 		pio_next = zio_walk_parents(zio, &zl);
 		zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL);
 	}
 
 	if (zio->io_flags & ZIO_FLAG_NODATA) {
 		if (bp != NULL && BP_IS_GANG(bp)) {
 			zio->io_flags &= ~ZIO_FLAG_NODATA;
 		} else {
 			ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
 			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
 		}
 	}
 
 	if (zio_injection_enabled &&
 	    zio->io_spa->spa_syncing_txg == zio->io_txg)
 		zio_handle_ignored_writes(zio);
 
 	return (zio);
 }
 
 /*
  * Update the allocation throttle accounting.
  */
 static void
 zio_dva_throttle_done(zio_t *zio)
 {
 	zio_t *lio __maybe_unused = zio->io_logical;
 	zio_t *pio = zio_unique_parent(zio);
 	vdev_t *vd = zio->io_vd;
 	int flags = METASLAB_ASYNC_ALLOC;
 
 	ASSERT3P(zio->io_bp, !=, NULL);
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 	ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
 	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
 	ASSERT(vd != NULL);
 	ASSERT3P(vd, ==, vd->vdev_top);
 	ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY));
 	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
 	ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
 	ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
 	ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));
 
 	/*
 	 * Parents of gang children can have two flavors -- ones that
 	 * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
 	 * and ones that allocated the constituent blocks. The allocation
 	 * throttle needs to know the allocating parent zio so we must find
 	 * it here.
 	 */
 	if (pio->io_child_type == ZIO_CHILD_GANG) {
 		/*
 		 * If our parent is a rewrite gang child then our grandparent
 		 * would have been the one that performed the allocation.
 		 */
 		if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
 			pio = zio_unique_parent(pio);
 		flags |= METASLAB_GANG_CHILD;
 	}
 
 	ASSERT(IO_IS_ALLOCATING(pio));
 	ASSERT(ZIO_HAS_ALLOCATOR(pio));
 	ASSERT3P(zio, !=, zio->io_logical);
 	ASSERT(zio->io_logical != NULL);
 	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
 	ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
 	ASSERT(zio->io_metaslab_class != NULL);
 
 	mutex_enter(&pio->io_lock);
 	metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags,
 	    pio->io_allocator, B_TRUE);
 	mutex_exit(&pio->io_lock);
 
 	metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1,
 	    pio->io_allocator, pio);
 
 	/*
 	 * Call into the pipeline to see if there is more work that
 	 * needs to be done. If there is work to be done it will be
 	 * dispatched to another taskq thread.
 	 */
 	zio_allocate_dispatch(zio->io_spa, pio->io_allocator);
 }
 
 static zio_t *
 zio_done(zio_t *zio)
 {
 	/*
 	 * Always attempt to keep stack usage minimal here since
 	 * we can be called recursively up to 19 levels deep.
 	 */
 	const uint64_t psize = zio->io_size;
 	zio_t *pio, *pio_next;
 	zio_link_t *zl = NULL;
 
 	/*
 	 * If our children haven't all completed,
 	 * wait for them and then repeat this pipeline stage.
 	 */
 	if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
 		return (NULL);
 	}
 
 	/*
 	 * If the allocation throttle is enabled, then update the accounting.
 	 * We only track child I/Os that are part of an allocating async
 	 * write. We must do this since the allocation is performed
 	 * by the logical I/O but the actual write is done by child I/Os.
 	 */
 	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
 	    zio->io_child_type == ZIO_CHILD_VDEV) {
 		ASSERT(zio->io_metaslab_class != NULL);
 		ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);
 		zio_dva_throttle_done(zio);
 	}
 
 	/*
 	 * If the allocation throttle is enabled, verify that
 	 * we have decremented the refcounts for every I/O that was throttled.
 	 */
 	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 		ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 		ASSERT(zio->io_bp != NULL);
 		ASSERT(ZIO_HAS_ALLOCATOR(zio));
 
 		metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio,
 		    zio->io_allocator);
 		VERIFY(zfs_refcount_not_held(&zio->io_metaslab_class->
 		    mc_allocator[zio->io_allocator].mca_alloc_slots, zio));
 	}
 
 
 	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
 		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 			ASSERT(zio->io_children[c][w] == 0);
 
 	if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
 		ASSERT(zio->io_bp->blk_pad[0] == 0);
 		ASSERT(zio->io_bp->blk_pad[1] == 0);
 		ASSERT(memcmp(zio->io_bp, &zio->io_bp_copy,
 		    sizeof (blkptr_t)) == 0 ||
 		    (zio->io_bp == zio_unique_parent(zio)->io_bp));
 		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
 		    zio->io_bp_override == NULL &&
 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
 			ASSERT3U(zio->io_prop.zp_copies, <=,
 			    BP_GET_NDVAS(zio->io_bp));
 			ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
 			    (BP_COUNT_GANG(zio->io_bp) ==
 			    BP_GET_NDVAS(zio->io_bp)));
 		}
 		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
 			VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
 	}
 
 	/*
 	 * If there were child vdev/gang/ddt errors, they apply to us now.
 	 */
 	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
 	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
 	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
 
 	/*
 	 * If the I/O on the transformed data was successful, generate any
 	 * checksum reports now while we still have the transformed data.
 	 */
 	if (zio->io_error == 0) {
 		while (zio->io_cksum_report != NULL) {
 			zio_cksum_report_t *zcr = zio->io_cksum_report;
 			uint64_t align = zcr->zcr_align;
 			uint64_t asize = P2ROUNDUP(psize, align);
 			abd_t *adata = zio->io_abd;
 
 			if (adata != NULL && asize != psize) {
 				adata = abd_alloc(asize, B_TRUE);
 				abd_copy(adata, zio->io_abd, psize);
 				abd_zero_off(adata, psize, asize - psize);
 			}
 
 			zio->io_cksum_report = zcr->zcr_next;
 			zcr->zcr_next = NULL;
 			zcr->zcr_finish(zcr, adata);
 			zfs_ereport_free_checksum(zcr);
 
 			if (adata != NULL && asize != psize)
 				abd_free(adata);
 		}
 	}
 
 	zio_pop_transforms(zio);	/* note: may set zio->io_error */
 
 	vdev_stat_update(zio, psize);
 
 	/*
 	 * If this I/O is attached to a particular vdev is slow, exceeding
 	 * 30 seconds to complete, post an error described the I/O delay.
 	 * We ignore these errors if the device is currently unavailable.
 	 */
 	if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) {
 		if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) {
 			/*
 			 * We want to only increment our slow IO counters if
 			 * the IO is valid (i.e. not if the drive is removed).
 			 *
 			 * zfs_ereport_post() will also do these checks, but
 			 * it can also ratelimit and have other failures, so we
 			 * need to increment the slow_io counters independent
 			 * of it.
 			 */
 			if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY,
 			    zio->io_spa, zio->io_vd, zio)) {
 				mutex_enter(&zio->io_vd->vdev_stat_lock);
 				zio->io_vd->vdev_stat.vs_slow_ios++;
 				mutex_exit(&zio->io_vd->vdev_stat_lock);
 
 				(void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
 				    zio->io_spa, zio->io_vd, &zio->io_bookmark,
 				    zio, 0);
 			}
 		}
 	}
 
 	if (zio->io_error) {
 		/*
 		 * If this I/O is attached to a particular vdev,
 		 * generate an error message describing the I/O failure
 		 * at the block level.  We ignore these errors if the
 		 * device is currently unavailable.
 		 */
 		if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
 		    !vdev_is_dead(zio->io_vd)) {
 			int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO,
 			    zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
 			if (ret != EALREADY) {
 				mutex_enter(&zio->io_vd->vdev_stat_lock);
 				if (zio->io_type == ZIO_TYPE_READ)
 					zio->io_vd->vdev_stat.vs_read_errors++;
 				else if (zio->io_type == ZIO_TYPE_WRITE)
 					zio->io_vd->vdev_stat.vs_write_errors++;
 				mutex_exit(&zio->io_vd->vdev_stat_lock);
 			}
 		}
 
 		if ((zio->io_error == EIO || !(zio->io_flags &
 		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
 		    zio == zio->io_logical) {
 			/*
 			 * For logical I/O requests, tell the SPA to log the
 			 * error and generate a logical data ereport.
 			 */
 			spa_log_error(zio->io_spa, &zio->io_bookmark,
-			    &zio->io_bp->blk_birth);
+			    BP_GET_LOGICAL_BIRTH(zio->io_bp));
 			(void) zfs_ereport_post(FM_EREPORT_ZFS_DATA,
 			    zio->io_spa, NULL, &zio->io_bookmark, zio, 0);
 		}
 	}
 
 	if (zio->io_error && zio == zio->io_logical) {
 		/*
 		 * Determine whether zio should be reexecuted.  This will
 		 * propagate all the way to the root via zio_notify_parent().
 		 */
 		ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 		if (IO_IS_ALLOCATING(zio) &&
 		    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
 			if (zio->io_error != ENOSPC)
 				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
 			else
 				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 		}
 
 		if ((zio->io_type == ZIO_TYPE_READ ||
 		    zio->io_type == ZIO_TYPE_FREE) &&
 		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
 		    zio->io_error == ENXIO &&
 		    spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
 		    spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
 			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 
 		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
 			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 
 		/*
 		 * Here is a possibly good place to attempt to do
 		 * either combinatorial reconstruction or error correction
 		 * based on checksums.  It also might be a good place
 		 * to send out preliminary ereports before we suspend
 		 * processing.
 		 */
 	}
 
 	/*
 	 * If there were logical child errors, they apply to us now.
 	 * We defer this until now to avoid conflating logical child
 	 * errors with errors that happened to the zio itself when
 	 * updating vdev stats and reporting FMA events above.
 	 */
 	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
 
 	if ((zio->io_error || zio->io_reexecute) &&
 	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
 	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
 		zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
 
 	zio_gang_tree_free(&zio->io_gang_tree);
 
 	/*
 	 * Godfather I/Os should never suspend.
 	 */
 	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
 	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
 		zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND;
 
 	if (zio->io_reexecute) {
 		/*
 		 * This is a logical I/O that wants to reexecute.
 		 *
 		 * Reexecute is top-down.  When an i/o fails, if it's not
 		 * the root, it simply notifies its parent and sticks around.
 		 * The parent, seeing that it still has children in zio_done(),
 		 * does the same.  This percolates all the way up to the root.
 		 * The root i/o will reexecute or suspend the entire tree.
 		 *
 		 * This approach ensures that zio_reexecute() honors
 		 * all the original i/o dependency relationships, e.g.
 		 * parents not executing until children are ready.
 		 */
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 		zio->io_gang_leader = NULL;
 
 		mutex_enter(&zio->io_lock);
 		zio->io_state[ZIO_WAIT_DONE] = 1;
 		mutex_exit(&zio->io_lock);
 
 		/*
 		 * "The Godfather" I/O monitors its children but is
 		 * not a true parent to them. It will track them through
 		 * the pipeline but severs its ties whenever they get into
 		 * trouble (e.g. suspended). This allows "The Godfather"
 		 * I/O to return status without blocking.
 		 */
 		zl = NULL;
 		for (pio = zio_walk_parents(zio, &zl); pio != NULL;
 		    pio = pio_next) {
 			zio_link_t *remove_zl = zl;
 			pio_next = zio_walk_parents(zio, &zl);
 
 			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
 			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
 				zio_remove_child(pio, zio, remove_zl);
 				/*
 				 * This is a rare code path, so we don't
 				 * bother with "next_to_execute".
 				 */
 				zio_notify_parent(pio, zio, ZIO_WAIT_DONE,
 				    NULL);
 			}
 		}
 
 		if ((pio = zio_unique_parent(zio)) != NULL) {
 			/*
 			 * We're not a root i/o, so there's nothing to do
 			 * but notify our parent.  Don't propagate errors
 			 * upward since we haven't permanently failed yet.
 			 */
 			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
 			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
 			/*
 			 * This is a rare code path, so we don't bother with
 			 * "next_to_execute".
 			 */
 			zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL);
 		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
 			/*
 			 * We'd fail again if we reexecuted now, so suspend
 			 * until conditions improve (e.g. device comes online).
 			 */
 			zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR);
 		} else {
 			/*
 			 * Reexecution is potentially a huge amount of work.
 			 * Hand it off to the otherwise-unused claim taskq.
 			 */
 			ASSERT(taskq_empty_ent(&zio->io_tqent));
 			spa_taskq_dispatch_ent(zio->io_spa,
 			    ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
 			    zio_reexecute, zio, 0, &zio->io_tqent, NULL);
 		}
 		return (NULL);
 	}
 
 	ASSERT(list_is_empty(&zio->io_child_list));
 	ASSERT(zio->io_reexecute == 0);
 	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
 
 	/*
 	 * Report any checksum errors, since the I/O is complete.
 	 */
 	while (zio->io_cksum_report != NULL) {
 		zio_cksum_report_t *zcr = zio->io_cksum_report;
 		zio->io_cksum_report = zcr->zcr_next;
 		zcr->zcr_next = NULL;
 		zcr->zcr_finish(zcr, NULL);
 		zfs_ereport_free_checksum(zcr);
 	}
 
 	/*
 	 * It is the responsibility of the done callback to ensure that this
 	 * particular zio is no longer discoverable for adoption, and as
 	 * such, cannot acquire any new parents.
 	 */
 	if (zio->io_done)
 		zio->io_done(zio);
 
 	mutex_enter(&zio->io_lock);
 	zio->io_state[ZIO_WAIT_DONE] = 1;
 	mutex_exit(&zio->io_lock);
 
 	/*
 	 * We are done executing this zio.  We may want to execute a parent
 	 * next.  See the comment in zio_notify_parent().
 	 */
 	zio_t *next_to_execute = NULL;
 	zl = NULL;
 	for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
 		zio_link_t *remove_zl = zl;
 		pio_next = zio_walk_parents(zio, &zl);
 		zio_remove_child(pio, zio, remove_zl);
 		zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute);
 	}
 
 	if (zio->io_waiter != NULL) {
 		mutex_enter(&zio->io_lock);
 		zio->io_executor = NULL;
 		cv_broadcast(&zio->io_cv);
 		mutex_exit(&zio->io_lock);
 	} else {
 		zio_destroy(zio);
 	}
 
 	return (next_to_execute);
 }
 
 /*
  * ==========================================================================
  * I/O pipeline definition
  * ==========================================================================
  */
 static zio_pipe_stage_t *zio_pipeline[] = {
 	NULL,
 	zio_read_bp_init,
 	zio_write_bp_init,
 	zio_free_bp_init,
 	zio_issue_async,
 	zio_write_compress,
 	zio_encrypt,
 	zio_checksum_generate,
 	zio_nop_write,
 	zio_brt_free,
 	zio_ddt_read_start,
 	zio_ddt_read_done,
 	zio_ddt_write,
 	zio_ddt_free,
 	zio_gang_assemble,
 	zio_gang_issue,
 	zio_dva_throttle,
 	zio_dva_allocate,
 	zio_dva_free,
 	zio_dva_claim,
 	zio_ready,
 	zio_vdev_io_start,
 	zio_vdev_io_done,
 	zio_vdev_io_assess,
 	zio_checksum_verify,
 	zio_done
 };
 
 
 
 
 /*
  * Compare two zbookmark_phys_t's to see which we would reach first in a
  * pre-order traversal of the object tree.
  *
  * This is simple in every case aside from the meta-dnode object. For all other
  * objects, we traverse them in order (object 1 before object 2, and so on).
  * However, all of these objects are traversed while traversing object 0, since
  * the data it points to is the list of objects.  Thus, we need to convert to a
  * canonical representation so we can compare meta-dnode bookmarks to
  * non-meta-dnode bookmarks.
  *
  * We do this by calculating "equivalents" for each field of the zbookmark.
  * zbookmarks outside of the meta-dnode use their own object and level, and
  * calculate the level 0 equivalent (the first L0 blkid that is contained in the
  * blocks this bookmark refers to) by multiplying their blkid by their span
  * (the number of L0 blocks contained within one block at their level).
  * zbookmarks inside the meta-dnode calculate their object equivalent
  * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
  * level + 1<<31 (any value larger than a level could ever be) for their level.
  * This causes them to always compare before a bookmark in their object
  * equivalent, compare appropriately to bookmarks in other objects, and to
  * compare appropriately to other bookmarks in the meta-dnode.
  */
 int
 zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
     const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
 {
 	/*
 	 * These variables represent the "equivalent" values for the zbookmark,
 	 * after converting zbookmarks inside the meta dnode to their
 	 * normal-object equivalents.
 	 */
 	uint64_t zb1obj, zb2obj;
 	uint64_t zb1L0, zb2L0;
 	uint64_t zb1level, zb2level;
 
 	if (zb1->zb_object == zb2->zb_object &&
 	    zb1->zb_level == zb2->zb_level &&
 	    zb1->zb_blkid == zb2->zb_blkid)
 		return (0);
 
 	IMPLY(zb1->zb_level > 0, ibs1 >= SPA_MINBLOCKSHIFT);
 	IMPLY(zb2->zb_level > 0, ibs2 >= SPA_MINBLOCKSHIFT);
 
 	/*
 	 * BP_SPANB calculates the span in blocks.
 	 */
 	zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
 	zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
 
 	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
 		zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
 		zb1L0 = 0;
 		zb1level = zb1->zb_level + COMPARE_META_LEVEL;
 	} else {
 		zb1obj = zb1->zb_object;
 		zb1level = zb1->zb_level;
 	}
 
 	if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
 		zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
 		zb2L0 = 0;
 		zb2level = zb2->zb_level + COMPARE_META_LEVEL;
 	} else {
 		zb2obj = zb2->zb_object;
 		zb2level = zb2->zb_level;
 	}
 
 	/* Now that we have a canonical representation, do the comparison. */
 	if (zb1obj != zb2obj)
 		return (zb1obj < zb2obj ? -1 : 1);
 	else if (zb1L0 != zb2L0)
 		return (zb1L0 < zb2L0 ? -1 : 1);
 	else if (zb1level != zb2level)
 		return (zb1level > zb2level ? -1 : 1);
 	/*
 	 * This can (theoretically) happen if the bookmarks have the same object
 	 * and level, but different blkids, if the block sizes are not the same.
 	 * There is presently no way to change the indirect block sizes
 	 */
 	return (0);
 }
 
 /*
  *  This function checks the following: given that last_block is the place that
  *  our traversal stopped last time, does that guarantee that we've visited
  *  every node under subtree_root?  Therefore, we can't just use the raw output
  *  of zbookmark_compare.  We have to pass in a modified version of
  *  subtree_root; by incrementing the block id, and then checking whether
  *  last_block is before or equal to that, we can tell whether or not having
  *  visited last_block implies that all of subtree_root's children have been
  *  visited.
  */
 boolean_t
 zbookmark_subtree_completed(const dnode_phys_t *dnp,
     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
 {
 	zbookmark_phys_t mod_zb = *subtree_root;
 	mod_zb.zb_blkid++;
 	ASSERT0(last_block->zb_level);
 
 	/* The objset_phys_t isn't before anything. */
 	if (dnp == NULL)
 		return (B_FALSE);
 
 	/*
 	 * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
 	 * data block size in sectors, because that variable is only used if
 	 * the bookmark refers to a block in the meta-dnode.  Since we don't
 	 * know without examining it what object it refers to, and there's no
 	 * harm in passing in this value in other cases, we always pass it in.
 	 *
 	 * We pass in 0 for the indirect block size shift because zb2 must be
 	 * level 0.  The indirect block size is only used to calculate the span
 	 * of the bookmark, but since the bookmark must be level 0, the span is
 	 * always 1, so the math works out.
 	 *
 	 * If you make changes to how the zbookmark_compare code works, be sure
 	 * to make sure that this code still works afterwards.
 	 */
 	return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
 	    1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
 	    last_block) <= 0);
 }
 
 /*
  * This function is similar to zbookmark_subtree_completed(), but returns true
  * if subtree_root is equal or ahead of last_block, i.e. still to be done.
  */
 boolean_t
 zbookmark_subtree_tbd(const dnode_phys_t *dnp,
     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
 {
 	ASSERT0(last_block->zb_level);
 	if (dnp == NULL)
 		return (B_FALSE);
 	return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
 	    1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, subtree_root,
 	    last_block) >= 0);
 }
 
 EXPORT_SYMBOL(zio_type_name);
 EXPORT_SYMBOL(zio_buf_alloc);
 EXPORT_SYMBOL(zio_data_buf_alloc);
 EXPORT_SYMBOL(zio_buf_free);
 EXPORT_SYMBOL(zio_data_buf_free);
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW,
 	"Max I/O completion time (milliseconds) before marking it as slow");
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, requeue_io_start_cut_in_line, INT, ZMOD_RW,
 	"Prioritize requeued I/O");
 
 ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free,  UINT, ZMOD_RW,
 	"Defer frees starting in this pass");
 
 ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, UINT, ZMOD_RW,
 	"Don't compress starting in this pass");
 
 ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, UINT, ZMOD_RW,
 	"Rewrite new bps starting in this pass");
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW,
 	"Throttle block allocations in the ZIO pipeline");
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW,
 	"Log all slow ZIOs, not just those with vdevs");
diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c
index e511b31fee6d..ce6772a40c8b 100644
--- a/module/zfs/zio_checksum.c
+++ b/module/zfs/zio_checksum.c
@@ -1,577 +1,577 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/zil.h>
 #include <sys/abd.h>
 #include <zfs_fletcher.h>
 
 /*
  * Checksum vectors.
  *
  * In the SPA, everything is checksummed.  We support checksum vectors
  * for three distinct reasons:
  *
  *   1. Different kinds of data need different levels of protection.
  *	For SPA metadata, we always want a very strong checksum.
  *	For user data, we let users make the trade-off between speed
  *	and checksum strength.
  *
  *   2. Cryptographic hash and MAC algorithms are an area of active research.
  *	It is likely that in future hash functions will be at least as strong
  *	as current best-of-breed, and may be substantially faster as well.
  *	We want the ability to take advantage of these new hashes as soon as
  *	they become available.
  *
  *   3. If someone develops hardware that can compute a strong hash quickly,
  *	we want the ability to take advantage of that hardware.
  *
  * Of course, we don't want a checksum upgrade to invalidate existing
  * data, so we store the checksum *function* in eight bits of the bp.
  * This gives us room for up to 256 different checksum functions.
  *
  * When writing a block, we always checksum it with the latest-and-greatest
  * checksum function of the appropriate strength.  When reading a block,
  * we compare the expected checksum against the actual checksum, which we
  * compute via the checksum function specified by BP_GET_CHECKSUM(bp).
  *
  * SALTED CHECKSUMS
  *
  * To enable the use of less secure hash algorithms with dedup, we
  * introduce the notion of salted checksums (MACs, really).  A salted
  * checksum is fed both a random 256-bit value (the salt) and the data
  * to be checksummed.  This salt is kept secret (stored on the pool, but
  * never shown to the user).  Thus even if an attacker knew of collision
  * weaknesses in the hash algorithm, they won't be able to mount a known
  * plaintext attack on the DDT, since the actual hash value cannot be
  * known ahead of time.  How the salt is used is algorithm-specific
  * (some might simply prefix it to the data block, others might need to
  * utilize a full-blown HMAC).  On disk the salt is stored in a ZAP
  * object in the MOS (DMU_POOL_CHECKSUM_SALT).
  *
  * CONTEXT TEMPLATES
  *
  * Some hashing algorithms need to perform a substantial amount of
  * initialization work (e.g. salted checksums above may need to pre-hash
  * the salt) before being able to process data.  Performing this
  * redundant work for each block would be wasteful, so we instead allow
  * a checksum algorithm to do the work once (the first time it's used)
  * and then keep this pre-initialized context as a template inside the
  * spa_t (spa_cksum_tmpls).  If the zio_checksum_info_t contains
  * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to
  * construct and destruct the pre-initialized checksum context.  The
  * pre-initialized context is then reused during each checksum
  * invocation and passed to the checksum function.
  */
 
 static void
 abd_checksum_off(abd_t *abd, uint64_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
 	(void) abd, (void) size, (void) ctx_template;
 	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
 }
 
 static void
 abd_fletcher_2_native(abd_t *abd, uint64_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
 	(void) ctx_template;
 	fletcher_init(zcp);
 	(void) abd_iterate_func(abd, 0, size,
 	    fletcher_2_incremental_native, zcp);
 }
 
 static void
 abd_fletcher_2_byteswap(abd_t *abd, uint64_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
 	(void) ctx_template;
 	fletcher_init(zcp);
 	(void) abd_iterate_func(abd, 0, size,
 	    fletcher_2_incremental_byteswap, zcp);
 }
 
 static inline void
 abd_fletcher_4_impl(abd_t *abd, uint64_t size, zio_abd_checksum_data_t *acdp)
 {
 	fletcher_4_abd_ops.acf_init(acdp);
 	abd_iterate_func(abd, 0, size, fletcher_4_abd_ops.acf_iter, acdp);
 	fletcher_4_abd_ops.acf_fini(acdp);
 }
 
 void
 abd_fletcher_4_native(abd_t *abd, uint64_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
 	(void) ctx_template;
 	fletcher_4_ctx_t ctx;
 
 	zio_abd_checksum_data_t acd = {
 		.acd_byteorder	= ZIO_CHECKSUM_NATIVE,
 		.acd_zcp 	= zcp,
 		.acd_ctx	= &ctx
 	};
 
 	abd_fletcher_4_impl(abd, size, &acd);
 
 }
 
 void
 abd_fletcher_4_byteswap(abd_t *abd, uint64_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
 	(void) ctx_template;
 	fletcher_4_ctx_t ctx;
 
 	zio_abd_checksum_data_t acd = {
 		.acd_byteorder	= ZIO_CHECKSUM_BYTESWAP,
 		.acd_zcp 	= zcp,
 		.acd_ctx	= &ctx
 	};
 
 	abd_fletcher_4_impl(abd, size, &acd);
 }
 
 zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
 	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
 	{{NULL, NULL}, NULL, NULL, 0, "on"},
 	{{abd_checksum_off,		abd_checksum_off},
 	    NULL, NULL, 0, "off"},
 	{{abd_checksum_sha256,		abd_checksum_sha256},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
 	    "label"},
 	{{abd_checksum_sha256,		abd_checksum_sha256},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
 	    "gang_header"},
 	{{abd_fletcher_2_native,	abd_fletcher_2_byteswap},
 	    NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
 	{{abd_fletcher_2_native,	abd_fletcher_2_byteswap},
 	    NULL, NULL, 0, "fletcher2"},
 	{{abd_fletcher_4_native,	abd_fletcher_4_byteswap},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
 	{{abd_checksum_sha256,		abd_checksum_sha256},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
 	    ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
 	{{abd_fletcher_4_native,	abd_fletcher_4_byteswap},
 	    NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
 	{{abd_checksum_off,		abd_checksum_off},
 	    NULL, NULL, 0, "noparity"},
 	{{abd_checksum_sha512_native,	abd_checksum_sha512_byteswap},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
 	    ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
 	{{abd_checksum_skein_native,	abd_checksum_skein_byteswap},
 	    abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free,
 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
 	{{abd_checksum_edonr_native,	abd_checksum_edonr_byteswap},
 	    abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free,
 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
 	    ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
 	{{abd_checksum_blake3_native,	abd_checksum_blake3_byteswap},
 	    abd_checksum_blake3_tmpl_init, abd_checksum_blake3_tmpl_free,
 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "blake3"},
 };
 
 /*
  * The flag corresponding to the "verify" in dedup=[checksum,]verify
  * must be cleared first, so callers should use ZIO_CHECKSUM_MASK.
  */
 spa_feature_t
 zio_checksum_to_feature(enum zio_checksum cksum)
 {
 	VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0);
 
 	switch (cksum) {
 	case ZIO_CHECKSUM_BLAKE3:
 		return (SPA_FEATURE_BLAKE3);
 	case ZIO_CHECKSUM_SHA512:
 		return (SPA_FEATURE_SHA512);
 	case ZIO_CHECKSUM_SKEIN:
 		return (SPA_FEATURE_SKEIN);
 	case ZIO_CHECKSUM_EDONR:
 		return (SPA_FEATURE_EDONR);
 	default:
 		return (SPA_FEATURE_NONE);
 	}
 }
 
 enum zio_checksum
 zio_checksum_select(enum zio_checksum child, enum zio_checksum parent)
 {
 	ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
 	ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
 	ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
 
 	if (child == ZIO_CHECKSUM_INHERIT)
 		return (parent);
 
 	if (child == ZIO_CHECKSUM_ON)
 		return (ZIO_CHECKSUM_ON_VALUE);
 
 	return (child);
 }
 
 enum zio_checksum
 zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child,
     enum zio_checksum parent)
 {
 	ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
 	ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
 	ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
 
 	if (child == ZIO_CHECKSUM_INHERIT)
 		return (parent);
 
 	if (child == ZIO_CHECKSUM_ON)
 		return (spa_dedup_checksum(spa));
 
 	if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY))
 		return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY);
 
 	ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags &
 	    ZCHECKSUM_FLAG_DEDUP) ||
 	    (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF);
 
 	return (child);
 }
 
 /*
  * Set the external verifier for a gang block based on <vdev, offset, txg>,
  * a tuple which is guaranteed to be unique for the life of the pool.
  */
 static void
 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
 {
 	const dva_t *dva = BP_IDENTITY(bp);
-	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
+	uint64_t txg = BP_GET_BIRTH(bp);
 
 	ASSERT(BP_IS_GANG(bp));
 
 	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
 }
 
 /*
  * Set the external verifier for a label block based on its offset.
  * The vdev is implicit, and the txg is unknowable at pool open time --
  * hence the logic in vdev_uberblock_load() to find the most recent copy.
  */
 static void
 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
 {
 	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
 }
 
 /*
  * Calls the template init function of a checksum which supports context
  * templates and installs the template into the spa_t.
  */
 static void
 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
 {
 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 
 	if (ci->ci_tmpl_init == NULL)
 		return;
 	if (spa->spa_cksum_tmpls[checksum] != NULL)
 		return;
 
 	VERIFY(ci->ci_tmpl_free != NULL);
 	mutex_enter(&spa->spa_cksum_tmpls_lock);
 	if (spa->spa_cksum_tmpls[checksum] == NULL) {
 		spa->spa_cksum_tmpls[checksum] =
 		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
 		VERIFY(spa->spa_cksum_tmpls[checksum] != NULL);
 	}
 	mutex_exit(&spa->spa_cksum_tmpls_lock);
 }
 
 /* convenience function to update a checksum to accommodate an encryption MAC */
 static void
 zio_checksum_handle_crypt(zio_cksum_t *cksum, zio_cksum_t *saved, boolean_t xor)
 {
 	/*
 	 * Weak checksums do not have their entropy spread evenly
 	 * across the bits of the checksum. Therefore, when truncating
 	 * a weak checksum we XOR the first 2 words with the last 2 so
 	 * that we don't "lose" any entropy unnecessarily.
 	 */
 	if (xor) {
 		cksum->zc_word[0] ^= cksum->zc_word[2];
 		cksum->zc_word[1] ^= cksum->zc_word[3];
 	}
 
 	cksum->zc_word[2] = saved->zc_word[2];
 	cksum->zc_word[3] = saved->zc_word[3];
 }
 
 /*
  * Generate the checksum.
  */
 void
 zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
     abd_t *abd, uint64_t size)
 {
 	static const uint64_t zec_magic = ZEC_MAGIC;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t offset = zio->io_offset;
 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 	zio_cksum_t cksum, saved;
 	spa_t *spa = zio->io_spa;
 	boolean_t insecure = (ci->ci_flags & ZCHECKSUM_FLAG_DEDUP) == 0;
 
 	ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
 	ASSERT(ci->ci_func[0] != NULL);
 
 	zio_checksum_template_init(checksum, spa);
 
 	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
 		zio_eck_t eck;
 		size_t eck_offset;
 
 		memset(&saved, 0, sizeof (zio_cksum_t));
 
 		if (checksum == ZIO_CHECKSUM_ZILOG2) {
 			zil_chain_t zilc;
 			abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t));
 
 			uint64_t nused = P2ROUNDUP_TYPED(zilc.zc_nused,
 			    ZIL_MIN_BLKSZ, uint64_t);
 			ASSERT3U(size, >=, nused);
 			size = nused;
 			eck = zilc.zc_eck;
 			eck_offset = offsetof(zil_chain_t, zc_eck);
 		} else {
 			ASSERT3U(size, >=, sizeof (zio_eck_t));
 			eck_offset = size - sizeof (zio_eck_t);
 			abd_copy_to_buf_off(&eck, abd, eck_offset,
 			    sizeof (zio_eck_t));
 		}
 
 		if (checksum == ZIO_CHECKSUM_GANG_HEADER) {
 			zio_checksum_gang_verifier(&eck.zec_cksum, bp);
 		} else if (checksum == ZIO_CHECKSUM_LABEL) {
 			zio_checksum_label_verifier(&eck.zec_cksum, offset);
 		} else {
 			saved = eck.zec_cksum;
 			eck.zec_cksum = bp->blk_cksum;
 		}
 
 		abd_copy_from_buf_off(abd, &zec_magic,
 		    eck_offset + offsetof(zio_eck_t, zec_magic),
 		    sizeof (zec_magic));
 		abd_copy_from_buf_off(abd, &eck.zec_cksum,
 		    eck_offset + offsetof(zio_eck_t, zec_cksum),
 		    sizeof (zio_cksum_t));
 
 		ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
 		    &cksum);
 		if (bp != NULL && BP_USES_CRYPT(bp) &&
 		    BP_GET_TYPE(bp) != DMU_OT_OBJSET)
 			zio_checksum_handle_crypt(&cksum, &saved, insecure);
 
 		abd_copy_from_buf_off(abd, &cksum,
 		    eck_offset + offsetof(zio_eck_t, zec_cksum),
 		    sizeof (zio_cksum_t));
 	} else {
 		saved = bp->blk_cksum;
 		ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
 		    &cksum);
 		if (BP_USES_CRYPT(bp) && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
 			zio_checksum_handle_crypt(&cksum, &saved, insecure);
 		bp->blk_cksum = cksum;
 	}
 }
 
 int
 zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp,
     enum zio_checksum checksum, abd_t *abd, uint64_t size, uint64_t offset,
     zio_bad_cksum_t *info)
 {
 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 	zio_cksum_t actual_cksum, expected_cksum;
 	zio_eck_t eck;
 	int byteswap;
 
 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
 		return (SET_ERROR(EINVAL));
 
 	zio_checksum_template_init(checksum, spa);
 
 	IMPLY(bp == NULL, ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED);
 	IMPLY(bp == NULL, checksum == ZIO_CHECKSUM_LABEL);
 
 	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
 		zio_cksum_t verifier;
 		size_t eck_offset;
 
 		if (checksum == ZIO_CHECKSUM_ZILOG2) {
 			zil_chain_t zilc;
 			uint64_t nused;
 
 			abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t));
 
 			eck = zilc.zc_eck;
 			eck_offset = offsetof(zil_chain_t, zc_eck) +
 			    offsetof(zio_eck_t, zec_cksum);
 
 			if (eck.zec_magic == ZEC_MAGIC) {
 				nused = zilc.zc_nused;
 			} else if (eck.zec_magic == BSWAP_64(ZEC_MAGIC)) {
 				nused = BSWAP_64(zilc.zc_nused);
 			} else {
 				return (SET_ERROR(ECKSUM));
 			}
 
 			nused = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
 			if (size < nused)
 				return (SET_ERROR(ECKSUM));
 			size = nused;
 		} else {
 			if (size < sizeof (zio_eck_t))
 				return (SET_ERROR(ECKSUM));
 			eck_offset = size - sizeof (zio_eck_t);
 			abd_copy_to_buf_off(&eck, abd, eck_offset,
 			    sizeof (zio_eck_t));
 			eck_offset += offsetof(zio_eck_t, zec_cksum);
 		}
 
 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
 			zio_checksum_gang_verifier(&verifier, bp);
 		else if (checksum == ZIO_CHECKSUM_LABEL)
 			zio_checksum_label_verifier(&verifier, offset);
 		else
 			verifier = bp->blk_cksum;
 
 		byteswap = (eck.zec_magic == BSWAP_64(ZEC_MAGIC));
 
 		if (byteswap)
 			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
 
 		expected_cksum = eck.zec_cksum;
 
 		abd_copy_from_buf_off(abd, &verifier, eck_offset,
 		    sizeof (zio_cksum_t));
 
 		ci->ci_func[byteswap](abd, size,
 		    spa->spa_cksum_tmpls[checksum], &actual_cksum);
 
 		abd_copy_from_buf_off(abd, &expected_cksum, eck_offset,
 		    sizeof (zio_cksum_t));
 
 		if (byteswap) {
 			byteswap_uint64_array(&expected_cksum,
 			    sizeof (zio_cksum_t));
 		}
 	} else {
 		byteswap = BP_SHOULD_BYTESWAP(bp);
 		expected_cksum = bp->blk_cksum;
 		ci->ci_func[byteswap](abd, size,
 		    spa->spa_cksum_tmpls[checksum], &actual_cksum);
 	}
 
 	/*
 	 * MAC checksums are a special case since half of this checksum will
 	 * actually be the encryption MAC. This will be verified by the
 	 * decryption process, so we just check the truncated checksum now.
 	 * Objset blocks use embedded MACs so we don't truncate the checksum
 	 * for them.
 	 */
 	if (bp != NULL && BP_USES_CRYPT(bp) &&
 	    BP_GET_TYPE(bp) != DMU_OT_OBJSET) {
 		if (!(ci->ci_flags & ZCHECKSUM_FLAG_DEDUP)) {
 			actual_cksum.zc_word[0] ^= actual_cksum.zc_word[2];
 			actual_cksum.zc_word[1] ^= actual_cksum.zc_word[3];
 		}
 
 		actual_cksum.zc_word[2] = 0;
 		actual_cksum.zc_word[3] = 0;
 		expected_cksum.zc_word[2] = 0;
 		expected_cksum.zc_word[3] = 0;
 	}
 
 	if (info != NULL) {
 		info->zbc_checksum_name = ci->ci_name;
 		info->zbc_byteswapped = byteswap;
 		info->zbc_injected = 0;
 		info->zbc_has_cksum = 1;
 	}
 
 	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
 		return (SET_ERROR(ECKSUM));
 
 	return (0);
 }
 
 int
 zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
 {
 	blkptr_t *bp = zio->io_bp;
 	uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
 	int error;
 	uint64_t size = (bp == NULL ? zio->io_size :
 	    (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
 	uint64_t offset = zio->io_offset;
 	abd_t *data = zio->io_abd;
 	spa_t *spa = zio->io_spa;
 
 	error = zio_checksum_error_impl(spa, bp, checksum, data, size,
 	    offset, info);
 
 	if (zio_injection_enabled && error == 0 && zio->io_error == 0) {
 		error = zio_handle_fault_injection(zio, ECKSUM);
 		if (error != 0)
 			info->zbc_injected = 1;
 	}
 
 	return (error);
 }
 
 /*
  * Called by a spa_t that's about to be deallocated. This steps through
  * all of the checksum context templates and deallocates any that were
  * initialized using the algorithm-specific template init function.
  */
 void
 zio_checksum_templates_free(spa_t *spa)
 {
 	for (enum zio_checksum checksum = 0;
 	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
 		if (spa->spa_cksum_tmpls[checksum] != NULL) {
 			zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 
 			VERIFY(ci->ci_tmpl_free != NULL);
 			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
 			spa->spa_cksum_tmpls[checksum] = NULL;
 		}
 	}
 }